Update README.md
Browse files
README.md
CHANGED
|
@@ -188,7 +188,7 @@ We rely on [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-h
|
|
| 188 |
| Benchmark | | |
|
| 189 |
|----------------------------------|----------------|---------------------------|
|
| 190 |
| | google/gemma-3-12b-it | jerryzh168/gemma-3-12b-it-INT4 |
|
| 191 |
-
| mmlu |
|
| 192 |
|
| 193 |
|
| 194 |
<details>
|
|
@@ -219,7 +219,7 @@ lm_eval --model hf --model_args pretrained=$MODEL --tasks mmlu --device cuda:0 -
|
|
| 219 |
| Benchmark | | |
|
| 220 |
|------------------|----------------|--------------------------------|
|
| 221 |
| | google/gemma-3-12b-it | jerryzh168/gemma-3-12b-it-INT4 |
|
| 222 |
-
| Peak Memory (GB) |
|
| 223 |
|
| 224 |
|
| 225 |
|
|
@@ -279,7 +279,7 @@ print(f"Peak Memory Usage: {mem:.02f} GB")
|
|
| 279 |
| Benchmark (Latency) | | |
|
| 280 |
|----------------------------------|----------------|--------------------------|
|
| 281 |
| | google/gemma-3-12b-it | jerryzh168/gemma-3-12b-it-INT4 |
|
| 282 |
-
| latency (batch_size=1) |
|
| 283 |
|
| 284 |
<details>
|
| 285 |
<summary> Reproduce Model Performance Results </summary>
|
|
@@ -311,48 +311,6 @@ python benchmarks/benchmark_latency.py --input-len 256 --output-len 256 --model
|
|
| 311 |
export MODEL=jerryzh168/gemma-3-12b-it-INT4
|
| 312 |
VLLM_DISABLE_COMPILE_CACHE=1 python benchmarks/benchmark_latency.py --input-len 256 --output-len 256 --model $MODEL --batch-size 1
|
| 313 |
```
|
| 314 |
-
|
| 315 |
-
## benchmark_serving
|
| 316 |
-
|
| 317 |
-
We benchmarked the throughput in a serving environment.
|
| 318 |
-
|
| 319 |
-
Download the ShareGPT dataset:
|
| 320 |
-
|
| 321 |
-
```Shell
|
| 322 |
-
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
| 323 |
-
```
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
Other datasets can be found at: https://github.com/vllm-project/vllm/tree/main/benchmarks
|
| 328 |
-
|
| 329 |
-
Note: you can change the number of prompts to be benchmarked with the `--num-prompts` argument of the `benchmark_serving` script.
|
| 330 |
-
|
| 331 |
-
### baseline
|
| 332 |
-
Server:
|
| 333 |
-
```Shell
|
| 334 |
-
export MODEL=google/gemma-3-12b-it
|
| 335 |
-
vllm serve $MODEL --tokenizer $MODEL -O3
|
| 336 |
-
```
|
| 337 |
-
|
| 338 |
-
Client:
|
| 339 |
-
```Shell
|
| 340 |
-
export MODEL=google/gemma-3-12b-it
|
| 341 |
-
python benchmarks/benchmark_serving.py --backend vllm --dataset-name sharegpt --tokenizer $MODEL --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json --model $MODEL --num-prompts 1
|
| 342 |
-
```
|
| 343 |
-
|
| 344 |
-
### INT4
|
| 345 |
-
Server:
|
| 346 |
-
```Shell
|
| 347 |
-
export MODEL=jerryzh168/gemma-3-12b-it-INT4
|
| 348 |
-
VLLM_DISABLE_COMPILE_CACHE=1 vllm serve $MODEL --tokenizer $MODEL -O3 --pt-load-map-location cuda:0
|
| 349 |
-
```
|
| 350 |
-
|
| 351 |
-
Client:
|
| 352 |
-
```Shell
|
| 353 |
-
export MODEL=jerryzh168/gemma-3-12b-it-INT4
|
| 354 |
-
python benchmarks/benchmark_serving.py --backend vllm --dataset-name sharegpt --tokenizer $MODEL --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json --model $MODEL --num-prompts 1
|
| 355 |
-
```
|
| 356 |
</details>
|
| 357 |
|
| 358 |
|
|
|
|
| 188 |
| Benchmark | | |
|
| 189 |
|----------------------------------|----------------|---------------------------|
|
| 190 |
| | google/gemma-3-12b-it | jerryzh168/gemma-3-12b-it-INT4 |
|
| 191 |
+
| mmlu | 71.51 | 68.96 |
|
| 192 |
|
| 193 |
|
| 194 |
<details>
|
|
|
|
| 219 |
| Benchmark | | |
|
| 220 |
|------------------|----------------|--------------------------------|
|
| 221 |
| | google/gemma-3-12b-it | jerryzh168/gemma-3-12b-it-INT4 |
|
| 222 |
+
| Peak Memory (GB) | 24.50 | 8.68 (65% reduction) |
|
| 223 |
|
| 224 |
|
| 225 |
|
|
|
|
| 279 |
| Benchmark (Latency) | | |
|
| 280 |
|----------------------------------|----------------|--------------------------|
|
| 281 |
| | google/gemma-3-12b-it | jerryzh168/gemma-3-12b-it-INT4 |
|
| 282 |
+
| latency (batch_size=1) | 3.73s | 2.16s (1.73x speedup) |
|
| 283 |
|
| 284 |
<details>
|
| 285 |
<summary> Reproduce Model Performance Results </summary>
|
|
|
|
| 311 |
export MODEL=jerryzh168/gemma-3-12b-it-INT4
|
| 312 |
VLLM_DISABLE_COMPILE_CACHE=1 python benchmarks/benchmark_latency.py --input-len 256 --output-len 256 --model $MODEL --batch-size 1
|
| 313 |
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 314 |
</details>
|
| 315 |
|
| 316 |
|