## Generation
This recipe requires llm-compressor with https://github.com/vllm-project/llm-compressor/pull/1788 included.
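If that PR has not yet landed in the llm-compressor release you have installed, one option is to install directly from the PR ref, e.g. `pip install "git+https://github.com/vllm-project/llm-compressor.git@refs/pull/1788/head"`. This relies on GitHub's `refs/pull/<N>/head` convention and is a sketch rather than an official install path; by the time you read this, the change may simply be in a regular release.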
```python
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier

MODEL_ID = "Qwen/Qwen3-Coder-30B-A3B-Instruct"
SAVE_DIR = MODEL_ID.split("/")[-1] + "-W4A16-awq"

# Configure the quantization algorithm to run.
# The lm_head and the MoE router/gate layers are excluded from
# quantization via `ignore` and stay in full precision.
recipe = [
    AWQModifier(
        duo_scaling=False,
        ignore=[
            "lm_head",
            "re:.*mlp.gate$",
            "re:.*mlp.shared_expert_gate$",
            "re:visual.*",
        ],
        scheme="W4A16",
        targets=["Linear"],
    ),
]

# Select calibration dataset.
DATASET_ID = "codeparrot/self-instruct-starcoder"
DATASET_SPLIT = "curated"

# Select number of samples. 256 samples is a good place to start.
# Increasing the number of samples can improve accuracy.
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 2048


def get_calib_dataset(tokenizer):
    from datasets import load_dataset

    ds = load_dataset(
        DATASET_ID,
        split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES*10}]",
    )

    def preprocess(example):
        # Render each instruction/output pair with the model's chat
        # template so calibration data matches the inference format.
        chat_messages = [
            {"role": "user", "content": example["instruction"].strip()},
            {"role": "assistant", "content": example["output"].strip()},
        ]
        tokenized_messages = tokenizer.apply_chat_template(
            chat_messages, tokenize=True
        )
        return {"input_ids": tokenized_messages}

    ds = (
        ds.shuffle(seed=42)
        .map(preprocess, remove_columns=ds.column_names)
        .select(range(NUM_CALIBRATION_SAMPLES))
    )

    return ds


if __name__ == "__main__":
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID, torch_dtype="auto", trust_remote_code=True
    )
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

    ###
    ### Apply algorithms.
    ###
    oneshot(
        model=model,
        dataset=get_calib_dataset(tokenizer),
        recipe=recipe,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=NUM_CALIBRATION_SAMPLES,
        log_dir=None,
        trust_remote_code_model=True,
    )

    # Save the compressed model and tokenizer.
    model.save_pretrained(SAVE_DIR)
    tokenizer.save_pretrained(SAVE_DIR)
```
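Once saved, the compressed checkpoint loads like any other model. A minimal sketch of serving it with vLLM (the prompt and sampling settings below are illustrative, not part of the original recipe):

```python
from vllm import LLM, SamplingParams

# Point at the local SAVE_DIR from the script above, or at the
# published repo id used in the evaluation section.
llm = LLM(model="nm-testing/Qwen3-Coder-30B-A3B-Instruct-W4A16-awq")
params = SamplingParams(temperature=0.2, max_tokens=256)

# llm.chat applies the model's chat template for us.
messages = [
    {"role": "user", "content": "Write a Python function that reverses a linked list."}
]
outputs = llm.chat(messages, params)
print(outputs[0].outputs[0].text)
```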
## Evaluation
The model was evaluated on the HumanEval and HumanEval+ benchmarks using the Neural Magic fork of the EvalPlus implementation and the vLLM engine, with the following commands:
```
python evalplus/codegen/generate.py \
    --model nm-testing/Qwen3-Coder-30B-A3B-Instruct-W4A16-awq \
    --bs 16 \
    --temperature 0.2 \
    --n_samples 50 \
    --root "./results" \
    --dataset humaneval \
    --backend vllm \
    --dtype auto

python evalplus/evalplus/sanitize.py \
    results/humaneval/nm-testing--Qwen3-Coder-30B-A3B-Instruct-W4A16-awq_vllm_temp_0.2

evalplus.evaluate \
    --dataset humaneval \
    --samples results/humaneval/nm-testing--Qwen3-Coder-30B-A3B-Instruct-W4A16-awq_vllm_temp_0.2-sanitized
```
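Because 50 samples are generated per task at temperature 0.2, the pass@k scores below are estimates rather than single greedy runs. A minimal sketch of the unbiased pass@k estimator from the HumanEval paper (EvalPlus reports its scores with the same formula, to the best of our understanding):

```python
from math import comb

def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k estimator: 1 - C(n - c, k) / C(n, k).

    n: samples generated per task (50 in the commands above)
    c: samples that pass the unit tests
    k: budget being scored (1 or 10 here)
    """
    if n - c < k:
        return 1.0  # every size-k subset contains a passing sample
    return 1.0 - comb(n - c, k) / comb(n, k)
```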
| Metric | Qwen/Qwen3-Coder-30B-A3B-Instruct | nm-testing/Qwen3-Coder-30B-A3B-Instruct-W4A16-awq |
|---|---|---|
| HumanEval pass@1 | 93.0 | 93.7 |
| HumanEval pass@10 | 93.9 | 94.5 |
| HumanEval+ pass@1 | 88.7 | 89.3 |
| HumanEval+ pass@10 | 89.8 | 90.2 |
| Average Score | 91.35 | 91.93 |