---
license: apache-2.0
---

This is a copy of Qwen3-8B compiled with Neuron SDK 2.25 for the Neuron workshop: https://github.com/aws-neuron/neuron-workshops

This checkpoint was generated with the following code:

```python
import os

from vllm import LLM, SamplingParams

bs = 1
seqlength = 1024

os.environ['VLLM_NEURON_FRAMEWORK'] = "neuronx-distributed-inference"

# Save the sharded weights and compiler artifacts in the same folder.
path = f"/home/ubuntu/qwen3/qwen3-8B-BS{bs}-SEQ{seqlength}"
os.environ['NEURON_COMPILED_ARTIFACTS'] = path
os.environ['BASE_COMPILE_WORK_DIR'] = path

llm = LLM(
    model="/home/ubuntu/models/Qwen3-8B",
    max_num_seqs=bs,
    max_model_len=seqlength,
    device="neuron",
    tensor_parallel_size=2,
    override_neuron_config={"save_sharded_checkpoint": True},
)

prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]

# Note that top_k must be lower than the global_top_k defined in
# neuronx_distributed_inference.models.config.OnDeviceSamplingConfig.
sampling_params = SamplingParams(top_k=10, temperature=0.8, top_p=0.95)

outputs = llm.generate(prompts, sampling_params)
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
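
To run inference from this precompiled checkpoint without triggering a fresh compile, point `NEURON_COMPILED_ARTIFACTS` at the downloaded copy before constructing the `LLM`. Below is a minimal sketch, assuming the repository has been downloaded to a hypothetical local path and that the original model files are available for the config and tokenizer; the batch size, sequence length, and tensor-parallel degree must match the compile-time values above.

```python
import os

# Hypothetical local path holding a download of this repository
# (sharded weights plus compiled Neuron artifacts).
path = "/home/ubuntu/qwen3/qwen3-8B-BS1-SEQ1024"

os.environ['VLLM_NEURON_FRAMEWORK'] = "neuronx-distributed-inference"
# Reuse the precompiled artifacts instead of recompiling the model.
os.environ['NEURON_COMPILED_ARTIFACTS'] = path

from vllm import LLM, SamplingParams

# max_num_seqs, max_model_len, and tensor_parallel_size must match the
# values this checkpoint was compiled with (BS=1, SEQ=1024, TP=2).
llm = LLM(
    model="/home/ubuntu/models/Qwen3-8B",  # original model, for config/tokenizer
    max_num_seqs=1,
    max_model_len=1024,
    device="neuron",
    tensor_parallel_size=2,
)

sampling_params = SamplingParams(top_k=10, temperature=0.8, top_p=0.95)
outputs = llm.generate(["The capital of France is"], sampling_params)
print(outputs[0].outputs[0].text)
```

If these values differ from the compile-time configuration, the Neuron backend will typically fall back to a fresh compile rather than reusing the artifacts.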