Update README.md
Browse files
    	
        README.md
    CHANGED
    
    | @@ -2,4 +2,41 @@ | |
| 2 | 
             
            license: apache-2.0
         | 
| 3 | 
             
            ---
         | 
| 4 | 
             
            This is a copy of Qwen3-8B compiled with the 2.25 SDK for the Neuron workshop.
         | 
| 5 | 
            -
            https://github.com/aws-neuron/neuron-workshops
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 2 | 
             
            license: apache-2.0
         | 
| 3 | 
             
            ---
         | 
| 4 | 
             
            This is a copy of Qwen3-8B compiled with the 2.25 SDK for the Neuron workshop.
         | 
| 5 | 
            +
            https://github.com/aws-neuron/neuron-workshops
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            This checkpoint was generated with the following code:
         | 
| 8 | 
            +
            ```
         | 
| 9 | 
            +
            bs=1
         | 
| 10 | 
            +
            seqlength=1024
         | 
| 11 | 
            +
             | 
| 12 | 
            +
            import os
         | 
| 13 | 
            +
            from vllm import LLM, SamplingParams
         | 
| 14 | 
            +
            os.environ['VLLM_NEURON_FRAMEWORK'] = "neuronx-distributed-inference"
         | 
| 15 | 
            +
             | 
| 16 | 
            +
            path = f"/home/ubuntu/qwen3/qwen3-8B-BS{bs}-SEQ{seqlength}"
         | 
| 17 | 
            +
            # Save the sharded weights and compiler artifacts in the same folder
         | 
| 18 | 
            +
            os.environ['NEURON_COMPILED_ARTIFACTS'] = path
         | 
| 19 | 
            +
            os.environ['BASE_COMPILE_WORK_DIR'] =path
         | 
| 20 | 
            +
             | 
| 21 | 
            +
            llm = LLM(
         | 
| 22 | 
            +
                model="/home/ubuntu/models/Qwen3-8B",
         | 
| 23 | 
            +
                max_num_seqs=bs,
         | 
| 24 | 
            +
                max_model_len=seqlength,
         | 
| 25 | 
            +
                device="neuron",
         | 
| 26 | 
            +
                tensor_parallel_size=2,
         | 
| 27 | 
            +
                override_neuron_config={"save_sharded_checkpoint": True})
         | 
| 28 | 
            +
            prompts = [
         | 
| 29 | 
            +
                "Hello, my name is",
         | 
| 30 | 
            +
                "The president of the United States is",
         | 
| 31 | 
            +
                "The capital of France is",
         | 
| 32 | 
            +
                "The future of AI is",
         | 
| 33 | 
            +
            ]
         | 
| 34 | 
            +
            # Note that top_k must be lower than the global_top_k defined in
         | 
| 35 | 
            +
            # the neuronx_distributed_inference.models.config.OnDeviceSamplingConfig
         | 
| 36 | 
            +
            sampling_params = SamplingParams(top_k=10, temperature=0.8, top_p=0.95)
         | 
| 37 | 
            +
            outputs = llm.generate(prompts, sampling_params)
         | 
| 38 | 
            +
            for output in outputs:
         | 
| 39 | 
            +
                prompt = output.prompt
         | 
| 40 | 
            +
                generated_text = output.outputs[0].text
         | 
| 41 | 
            +
                print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
         | 
| 42 | 
            +
            ```
         | 

