hfendpoints-images
/

text-generation-sglang-gpu

Morgan Funtowicz committed on May 2

Commit

57ba236

1 Parent(s): 36406e7

feat(quant): allow using native precision

Files changed (2) hide show

Dockerfile CHANGED Viewed

@@ -3,7 +3,7 @@ FROM lmsysorg/sglang:latest
 ENV MODEL_ID="/repository"
 ENV KV_CACHE_DTYPE="auto"
 ENV TP_SIZE="1"
-ENV QUANT_METHOD="w8a8_int8"
 EXPOSE 80
 COPY entrypoint.sh /usr/local/endpoint/

 ENV MODEL_ID="/repository"
 ENV KV_CACHE_DTYPE="auto"
 ENV TP_SIZE="1"
+ENV QUANT_METHOD=""
 EXPOSE 80
 COPY entrypoint.sh /usr/local/endpoint/

entrypoint.sh CHANGED Viewed

@@ -1,13 +1,28 @@
 #!/bin/bash
-python3 -m sglang.launch_server \
-  --model-path $MODEL_ID \
-  --kv-cache-dtype $KV_CACHE_DTYPE \
-  --tensor-parallel-size $TP_SIZE \
-  --expert-parallel-size $TP_SIZE \
-  --quantization $QUANT_METHOD \
-  --enable-torch-compile \
-  --enable-ep-moe \
-  --tool-call-parser qwen25 \
-  --host 0.0.0.0 \
-  --port 80

 #!/bin/bash
+# Launch the SGLang server.
+#
+# Environment:
+#   MODEL_ID        value passed to --model-path
+#   KV_CACHE_DTYPE  value passed to --kv-cache-dtype
+#   TP_SIZE         value passed to both --tensor-parallel-size and
+#                   --expert-parallel-size
+#   QUANT_METHOD    value passed to --quantization; when empty or unset the
+#                   flag is omitted entirely (native precision)
+#
+# The argument list is built once in an array so the two precision modes
+# cannot drift apart, and so every value is passed as exactly one word.
+server_args=(
+  --model-path "$MODEL_ID"
+  --kv-cache-dtype "$KV_CACHE_DTYPE"
+  --tensor-parallel-size "$TP_SIZE"
+  --expert-parallel-size "$TP_SIZE"
+)
+
+# NOTE: the `;` before `then` is required; without it bash parses `then` as
+# an argument to the test command and aborts with a syntax error at `else`.
+if [[ -z "${QUANT_METHOD:-}" ]]; then
+  echo "Using native precision"
+else
+  echo "Using ${QUANT_METHOD} quantization schema"
+  server_args+=(--quantization "$QUANT_METHOD")
+fi
+
+server_args+=(
+  --enable-torch-compile
+  --enable-ep-moe
+  --tool-call-parser qwen25
+  --host 0.0.0.0
+  --port 80
+)
+
+python3 -m sglang.launch_server "${server_args[@]}"