WayneJin0918 committed on
Commit b74f5b1 · Parent(s): 71139cc

Initial commit: add app.py and requirements

Files changed (1):
  1. app.py +12 -0
app.py CHANGED
@@ -2,6 +2,7 @@ import gradio as gr
 import numpy as np
 import os
 import torch
+import torch.quantization
 import random
 import subprocess
 # Install flash-attn without CUDA extensions, which is suitable for CPU-only environments.
@@ -96,6 +97,15 @@ model = load_checkpoint_and_dispatch(
     force_hooks=True,
 ).eval()
 
+# --- INT8 Quantization ---
+# Apply dynamic quantization to the language model component for CPU optimization.
+# This converts the linear layer weights to int8, reducing memory and speeding up inference.
+print("Applying INT8 dynamic quantization to the language model...")
+model.language_model = torch.quantization.quantize_dynamic(
+    model.language_model, {torch.nn.Linear}, dtype=torch.qint8
+)
+print("Quantization complete.")
+
 
 # --- Inferencer Preparing ---
 inferencer = InterleaveInferencer(
@@ -433,4 +443,6 @@ with gr.Blocks() as demo:
     outputs=txt_output
 )
 
+
 demo.launch(share=True)
+
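For readers who want to try the same optimization in isolation, here is a minimal, self-contained sketch of the torch.quantization.quantize_dynamic call the commit applies to model.language_model. TinyLM and its proj layer are hypothetical stand-ins for illustration, not code from this repo.

import torch
import torch.nn as nn

# Hypothetical stand-in for model.language_model (illustration only).
class TinyLM(nn.Module):
    def __init__(self):
        super().__init__()
        self.proj = nn.Linear(512, 512)

    def forward(self, x):
        return self.proj(x)

fp32_model = TinyLM().eval()

# Same call as in the commit: each nn.Linear is swapped for a dynamically
# quantized version. Weights are converted to int8 ahead of time; activations
# are quantized on the fly at inference, so no calibration data is required.
int8_model = torch.quantization.quantize_dynamic(
    fp32_model, {torch.nn.Linear}, dtype=torch.qint8
)

x = torch.randn(1, 512)
print(int8_model(x).shape)  # forward pass still runs on CPU: torch.Size([1, 512])
print(int8_model.proj)      # now a DynamicQuantizedLinear module

Because dynamic quantization needs no calibration pass, it can be applied to an already-loaded .eval() model exactly as the commit does; it is a CPU-oriented optimization, consistent with the CPU-only flash-attn install mentioned at the top of the file.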