Spaces:
Runtime error
Runtime error
WayneJin0918
committed on
Commit
·
b74f5b1
1
Parent(s):
71139cc
Initial commit: add app.py and requirements
Browse files
app.py
CHANGED
|
@@ -2,6 +2,7 @@ import gradio as gr
|
|
| 2 |
import numpy as np
|
| 3 |
import os
|
| 4 |
import torch
|
|
|
|
| 5 |
import random
|
| 6 |
import subprocess
|
| 7 |
# Install flash-attn without CUDA extensions, which is suitable for CPU-only environments.
|
|
@@ -96,6 +97,15 @@ model = load_checkpoint_and_dispatch(
|
|
| 96 |
force_hooks=True,
|
| 97 |
).eval()
|
| 98 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
|
| 100 |
# --- Inferencer Preparing ---
|
| 101 |
inferencer = InterleaveInferencer(
|
|
@@ -433,4 +443,6 @@ with gr.Blocks() as demo:
|
|
| 433 |
outputs=txt_output
|
| 434 |
)
|
| 435 |
|
|
|
|
| 436 |
demo.launch(share=True)
|
|
|
|
|
|
| 2 |
import numpy as np
|
| 3 |
import os
|
| 4 |
import torch
|
| 5 |
+
import torch.quantization
|
| 6 |
import random
|
| 7 |
import subprocess
|
| 8 |
# Install flash-attn without CUDA extensions, which is suitable for CPU-only environments.
|
|
|
|
| 97 |
force_hooks=True,
|
| 98 |
).eval()
|
| 99 |
|
| 100 |
+
# --- INT8 Quantization ---
|
| 101 |
+
# Apply dynamic quantization to the language model component for CPU optimization.
|
| 102 |
+
# This converts the linear layer weights to int8, reducing memory and speeding up inference.
|
| 103 |
+
print("Applying INT8 dynamic quantization to the language model...")
|
| 104 |
+
model.language_model = torch.quantization.quantize_dynamic(
|
| 105 |
+
model.language_model, {torch.nn.Linear}, dtype=torch.qint8
|
| 106 |
+
)
|
| 107 |
+
print("Quantization complete.")
|
| 108 |
+
|
| 109 |
|
| 110 |
# --- Inferencer Preparing ---
|
| 111 |
inferencer = InterleaveInferencer(
|
|
|
|
| 443 |
outputs=txt_output
|
| 444 |
)
|
| 445 |
|
| 446 |
+
|
| 447 |
demo.launch(share=True)
|
| 448 |
+
|