WayneJin0918 committed on
Commit b74f5b1 · Parent(s): 71139cc

Initial commit: add app.py and requirements

Files changed (1):
  1. app.py +12 -0
app.py CHANGED
@@ -2,6 +2,7 @@ import gradio as gr
 import numpy as np
 import os
 import torch
+import torch.quantization
 import random
 import subprocess
 # Install flash-attn without CUDA extensions, which is suitable for CPU-only environments.
@@ -96,6 +97,15 @@ model = load_checkpoint_and_dispatch(
     force_hooks=True,
 ).eval()
 
+# --- INT8 Quantization ---
+# Apply dynamic quantization to the language model component for CPU optimization.
+# This converts the linear layer weights to int8, reducing memory and speeding up inference.
+print("Applying INT8 dynamic quantization to the language model...")
+model.language_model = torch.quantization.quantize_dynamic(
+    model.language_model, {torch.nn.Linear}, dtype=torch.qint8
+)
+print("Quantization complete.")
+
 
 # --- Inferencer Preparing ---
 inferencer = InterleaveInferencer(
@@ -433,4 +443,6 @@ with gr.Blocks() as demo:
     outputs=txt_output
 )
 
+
 demo.launch(share=True)
+
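For readers who want to try the same optimization in isolation, here is a minimal, self-contained sketch of the torch.quantization.quantize_dynamic call the commit applies to model.language_model. TinyLM and its proj layer are hypothetical stand-ins for illustration, not code from this repo.

import torch
import torch.nn as nn

# Hypothetical stand-in for model.language_model (illustration only).
class TinyLM(nn.Module):
    def __init__(self):
        super().__init__()
        self.proj = nn.Linear(512, 512)

    def forward(self, x):
        return self.proj(x)

fp32_model = TinyLM().eval()

# Same call as in the commit: each nn.Linear is swapped for a dynamically
# quantized version. Weights are converted to int8 ahead of time; activations
# are quantized on the fly at inference, so no calibration data is required.
int8_model = torch.quantization.quantize_dynamic(
    fp32_model, {torch.nn.Linear}, dtype=torch.qint8
)

x = torch.randn(1, 512)
print(int8_model(x).shape)  # forward pass still runs on CPU: torch.Size([1, 512])
print(int8_model.proj)      # now a DynamicQuantizedLinear module

Because dynamic quantization needs no calibration pass, it can be applied to an already-loaded .eval() model exactly as the commit does; it is a CPU-oriented optimization, consistent with the CPU-only flash-attn install mentioned at the top of the file.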