K00B404 committed on
Commit 606e387 · verified · 1 parent: c74d3e1

Create app.py

Files changed (1)
  1. app.py +142 -0
app.py ADDED
@@ -0,0 +1,142 @@
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextStreamer
import torch
import os
import signal

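# Pin MKL/OMP and PyTorch intra-op threading to half of the available CPU cores.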
cpu_count = os.cpu_count()
print(f"Number of CPU cores in the system: {cpu_count}")
half_cpu_count = max(1, cpu_count // 2)  # keep at least one thread on single-core machines
os.environ["MKL_NUM_THREADS"] = str(half_cpu_count)
os.environ["OMP_NUM_THREADS"] = str(half_cpu_count)
torch.set_num_threads(half_cpu_count)

print(f"PyTorch threads: {torch.get_num_threads()}")
print(f"MKL threads: {os.getenv('MKL_NUM_THREADS')}")
print(f"OMP threads: {os.getenv('OMP_NUM_THREADS')}")

# Load the model and tokenizer
NEW_MODEL_ID = "huihui-ai/Qwen3-8B-abliterated"
print(f"Loading model {NEW_MODEL_ID} ...")
# 4-bit quantization settings (defined but not applied; the quantization_config
# argument below is commented out).
quant_config_4 = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    llm_int8_enable_fp32_cpu_offload=True,
)

model = AutoModelForCausalLM.from_pretrained(
    NEW_MODEL_ID,
    device_map="auto",
    trust_remote_code=True,
    # quantization_config=quant_config_4,
    torch_dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(NEW_MODEL_ID, trust_remote_code=True)
# Fall back to the EOS token when the tokenizer defines no padding token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

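# Conversation state and streaming options; these are toggled at runtime by the
# slash commands handled in the chat loop below.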
initial_messages = [{"role": "system", "content": "You are a helpful assistant."}]
messages = initial_messages.copy()
enable_thinking = True
skip_prompt = True
skip_special_tokens = True

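# TextStreamer subclass that prints tokens as they arrive, accumulates the full
# response, and lets a SIGINT handler abort generation mid-stream.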
class CustomTextStreamer(TextStreamer):
    def __init__(self, tokenizer, skip_prompt=True, skip_special_tokens=True):
        super().__init__(tokenizer, skip_prompt=skip_prompt, skip_special_tokens=skip_special_tokens)
        self.generated_text = ""
        self.stop_flag = False

    def on_finalized_text(self, text: str, stream_end: bool = False):
        self.generated_text += text
        print(text, end="", flush=True)
        if self.stop_flag:
            # Raising here propagates out of model.generate() and ends generation early.
            raise StopIteration

    def stop_generation(self):
        self.stop_flag = True

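# Build the chat prompt from the message history, stream the model's reply to
# stdout, and return the accumulated text together with a user-interrupt flag.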
def generate_stream(model, tokenizer, messages, enable_thinking, skip_prompt, skip_special_tokens, max_new_tokens):
    input_ids = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        enable_thinking=enable_thinking,
        add_generation_prompt=True,
        return_tensors="pt"
    )
    attention_mask = torch.ones_like(input_ids, dtype=torch.long)
    tokens = input_ids.to(model.device)
    attention_mask = attention_mask.to(model.device)

    streamer = CustomTextStreamer(tokenizer, skip_prompt=skip_prompt, skip_special_tokens=skip_special_tokens)

    # Ctrl+C sets the streamer's stop flag instead of killing the process.
    def signal_handler(sig, frame):
        streamer.stop_generation()
        print("\n[Generation stopped by user with Ctrl+C]")

    signal.signal(signal.SIGINT, signal_handler)

    print("Response: ", end="", flush=True)
    try:
        generated_ids = model.generate(
            tokens,
            attention_mask=attention_mask,
            use_cache=False,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
            streamer=streamer
        )
        del generated_ids
    except StopIteration:
        print("\n[Stopped by user]")

    del input_ids, attention_mask
    torch.cuda.empty_cache()
    signal.signal(signal.SIGINT, signal.SIG_DFL)

    return streamer.generated_text, streamer.stop_flag

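# Interactive chat loop: /exit quits, /clear resets the history, and /no_think,
# /skip_prompt, /skip_special_tokens toggle the corresponding options.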
while True:
    user_input = input("User: ").strip()
    if user_input.lower() == "/exit":
        print("Exiting chat.")
        break
    if user_input.lower() == "/clear":
        messages = initial_messages.copy()
        print("Chat history cleared. Starting a new conversation.")
        continue
    if user_input.lower() == "/no_think":
        if enable_thinking:
            enable_thinking = False
            print("Thinking = False.")
        else:
            enable_thinking = True
            print("Thinking = True.")
        continue
    if user_input.lower() == "/skip_prompt":
        if skip_prompt:
            skip_prompt = False
            print("skip_prompt = False.")
        else:
            skip_prompt = True
            print("skip_prompt = True.")
        continue
    if user_input.lower() == "/skip_special_tokens":
        if skip_special_tokens:
            skip_special_tokens = False
            print("skip_special_tokens = False.")
        else:
            skip_special_tokens = True
            print("skip_special_tokens = True.")
        continue
    if not user_input:
        print("Input cannot be empty. Please enter something.")
        continue
    messages.append({"role": "user", "content": user_input})
    response, stop_flag = generate_stream(model, tokenizer, messages, enable_thinking, skip_prompt, skip_special_tokens, 8192)
    print("", flush=True)
    if stop_flag:
        continue
    messages.append({"role": "assistant", "content": response})