TotoB12 committed
Commit e34a9ed · verified · 1 Parent(s): e4c779d

Update app.py

Files changed (1)
  1. app.py +79 -27
app.py CHANGED
@@ -3,8 +3,8 @@ from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Define model details
- MODEL_REPO = "TotoB12/totob-1.5B"  # You can swap this for Mistral-7B or another GGUF model
- MODEL_FILE = "totob-1.5B.gguf"  # 4-bit quantized model file
+ MODEL_REPO = "TotoB12/totob-1.5B"  # Change to your desired repository/model
+ MODEL_FILE = "totob-1.5B.gguf"  # 4-bit quantized model file

# Download the quantized model from Hugging Face
model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
@@ -12,15 +12,49 @@ model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
# Load the model with llama.cpp for CPU-only inference
llm = Llama(
    model_path=model_path,
-     n_gpu_layers=0,  # Set to 0 for CPU-only
-     n_threads=4,  # Adjust based on CPU cores (e.g., 4 for quad-core)
+     n_gpu_layers=0,  # CPU-only
+     n_threads=4,  # Adjust based on CPU cores
    n_batch=512,  # Batch size for inference
-     n_ctx=2048,  # Context length (adjust based on RAM; 2048 fits ~16 GB)
+     n_ctx=2048,  # Context length (adjust based on RAM)
    verbose=False  # Reduce logging for cleaner output
)

- # Define the inference function
- def generate_text(prompt, max_tokens=256, temperature=0.8, top_p=0.95):
+ def build_prompt(messages, bos_token=""):
+     """
+     Build a single prompt string from the conversation history.
+     This function mimics your Jinja template formatting by including:
+     - A system prompt (if any)
+     - Each user message prefixed with <|User|>
+     - Each assistant message prefixed with <|Assistant|> and ended with <|end▁of▁sentence|>
+     Finally, it appends an <|Assistant|> token to signal the model to generate.
+     """
+     system_prompt = ""
+     # Use the content of any system message as the system prompt.
+     for msg in messages:
+         if msg['role'] == "system":
+             system_prompt = msg['content']
+     prompt = bos_token + system_prompt
+     # Format each message in the conversation.
+     for msg in messages:
+         if msg['role'] == "user":
+             prompt += "<|User|>" + msg['content']
+         elif msg['role'] == "assistant":
+             prompt += "<|Assistant|>" + msg['content'] + "<|end▁of▁sentence|>"
+     # Signal that the assistant should generate the next part.
+     prompt += "<|Assistant|>"
+     return prompt
+
+ def chat(user_input, history, max_tokens=256, temperature=0.8, top_p=0.95):
+     """
+     The chat function appends the new user message, builds the chat prompt,
+     generates the assistant response, and returns the updated conversation.
+     """
+     if history is None:
+         history = []
+     # Add the new user message to the conversation history.
+     history.append({"role": "user", "content": user_input})
+     # Build the complete prompt from history.
+     prompt = build_prompt(history)
    try:
        output = llm(
            prompt,
@@ -29,25 +63,43 @@ def generate_text(prompt, max_tokens=256, temperature=0.8, top_p=0.95):
            top_p=top_p,
            repeat_penalty=1.1
        )
-         return output["choices"][0]["text"].strip()
+         assistant_response = output["choices"][0]["text"].strip()
    except Exception as e:
-         return f"Error: {str(e)}"
-
- # Create Gradio interface
- interface = gr.Interface(
-     fn=generate_text,
-     inputs=[
-         gr.Textbox(label="Prompt", placeholder="Enter your prompt here..."),
-         gr.Slider(label="Max Tokens", minimum=50, maximum=512, value=256, step=10),
-         gr.Slider(label="Temperature", minimum=0.1, maximum=2.0, value=0.8, step=0.1),
-         gr.Slider(label="Top P", minimum=0.1, maximum=1.0, value=0.95, step=0.05)
-     ],
-     outputs=gr.Textbox(label="Generated Text"),
-     title="Quantized LLM on Hugging Face Spaces",
-     description="Run a 4-bit quantized Vicuna-13B model on CPU using llama.cpp",
-     theme="default"
- )
+         assistant_response = f"Error: {str(e)}"
+     # Append the assistant's response.
+     history.append({"role": "assistant", "content": assistant_response})
+
+     # Prepare a display-friendly chat history as pairs for Gradio's Chatbot.
+     chat_history = []
+     i = 0
+     while i < len(history):
+         if history[i]['role'] == "user":
+             user_msg = history[i]['content']
+             assistant_msg = ""
+             if i+1 < len(history) and history[i+1]['role'] == "assistant":
+                 assistant_msg = history[i+1]['content']
+                 i += 2
+             else:
+                 i += 1
+             chat_history.append((user_msg, assistant_msg))
+         else:
+             i += 1
+
+     return chat_history, history
+
+ # Build the Gradio interface using Blocks
+ with gr.Blocks() as demo:
+     gr.Markdown("# Chat with Quantized LLM on CPU")
+     chatbot = gr.Chatbot()
+     # This state variable will hold the conversation history as a list of dicts.
+     state = gr.State([])
+     with gr.Row():
+         txt = gr.Textbox(show_label=False, placeholder="Enter your message and press Enter").style(container=False)
+     with gr.Row():
+         max_tokens_slider = gr.Slider(50, 512, value=256, step=10, label="Max Tokens")
+         temperature_slider = gr.Slider(0.1, 2.0, value=0.8, step=0.1, label="Temperature")
+         top_p_slider = gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top P")
+     # When the user submits a message, update both the chatbot display and the state.
+     txt.submit(chat, [txt, state, max_tokens_slider, temperature_slider, top_p_slider], [chatbot, state])

- # Launch the app
- if __name__ == "__main__":
-     interface.launch(server_name="0.0.0.0", server_port=7860)
+ demo.launch(server_name="0.0.0.0", server_port=7860)
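
Note: as a quick illustration of the prompt format described in the new build_prompt docstring, here is a minimal sketch (not part of the commit); the message contents are made up, and bos_token is left at its empty default, so exact beginning-of-sequence handling still depends on the model's tokenizer.

# Hypothetical check of the prompt format produced by build_prompt
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
    {"role": "assistant", "content": "Hi! How can I help?"},
    {"role": "user", "content": "Tell me a joke."},
]
print(build_prompt(messages))
# Expected output (a single line):
# You are a helpful assistant.<|User|>Hello!<|Assistant|>Hi! How can I help?<|end▁of▁sentence|><|User|>Tell me a joke.<|Assistant|>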
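
Similarly, a hedged sketch of how the new chat function could be exercised without launching the Gradio UI, assuming the model has already been downloaded and loaded as above; the example questions are placeholders.

# Hypothetical smoke test of the chat flow (returns display pairs and the raw history)
history = []
pairs, history = chat("What does n_ctx control?", history, max_tokens=64)
print(pairs[-1][1])  # latest assistant reply
pairs, history = chat("And n_batch?", history, max_tokens=64)
print(pairs[-1][1])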