totob-1.5B_chat

Sleeping

App Files Files Community

TotoB12 committed on Apr 7

Commit

e34a9ed

verified ·

1 Parent(s): e4c779d

Update app.py

Browse files

Files changed (1) hide show

app.py +79 -27

app.py CHANGED Viewed

@@ -3,8 +3,8 @@ from huggingface_hub import hf_hub_download
 from llama_cpp import Llama
 # Define model details
-MODEL_REPO = "TotoB12/totob-1.5B"  # You can swap this for Mistral-7B or another GGUF model
-MODEL_FILE = "totob-1.5B.gguf"    # 4-bit quantized model file
 # Download the quantized model from Hugging Face
 model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
@@ -12,15 +12,49 @@ model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
 # Load the model with llama.cpp for CPU-only inference
 llm = Llama(
     model_path=model_path,
-    n_gpu_layers=0,        # Set to 0 for CPU-only
-    n_threads=4,           # Adjust based on CPU cores (e.g., 4 for quad-core)
     n_batch=512,           # Batch size for inference
-    n_ctx=2048,            # Context length (adjust based on RAM; 2048 fits ~16 GB)
     verbose=False          # Reduce logging for cleaner output
 )
-# Define the inference function
-def generate_text(prompt, max_tokens=256, temperature=0.8, top_p=0.95):
     try:
         output = llm(
             prompt,
@@ -29,25 +63,43 @@ def generate_text(prompt, max_tokens=256, temperature=0.8, top_p=0.95):
             top_p=top_p,
             repeat_penalty=1.1
         )
-        return output["choices"][0]["text"].strip()
     except Exception as e:
-        return f"Error: {str(e)}"
-# Create Gradio interface
-interface = gr.Interface(
-    fn=generate_text,
-    inputs=[
-        gr.Textbox(label="Prompt", placeholder="Enter your prompt here..."),
-        gr.Slider(label="Max Tokens", minimum=50, maximum=512, value=256, step=10),
-        gr.Slider(label="Temperature", minimum=0.1, maximum=2.0, value=0.8, step=0.1),
-        gr.Slider(label="Top P", minimum=0.1, maximum=1.0, value=0.95, step=0.05)
-    ],
-    outputs=gr.Textbox(label="Generated Text"),
-    title="Quantized LLM on Hugging Face Spaces",
-    description="Run a 4-bit quantized Vicuna-13B model on CPU using llama.cpp",
-    theme="default"
-)
-# Launch the app
-if __name__ == "__main__":
-    interface.launch(server_name="0.0.0.0", server_port=7860)

 from llama_cpp import Llama
 # Define model details
MODEL_REPO = "TotoB12/totob-1.5B"  # Hugging Face repository holding the model
MODEL_FILE = "totob-1.5B.gguf"  # GGUF file to fetch from that repository

# Download the model file from the Hugging Face Hub and keep its local path.
model_path = hf_hub_download(filename=MODEL_FILE, repo_id=MODEL_REPO)

# Load the model through llama.cpp, configured for CPU-only inference.
llm = Llama(
    model_path=model_path,
    n_ctx=2048,  # context length; adjust based on available RAM
    n_batch=512,  # batch size for inference
    n_threads=4,  # adjust based on CPU cores
    n_gpu_layers=0,  # 0 keeps every layer on the CPU
    verbose=False,  # reduce logging for cleaner output
)
def build_prompt(messages, bos_token=""):
    """Render a conversation as one prompt string for the model.

    The prompt is laid out as:

    * ``bos_token`` followed by the system prompt (the content of the last
      message whose role is ``"system"``, or nothing if there is none);
    * every user message as ``<｜User｜>`` + content;
    * every assistant message as ``<｜Assistant｜>`` + content +
      ``<｜end▁of▁sentence｜>``;
    * a trailing ``<｜Assistant｜>`` so the model generates the next reply.

    Messages with any other role are ignored.

    Args:
        messages: Iterable of dicts with ``"role"`` and ``"content"`` keys.
            It is consumed in a single pass, so one-shot iterators work too.
        bos_token: Text placed at the very start of the prompt.

    Returns:
        The assembled prompt string.

    Raises:
        KeyError: If a message lacks ``"role"``, or a system/user/assistant
            message lacks ``"content"``.
    """
    system_prompt = ""
    turns = []
    for msg in messages:
        role = msg["role"]
        if role == "system":
            # A later system message overrides an earlier one.
            system_prompt = msg["content"]
        elif role == "user":
            turns.append("<｜User｜>" + msg["content"])
        elif role == "assistant":
            turns.append(
                "<｜Assistant｜>" + msg["content"] + "<｜end▁of▁sentence｜>"
            )
    # The system prompt always leads, wherever it appeared in the history;
    # the final tag signals that the assistant should produce the next part.
    return "".join([bos_token, system_prompt, *turns, "<｜Assistant｜>"])
def _history_to_pairs(history):
    """Collapse role-tagged messages into (user, assistant) tuples.

    A user message that is not immediately followed by an assistant message
    is paired with an empty string; messages of any other role are skipped.
    """
    pairs = []
    i = 0
    while i < len(history):
        if history[i]["role"] != "user":
            i += 1
            continue
        user_msg = history[i]["content"]
        has_reply = i + 1 < len(history) and history[i + 1]["role"] == "assistant"
        assistant_msg = history[i + 1]["content"] if has_reply else ""
        pairs.append((user_msg, assistant_msg))
        i += 2 if has_reply else 1
    return pairs


def chat(user_input, history, max_tokens=256, temperature=0.8, top_p=0.95):
    """Run one chat turn and return the updated conversation.

    Appends the new user message to the history, builds the prompt with
    ``build_prompt``, asks ``llm`` for a completion, and appends the reply.

    Args:
        user_input: Text of the new user message.
        history: List of ``{"role", "content"}`` dicts from earlier turns,
            or ``None`` for a fresh conversation. It is not modified.
        max_tokens: Upper bound on generated tokens, forwarded to ``llm``.
        temperature: Sampling temperature, forwarded to ``llm``.
        top_p: Nucleus-sampling threshold, forwarded to ``llm``.

    Returns:
        A ``(chat_history, history)`` tuple: ``chat_history`` is a list of
        ``(user, assistant)`` string pairs for display, and ``history`` is
        the new list of role-tagged message dicts.
    """
    # Work on a copy so the caller's list is never mutated in place.
    history = list(history) if history else []
    history.append({"role": "user", "content": user_input})
    prompt = build_prompt(history)
    try:
        output = llm(
            prompt,
            max_tokens=int(max_tokens),  # sliders may deliver floats
            temperature=temperature,
            top_p=top_p,
            repeat_penalty=1.1,
        )
        assistant_response = output["choices"][0]["text"].strip()
    except Exception as e:
        # UI boundary: show the failure in the chat instead of crashing.
        assistant_response = f"Error: {str(e)}"
    history.append({"role": "assistant", "content": assistant_response})
    return _history_to_pairs(history), history
# Build the Gradio interface using Blocks
with gr.Blocks() as demo:
    gr.Markdown("# Chat with Quantized LLM on CPU")
    chatbot = gr.Chatbot()
    # This state variable holds the conversation history as a list of dicts.
    state = gr.State([])
    with gr.Row():
        # `container=False` is passed to the constructor; the older
        # `.style(container=False)` call is no longer available in Gradio 4+.
        txt = gr.Textbox(
            show_label=False,
            placeholder="Enter your message and press Enter",
            container=False,
        )
    with gr.Row():
        max_tokens_slider = gr.Slider(50, 512, value=256, step=10, label="Max Tokens")
        temperature_slider = gr.Slider(0.1, 2.0, value=0.8, step=0.1, label="Temperature")
        top_p_slider = gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top P")
    # When the user submits a message, update both the chatbot display and the state.
    txt.submit(
        chat,
        [txt, state, max_tokens_slider, temperature_slider, top_p_slider],
        [chatbot, state],
    )
# Bind to all interfaces on port 7860.
demo.launch(server_name="0.0.0.0", server_port=7860)