import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import re
import json
from typing import List, Dict, Any, Optional
import logging
import spaces
import os

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Model configuration
MAIN_MODEL_ID = "Tonic/petite-elle-L-aime-3-sft"  # Main repo for config and chat template
INT4_MODEL_ID = "Tonic/petite-elle-L-aime-3-sft/int4"  # Int4 quantized model
LOCAL_MODEL_PATH = "./int4"  # Local int4 weights
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Global variables for model and tokenizer
model = None
tokenizer = None

# Default system prompt
DEFAULT_SYSTEM_PROMPT = "Tu es TonicIA, un assistant francophone rigoureux et bienveillant."

# Title and description content
title = "# 🤖 Petite Elle L'Aime 3 - Chat Interface"
description = "A fine-tuned version of SmolLM3-3B optimized for French and multilingual conversations. This is the int4 quantized version for efficient CPU deployment."

presentation1 = """
### 🎯 Features
- **Multilingual Support**: English, French, Italian, Portuguese, Chinese, Arabic
- **Int4 Quantization**: Optimized for CPU deployment with ~50% memory reduction
- **Interactive Chat Interface**: Real-time conversation with the model
- **Customizable System Prompt**: Define the assistant's personality and behavior
- **Thinking Mode**: Enable reasoning mode with thinking tags
"""

presentation2 = """
### 📋 Model Information
- **Base Model**: SmolLM3-3B
- **Parameters**: ~3B
- **Context Length**: 128k
- **Languages**: English, French, Italian, Portuguese, Chinese, Arabic
- **Device**: CPU optimized
- **Quantization**: int4
"""

joinus = """
### 🚀 Quick Start
1. Add context in the system prompt
2. Type your message
3. Click generate to start chatting
4. Use advanced settings for fine-tuning
"""
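
# NOTE: the int4 weights are expected to land in LOCAL_MODEL_PATH ("./int4") at
# build time; load_model() below falls back to pulling the tokenizer from
# MAIN_MODEL_ID and the weights from INT4_MODEL_ID on the Hub when the local
# copy is missing or incomplete.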

def check_local_model():
    """Check if local int4 model files exist"""
    required_files = [
        "config.json",
        "pytorch_model.bin",
        "tokenizer.json",
        "tokenizer_config.json"
    ]
    for file in required_files:
        file_path = os.path.join(LOCAL_MODEL_PATH, file)
        if not os.path.exists(file_path):
            logger.warning(f"Missing required file: {file_path}")
            return False
    logger.info("All required model files found locally")
    return True


def load_model():
    """Load the model and tokenizer"""
    global model, tokenizer
    try:
        # Check if local model exists (downloaded during build)
        if check_local_model():
            logger.info(f"Loading tokenizer from {LOCAL_MODEL_PATH}")
            tokenizer = AutoTokenizer.from_pretrained(LOCAL_MODEL_PATH)

            logger.info(f"Loading int4 model from {LOCAL_MODEL_PATH}")
            model = AutoModelForCausalLM.from_pretrained(
                LOCAL_MODEL_PATH,
                device_map="auto" if DEVICE == "cuda" else "cpu",
                torch_dtype=torch.bfloat16,
                trust_remote_code=True
            )
        else:
            logger.info(f"Local model not found, loading from {MAIN_MODEL_ID}")
            # Load tokenizer from main repo (for chat template and config)
            tokenizer = AutoTokenizer.from_pretrained(MAIN_MODEL_ID)

            logger.info(f"Loading int4 model from {INT4_MODEL_ID}")
            # Load model with int4 quantization from Hugging Face
            model = AutoModelForCausalLM.from_pretrained(
                INT4_MODEL_ID,
                device_map="auto" if DEVICE == "cuda" else "cpu",
                torch_dtype=torch.bfloat16,
                trust_remote_code=True
            )

        # Set pad token if not present
        if tokenizer.pad_token_id is None:
            tokenizer.pad_token_id = tokenizer.eos_token_id

        logger.info("Model loaded successfully")
        return True
    except Exception as e:
        logger.error(f"Error loading model: {e}")
        return False


def create_prompt(system_message, user_message, enable_thinking=True):
    """Create prompt using the model's chat template"""
    try:
        # Prepare messages for the template
        formatted_messages = []

        # Add system message if provided
        if system_message and system_message.strip():
            formatted_messages.append({"role": "system", "content": system_message})

        # Add user message
        formatted_messages.append({"role": "user", "content": user_message})

        # Apply the chat template
        prompt = tokenizer.apply_chat_template(
            formatted_messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=enable_thinking
        )

        # Add /no_think to the end of prompt when thinking is disabled
        if not enable_thinking:
            prompt += " /no_think"

        return prompt
    except Exception as e:
        logger.error(f"Error creating prompt: {e}")
        return ""


@spaces.GPU(duration=94)
def generate_response(message, history, system_message, max_tokens, temperature, top_p, do_sample, enable_thinking=True):
    """Generate response using the model"""
    global model, tokenizer

    if model is None or tokenizer is None:
        return "Error: Model not loaded. Please wait for the model to load."

    try:
        # Create prompt using chat template
        full_prompt = create_prompt(system_message, message, enable_thinking)
        if not full_prompt:
            return "Error: Failed to create prompt."
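
        # The rest of this function tokenizes the prompt, runs generation on the
        # selected device, and strips the echoed prompt from the decoded output.
        # Slicing by len(full_prompt) assumes decode() reproduces the prompt text verbatim.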
        # Tokenize the input
        inputs = tokenizer(full_prompt, return_tensors="pt", padding=True, truncation=True)

        # Move to device
        if DEVICE == "cuda":
            inputs = {k: v.cuda() for k, v in inputs.items()}

        # Generate response
        with torch.no_grad():
            output_ids = model.generate(
                inputs['input_ids'],
                max_new_tokens=max_tokens,
                temperature=temperature,
                top_p=top_p,
                do_sample=do_sample,
                attention_mask=inputs['attention_mask'],
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id
            )

        # Decode the response
        response = tokenizer.decode(output_ids[0], skip_special_tokens=True)

        # Extract only the new response (remove the input prompt)
        assistant_response = response[len(full_prompt):].strip()

        # Clean up the response - only remove special tokens, preserve thinking tags when enabled
        assistant_response = re.sub(r'<\|im_start\|>.*?<\|im_end\|>', '', assistant_response, flags=re.DOTALL)

        # Only remove thinking tags if thinking mode is disabled
        if not enable_thinking:
            assistant_response = re.sub(r'<think>.*?</think>', '', assistant_response, flags=re.DOTALL)

        assistant_response = assistant_response.strip()
        return assistant_response
    except Exception as e:
        logger.error(f"Error generating response: {e}")
        return f"Error generating response: {str(e)}"


def user(user_message, history):
    """Add user message to history"""
    return "", history + [[user_message, None]]


def bot(history, system_prompt, max_length, temperature, top_p, advanced_checkbox, enable_thinking):
    """Generate bot response"""
    user_message = history[-1][0]
    do_sample = advanced_checkbox
    bot_message = generate_response(user_message, history, system_prompt, max_length, temperature, top_p, do_sample, enable_thinking)
    history[-1][1] = bot_message
    return history


# Load model on startup
logger.info("Starting model loading process...")
load_model()

# Create Gradio interface
with gr.Blocks() as demo:
    with gr.Row():
        gr.Markdown(title)
    with gr.Row():
        gr.Markdown(description)
    with gr.Row():
        with gr.Column(scale=1):
            with gr.Group():
                gr.Markdown(presentation1)
        with gr.Column(scale=1):
            with gr.Group():
                gr.Markdown(presentation2)
    with gr.Row():
        with gr.Column(scale=1):
            with gr.Group():
                gr.Markdown(joinus)
        with gr.Column(scale=1):
            pass  # Empty column for balance
    with gr.Row():
        with gr.Column(scale=2):
            system_prompt = gr.TextArea(
                label="📑 Context",
                placeholder="Tu es TonicIA, un assistant francophone rigoureux et bienveillant.",
                lines=5,
                value=DEFAULT_SYSTEM_PROMPT
            )
            user_input = gr.TextArea(
                label="🤷🏻‍♂️ User Input",
                placeholder="Hi there my name is Tonic!",
                lines=2
            )
            advanced_checkbox = gr.Checkbox(label="🧪 Advanced Settings", value=False)
            with gr.Column(visible=False) as advanced_settings:
                max_length = gr.Slider(
                    label="📏 Max Length",
                    minimum=64,
                    maximum=2048,
                    value=512,
                    step=64
                )
                temperature = gr.Slider(
                    label="🌡️ Temperature",
                    minimum=0.01,
                    maximum=1.0,
                    value=0.7,
                    step=0.01
                )
                top_p = gr.Slider(
                    label="⚛️ Top-p (Nucleus Sampling)",
                    minimum=0.1,
                    maximum=1.0,
                    value=0.9,
                    step=0.01
                )
            enable_thinking = gr.Checkbox(label="Enable Thinking Mode", value=True)
            generate_button = gr.Button(value="🤖 Petite Elle L'Aime 3")
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(label="🤖 Petite Elle L'Aime 3")

    generate_button.click(
        user,
        [user_input, chatbot],
        [user_input, chatbot],
        queue=False
    ).then(
        bot,
        [chatbot, system_prompt, max_length, temperature, top_p, advanced_checkbox, enable_thinking],
        chatbot
    )

    advanced_checkbox.change(
        fn=lambda x: gr.update(visible=x),
        inputs=[advanced_checkbox],
        outputs=[advanced_settings]
    )


if __name__ == "__main__":
    demo.queue()
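    # ssr_mode=False disables Gradio's server-side rendering; mcp_server=True also
    # exposes the app over the Model Context Protocol (both assumed to be Gradio 5.x launch options).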
    demo.launch(ssr_mode=False, mcp_server=True)