import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import re
import json
from typing import List, Dict, Any, Optional
import logging
import spaces
import os

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Model configuration
MAIN_MODEL_ID = "Tonic/petite-elle-L-aime-3-sft"  # Main repo for config and chat template
INT4_MODEL_ID = "Tonic/petite-elle-L-aime-3-sft/int4"  # Int4 quantized model
LOCAL_MODEL_PATH = "./int4"  # Local int4 weights
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Global variables for model and tokenizer
model = None
tokenizer = None

# Default system prompt
DEFAULT_SYSTEM_PROMPT = "Tu es TonicIA, un assistant francophone rigoureux et bienveillant."

# Title and description content
title = "# 🤖 Petite Elle L'Aime 3 - Chat Interface"
description = "A fine-tuned version of SmolLM3-3B optimized for French and multilingual conversations. This is the int4 quantized version for efficient CPU deployment."

presentation1 = """
### 🎯 Features
- **Multilingual Support**: English, French, Italian, Portuguese, Chinese, Arabic
- **Int4 Quantization**: Optimized for CPU deployment with ~50% memory reduction
- **Interactive Chat Interface**: Real-time conversation with the model
- **Customizable System Prompt**: Define the assistant's personality and behavior
- **Thinking Mode**: Enable reasoning mode with thinking tags
"""

presentation2 = """
### 📋 Model Information
- **Base Model**: SmolLM3-3B
- **Parameters**: ~3B
- **Context Length**: 128k
- **Languages**: English, French, Italian, Portuguese, Chinese, Arabic
- **Device**: CPU optimized
- **Quantization**: int4
"""

joinus = """
### 🚀 Quick Start
1. Add context in the system prompt
2. Type your message
3. Click generate to start chatting
4. Use advanced settings for fine-tuning
"""
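
# NOTE: the int4 weights are expected to land in LOCAL_MODEL_PATH ("./int4") at
# build time; load_model() below falls back to pulling the tokenizer from
# MAIN_MODEL_ID and the weights from INT4_MODEL_ID on the Hub when the local
# copy is missing or incomplete.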

def check_local_model():
    """Check if local int4 model files exist"""
    required_files = [
        "config.json",
        "pytorch_model.bin",
        "tokenizer.json",
        "tokenizer_config.json"
    ]
    for file in required_files:
        file_path = os.path.join(LOCAL_MODEL_PATH, file)
        if not os.path.exists(file_path):
            logger.warning(f"Missing required file: {file_path}")
            return False
    logger.info("All required model files found locally")
    return True


def load_model():
    """Load the model and tokenizer"""
    global model, tokenizer
    try:
        # Check if local model exists (downloaded during build)
        if check_local_model():
            logger.info(f"Loading tokenizer from {LOCAL_MODEL_PATH}")
            tokenizer = AutoTokenizer.from_pretrained(LOCAL_MODEL_PATH)

            logger.info(f"Loading int4 model from {LOCAL_MODEL_PATH}")
            model = AutoModelForCausalLM.from_pretrained(
                LOCAL_MODEL_PATH,
                device_map="auto" if DEVICE == "cuda" else "cpu",
                torch_dtype=torch.bfloat16,
                trust_remote_code=True
            )
        else:
            logger.info(f"Local model not found, loading from {MAIN_MODEL_ID}")
            # Load tokenizer from main repo (for chat template and config)
            tokenizer = AutoTokenizer.from_pretrained(MAIN_MODEL_ID)

            logger.info(f"Loading int4 model from {INT4_MODEL_ID}")
            # Load model with int4 quantization from Hugging Face
            model = AutoModelForCausalLM.from_pretrained(
                INT4_MODEL_ID,
                device_map="auto" if DEVICE == "cuda" else "cpu",
                torch_dtype=torch.bfloat16,
                trust_remote_code=True
            )

        # Set pad token if not present
        if tokenizer.pad_token_id is None:
            tokenizer.pad_token_id = tokenizer.eos_token_id

        logger.info("Model loaded successfully")
        return True
    except Exception as e:
        logger.error(f"Error loading model: {e}")
        return False


def create_prompt(system_message, user_message, enable_thinking=True):
    """Create prompt using the model's chat template"""
    try:
        # Prepare messages for the template
        formatted_messages = []

        # Add system message if provided
        if system_message and system_message.strip():
            formatted_messages.append({"role": "system", "content": system_message})

        # Add user message
        formatted_messages.append({"role": "user", "content": user_message})

        # Apply the chat template
        prompt = tokenizer.apply_chat_template(
            formatted_messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=enable_thinking
        )

        # Add /no_think to the end of prompt when thinking is disabled
        if not enable_thinking:
            prompt += " /no_think"

        return prompt
    except Exception as e:
        logger.error(f"Error creating prompt: {e}")
        return ""


@spaces.GPU(duration=94)
def generate_response(message, history, system_message, max_tokens, temperature, top_p, do_sample, enable_thinking=True):
    """Generate response using the model"""
    global model, tokenizer

    if model is None or tokenizer is None:
        return "Error: Model not loaded. Please wait for the model to load."

    try:
        # Create prompt using chat template
        full_prompt = create_prompt(system_message, message, enable_thinking)
        if not full_prompt:
            return "Error: Failed to create prompt."
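
        # The rest of this function tokenizes the prompt, runs generation on the
        # selected device, and strips the echoed prompt from the decoded output.
        # Slicing by len(full_prompt) assumes decode() reproduces the prompt text verbatim.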
        # Tokenize the input
        inputs = tokenizer(full_prompt, return_tensors="pt", padding=True, truncation=True)

        # Move to device
        if DEVICE == "cuda":
            inputs = {k: v.cuda() for k, v in inputs.items()}

        # Generate response
        with torch.no_grad():
            output_ids = model.generate(
                inputs['input_ids'],
                max_new_tokens=max_tokens,
                temperature=temperature,
                top_p=top_p,
                do_sample=do_sample,
                attention_mask=inputs['attention_mask'],
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id
            )

        # Decode the response
        response = tokenizer.decode(output_ids[0], skip_special_tokens=True)

        # Extract only the new response (remove the input prompt)
        assistant_response = response[len(full_prompt):].strip()

        # Clean up the response - only remove special tokens, preserve thinking tags when enabled
        assistant_response = re.sub(r'<\|im_start\|>.*?<\|im_end\|>', '', assistant_response, flags=re.DOTALL)

        # Only remove thinking tags if thinking mode is disabled
        if not enable_thinking:
            assistant_response = re.sub(r'<think>.*?</think>', '', assistant_response, flags=re.DOTALL)

        assistant_response = assistant_response.strip()
        return assistant_response
    except Exception as e:
        logger.error(f"Error generating response: {e}")
        return f"Error generating response: {str(e)}"


def user(user_message, history):
    """Add user message to history"""
    return "", history + [[user_message, None]]


def bot(history, system_prompt, max_length, temperature, top_p, advanced_checkbox, enable_thinking):
    """Generate bot response"""
    user_message = history[-1][0]
    do_sample = advanced_checkbox
    bot_message = generate_response(user_message, history, system_prompt, max_length, temperature, top_p, do_sample, enable_thinking)
    history[-1][1] = bot_message
    return history


# Load model on startup
logger.info("Starting model loading process...")
load_model()

# Create Gradio interface
with gr.Blocks() as demo:
    with gr.Row():
        gr.Markdown(title)
    with gr.Row():
        gr.Markdown(description)
    with gr.Row():
        with gr.Column(scale=1):
            with gr.Group():
                gr.Markdown(presentation1)
        with gr.Column(scale=1):
            with gr.Group():
                gr.Markdown(presentation2)
    with gr.Row():
        with gr.Column(scale=1):
            with gr.Group():
                gr.Markdown(joinus)
        with gr.Column(scale=1):
            pass  # Empty column for balance
    with gr.Row():
        with gr.Column(scale=2):
            system_prompt = gr.TextArea(
                label="📑 Context",
                placeholder="Tu es TonicIA, un assistant francophone rigoureux et bienveillant.",
                lines=5,
                value=DEFAULT_SYSTEM_PROMPT
            )
            user_input = gr.TextArea(
                label="🤷🏻‍♂️ User Input",
                placeholder="Hi there my name is Tonic!",
                lines=2
            )
            advanced_checkbox = gr.Checkbox(label="🧪 Advanced Settings", value=False)
            with gr.Column(visible=False) as advanced_settings:
                max_length = gr.Slider(
                    label="📏 Max Length",
                    minimum=64,
                    maximum=2048,
                    value=512,
                    step=64
                )
                temperature = gr.Slider(
                    label="🌡️ Temperature",
                    minimum=0.01,
                    maximum=1.0,
                    value=0.7,
                    step=0.01
                )
                top_p = gr.Slider(
                    label="⚛️ Top-p (Nucleus Sampling)",
                    minimum=0.1,
                    maximum=1.0,
                    value=0.9,
                    step=0.01
                )
            enable_thinking = gr.Checkbox(label="Enable Thinking Mode", value=True)
            generate_button = gr.Button(value="🤖 Petite Elle L'Aime 3")
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(label="🤖 Petite Elle L'Aime 3")

    generate_button.click(
        user,
        [user_input, chatbot],
        [user_input, chatbot],
        queue=False
    ).then(
        bot,
        [chatbot, system_prompt, max_length, temperature, top_p, advanced_checkbox, enable_thinking],
        chatbot
    )

    advanced_checkbox.change(
        fn=lambda x: gr.update(visible=x),
        inputs=[advanced_checkbox],
        outputs=[advanced_settings]
    )


if __name__ == "__main__":
    demo.queue()
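    # ssr_mode=False disables Gradio's server-side rendering; mcp_server=True also
    # exposes the app over the Model Context Protocol (both assumed to be Gradio 5.x launch options).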
    demo.launch(ssr_mode=False, mcp_server=True)