import gradio as gr
import soundfile as sf
import tempfile
import os
import time
import numpy as np
import librosa
import re
from pathlib import Path
from llama_cpp import Llama

print("⏳ Đang khởi động VieNeu-TTS...")

# --- CONSTANTS ---
MAX_CHARS_PER_CHUNK = 256
SAMPLE_RATE = 24000

# Model repositories
BACKBONE_REPO = "pnnbao-ump/VieNeu-TTS-q4-gguf"
CODEC_REPO = "neuphonic/neucodec-onnx-decoder"

# Reference voices - adjust the sample directory to match your own layout.
# Every voice has three sibling files that share the voice's display name
# as their stem: the audio (.wav), its transcript (.txt) and codes (.pt).
_SAMPLE_DIR = "./sample"
_VOICE_NAMES = (
    "Vĩnh (nam miền Nam)",
    "Bình (nam miền Bắc)",
    "Ngọc (nữ miền Bắc)",
    "Dung (nữ miền Nam)",
)
VOICE_SAMPLES = {
    voice_name: {
        "audio": f"{_SAMPLE_DIR}/{voice_name}.wav",
        "text": f"{_SAMPLE_DIR}/{voice_name}.txt",
        "codes": f"{_SAMPLE_DIR}/{voice_name}.pt",
    }
    for voice_name in _VOICE_NAMES
}

# --- CORE FUNCTIONS ---
def phonemize_with_dict(text):
    """Phonemize ``text`` via ``utils.phonemize_text.phonemize_with_dict``.

    Placeholder wrapper: the project-local implementation is imported at
    call time (not at module import) and its result is returned unchanged.
    """
    import utils.phonemize_text as phonemize_module
    return phonemize_module.phonemize_with_dict(text)

def split_text_into_chunks(text, max_chars=256):
    """Split ``text`` into stripped chunks of at most ``max_chars`` characters.

    The text is first cut after each of the punctuation marks ``. ! ? , ;``
    and consecutive segments are greedily packed into a chunk while they
    fit. A single segment longer than ``max_chars`` (for example a long
    passage without punctuation) is wrapped at whitespace, and a word that
    is itself longer than ``max_chars`` is hard-broken, so no chunk exceeds
    the limit. Chunks that are empty after stripping are dropped.

    Args:
        text: The input string.
        max_chars: Maximum chunk length. A value below 1 disables the
            wrapping of oversize segments.

    Returns:
        A list of chunk strings; ``[text]`` when no non-blank chunk could
        be produced (e.g. for an empty string).
    """
    import textwrap  # local import: not among this file's top-level imports

    # The capturing group keeps the punctuation, so the list alternates
    # text, punct, text, punct, ..., text and always has odd length.
    pieces = re.split(r'([.!?,;])', text)
    bodies = pieces[0::2]
    puncts = pieces[1::2] + [""]  # the trailing text has no punctuation

    chunks = []
    current = ""

    for body, punct in zip(bodies, puncts):
        segment = body + punct

        if len(current) + len(segment) <= max_chars:
            current += segment
            continue

        # The segment does not fit: close the chunk built so far, ignoring
        # whitespace-only leftovers that would become empty chunks.
        finished = current.strip()
        if finished:
            chunks.append(finished)

        if 0 < max_chars < len(segment):
            # Oversize segment: wrap it so every piece respects the limit.
            # The last piece stays open so later segments can join it.
            wrapped = textwrap.wrap(
                segment.strip(),
                width=max_chars,
                break_long_words=True,
                break_on_hyphens=False,
            )
            chunks.extend(wrapped[:-1])
            current = wrapped[-1] if wrapped else ""
        else:
            current = segment

    finished = current.strip()
    if finished:
        chunks.append(finished)

    return chunks if chunks else [text]

def encode_reference(audio_path, codec):
    """Encode a reference recording into codec codes.

    The audio at ``audio_path`` is loaded with librosa as 16 kHz mono, given
    two leading singleton axes, and passed to ``codec.encode_code``; the
    first two axes of the result are squeezed away before returning.
    """
    waveform, _ = librosa.load(audio_path, sr=16000, mono=True)
    batched = waveform[np.newaxis, np.newaxis, :]  # [1, 1, T]
    encoded = codec.encode_code(audio_or_path=batched)
    return encoded.squeeze(0).squeeze(0)

def decode_audio(codes_str, codec):
    """Turn a string of ``<|speech_N|>`` tokens into a waveform.

    Every ``<|speech_N|>`` occurrence in ``codes_str`` contributes the
    integer ``N``; any other text is ignored. The ids are handed to
    ``codec.decode_code`` as an int32 array of shape ``[1, 1, T]`` and the
    ``[0, 0, :]`` slice of its result is returned.

    Raises:
        ValueError: If ``codes_str`` contains no speech token.
    """
    token_ids = [
        int(match.group(1))
        for match in re.finditer(r"<\|speech_(\d+)\|>", codes_str)
    ]
    if not token_ids:
        raise ValueError("No valid speech tokens found")

    code_array = np.asarray(token_ids, dtype=np.int32).reshape(1, 1, -1)
    waveform = codec.decode_code(code_array)
    return waveform[0, 0, :]

# --- MODEL LOADING ---
print("📦 Đang tải model Q4 GGUF...")

# Stays False unless both the backbone and the codec load without error;
# the rest of the file checks this flag before touching either model.
model_loaded = False
try:
    from neucodec import NeuCodecOnnxDecoder

    backbone_options = {
        "repo_id": BACKBONE_REPO,
        "filename": "*.gguf",
        "verbose": False,
        "n_gpu_layers": -1,  # use the GPU when one is available
        "n_ctx": 2048,
        "mlock": True,
        "flash_attn": True,
    }
    backbone = Llama.from_pretrained(**backbone_options)

    codec = NeuCodecOnnxDecoder.from_pretrained(CODEC_REPO)

    print("✅ Model đã tải thành công!")
    model_loaded = True
except Exception as e:
    # Any failure (missing package, download error, ...) is reported and
    # leaves the app running with synthesis disabled.
    print(f"❌ Lỗi khi tải model: {e}")

# --- SYNTHESIS FUNCTION ---
def _load_reference_codes(voice, codec):
    """Return the reference speech codes of ``voice`` as a flat list of ints.

    The pre-encoded file named by ``voice["codes"]`` is used when it exists
    on disk; otherwise the reference audio is encoded with
    ``encode_reference``.
    """
    # NOTE(review): `codec` is loaded as NeuCodecOnnxDecoder in this file; a
    # decoder-only model presumably cannot run `encode_code`, which is why
    # the pre-encoded `.pt` file is preferred -- confirm against neucodec.
    ref_codes = None
    codes_path = voice.get("codes")
    if codes_path and os.path.exists(codes_path):
        try:
            import torch  # local import: only needed for pre-encoded codes
        except ImportError:
            pass  # fall through to encoding the audio below
        else:
            # torch.load unpickles the file; only the bundled, trusted
            # sample files are expected here.
            ref_codes = torch.load(codes_path, map_location="cpu")

    if ref_codes is None:
        ref_codes = encode_reference(voice["audio"], codec)

    if hasattr(ref_codes, "numpy"):
        ref_codes = ref_codes.numpy()
    # Flatten and coerce to plain ints so each code formats as "<|speech_N|>".
    return [int(code) for code in np.asarray(ref_codes).reshape(-1)]


def synthesize_speech(text, voice_choice):
    """Synthesize ``text`` with the chosen reference voice.

    Generator used as the Gradio click handler. It yields
    ``(audio_path, status_message)`` tuples: ``audio_path`` is ``None`` for
    progress and error updates and the path of the written WAV file in the
    final success update.

    Args:
        text: Text to synthesize; it is stripped and split into chunks of at
            most ``MAX_CHARS_PER_CHUNK`` characters.
        voice_choice: A key of ``VOICE_SAMPLES``.
    """
    if not model_loaded:
        yield None, "⚠️ Model chưa tải. Vui lòng khởi động lại!"
        return

    if not text or not text.strip():
        yield None, "⚠️ Vui lòng nhập văn bản!"
        return

    if voice_choice not in VOICE_SAMPLES:
        yield None, "⚠️ Vui lòng chọn giọng mẫu."
        return

    raw_text = text.strip()
    voice = VOICE_SAMPLES[voice_choice]

    # The audio file is only required when no pre-encoded codes are available.
    codes_path = voice.get("codes")
    has_codes = bool(codes_path) and os.path.exists(codes_path)
    if not has_codes and not os.path.exists(voice["audio"]):
        yield None, "❌ Không tìm thấy file audio mẫu."
        return

    # A missing or unreadable transcript is reported instead of raising out
    # of the generator.
    try:
        with open(voice["text"], "r", encoding="utf-8") as f:
            ref_text_raw = f.read()
    except OSError as e:
        yield None, f"❌ Lỗi xử lý reference: {e}"
        return

    yield None, "📄 Đang xử lý Reference..."

    try:
        ref_codes = _load_reference_codes(voice, codec)
    except Exception as e:
        yield None, f"❌ Lỗi xử lý reference: {e}"
        return

    text_chunks = split_text_into_chunks(raw_text, max_chars=MAX_CHARS_PER_CHUNK)
    total_chunks = len(text_chunks)

    yield None, f"🚀 Bắt đầu tổng hợp ({total_chunks} đoạn)..."

    all_audio_segments = []
    silence_pad = np.zeros(int(SAMPLE_RATE * 0.15), dtype=np.float32)

    start_time = time.time()

    try:
        # The reference prompt parts are identical for every chunk, so they
        # are computed once instead of once per chunk.
        ref_text_phoneme = phonemize_with_dict(ref_text_raw)
        codes_str = "".join(f"<|speech_{code}|>" for code in ref_codes)

        for i, chunk in enumerate(text_chunks):
            yield None, f"⏳ Đang xử lý đoạn {i+1}/{total_chunks}..."

            input_text_phoneme = phonemize_with_dict(chunk)

            prompt = (
                f"user: Convert the text to speech:<|TEXT_PROMPT_START|>{ref_text_phoneme} {input_text_phoneme}"
                f"<|TEXT_PROMPT_END|>\nassistant:<|SPEECH_GENERATION_START|>{codes_str}"
            )

            output = backbone(
                prompt,
                max_tokens=2048,
                temperature=1.0,
                top_k=50,
                stop=["<|SPEECH_GENERATION_END|>"],
            )
            output_str = output["choices"][0]["text"]

            # Skip a chunk for which the model produced no speech tokens
            # rather than aborting the whole request.
            if not re.search(r"<\|speech_\d+\|>", output_str):
                yield None, f"⚠️ Đoạn {i+1}/{total_chunks} không sinh được audio, bỏ qua."
                continue

            # Decode only the generated tokens. Decoding the reference codes
            # as well would prepend the reference utterance to every chunk.
            chunk_wav = decode_audio(output_str, codec)
            if chunk_wav is None or len(chunk_wav) == 0:
                continue

            # Short pause between consecutive chunks, none before the first.
            if all_audio_segments:
                all_audio_segments.append(silence_pad)
            all_audio_segments.append(chunk_wav)

        if not all_audio_segments:
            yield None, "❌ Không sinh được audio nào."
            return

        yield None, "💾 Đang ghép file và lưu..."

        final_wav = np.concatenate(all_audio_segments)
        # Reserve a temp file name, close the handle, then let soundfile
        # write to the path. delete=False keeps the file for Gradio to serve.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
            output_path = tmp.name
        sf.write(output_path, final_wav, SAMPLE_RATE)

        process_time = time.time() - start_time
        yield output_path, f"✅ Hoàn tất! (Tổng thời gian: {process_time:.2f}s)"

    except Exception as e:
        # UI boundary: log the traceback and show the error in the status box.
        import traceback
        traceback.print_exc()
        yield None, f"❌ Lỗi: {str(e)}"

# --- UI SETUP ---
# Soft base theme with indigo/cyan hues and Inter as the preferred font.
_ui_fonts = [gr.themes.GoogleFont('Inter'), 'ui-sans-serif', 'system-ui']
theme = gr.themes.Soft(
    primary_hue="indigo",
    secondary_hue="cyan",
    font=_ui_fonts,
)

# Styles for the header banner rendered through gr.HTML.
css = """
.header-box {
    text-align: center;
    margin-bottom: 25px;
    padding: 25px;
    background: linear-gradient(135deg, #0f172a 0%, #1e293b 100%);
    border-radius: 12px;
    color: white;
}
.gradient-text {
    background: -webkit-linear-gradient(45deg, #60A5FA, #22D3EE);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    font-size: 2.5rem;
    font-weight: 800;
}
"""

# Status line shown at the top of the page, reflecting the load outcome.
if model_loaded:
    initial_status = "✅ Model Q4 GGUF đã sẵn sàng!"
else:
    initial_status = "❌ Lỗi khi tải model"

# Page layout: header banner, load status, then an input column (text,
# voice, button) next to an output column (audio player, status box).
with gr.Blocks(title="VieNeu-TTS") as demo:
    gr.HTML("""
    <div class="header-box">
        <h1 class="gradient-text">🦜 VieNeu-TTS Studio</h1>
        <p>Chạy với Q4 GGUF + ONNX Codec</p>
    </div>
    """)

    gr.Markdown(initial_status)

    with gr.Row():
        with gr.Column(scale=3):
            text_input = gr.Textbox(
                label=f"Văn bản (tối đa {MAX_CHARS_PER_CHUNK} ký tự/chunk)",
                lines=6,
                value="Hà Nội, trái tim của Việt Nam, là một thành phố ngàn năm văn hiến với bề dày lịch sử và văn hóa độc đáo.",
            )

            # The first configured voice is preselected.
            voice_names = list(VOICE_SAMPLES)
            voice_select = gr.Dropdown(
                choices=voice_names,
                value=voice_names[0],
                label="Chọn giọng mẫu",
            )

            # The button is disabled when the models failed to load.
            btn_generate = gr.Button(
                "🎵 Bắt đầu tổng hợp",
                variant="primary",
                size="lg",
                interactive=model_loaded,
            )

        with gr.Column(scale=2):
            audio_output = gr.Audio(label="Kết quả", type="filepath", autoplay=True)
            status_output = gr.Textbox(label="Trạng thái")

    # synthesize_speech is a generator: each yielded (audio, status) pair
    # updates the two output components.
    btn_generate.click(
        fn=synthesize_speech,
        inputs=[text_input, voice_select],
        outputs=[audio_output, status_output],
    )

if __name__ == "__main__":
    # Enable the request queue, then serve on all interfaces at port 7860
    # with the theme and CSS defined above.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "theme": theme,
        "css": css,
    }
    demo.queue().launch(**launch_options)