# @title Interface
import gradio as gr
from TTS.api import TTS
import fitz  # PyMuPDF
from ebooklib import epub
from bs4 import BeautifulSoup
import os
import re
from pydub import AudioSegment
import textwrap
import uuid
import torch
from ebooklib import ITEM_DOCUMENT
import nltk
import subprocess

nltk.download("punkt_tab")
nltk.download("punkt")
from transformers import pipeline

os.makedirs("audios", exist_ok=True)

SPEAKER_WAV = "voz/voz_clonada_sample.wav"
OUTPUT_DIR = "audios"
MAX_CHARS_PER_SEGMENT = 203
LANGUAGE = "pt"
EMOTION_LABELS = [
    "neutral",
    "happy",
    "sad",
    "angry",
    "excited",
    "sleepy",
    "whispering",
    "shouting",
]

os.environ["COQUI_TOS_AGREED"] = "1"

from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import XttsArgs
from TTS.config.shared_configs import BaseDatasetConfig
from TTS.tts.models.xtts import XttsAudioConfig

# torch.serialization.add_safe_globals([XttsConfig, XttsAudioConfig, BaseDatasetConfig, XttsArgs])

print("Carregando modelo de classificação de texto (Zero-Shot)...")
try:
    classifier = pipeline(
        "zero-shot-classification",
        model="neuralmind/bert-base-portuguese-cased",
        device=0 if torch.cuda.is_available() else -1,
    )
    print("Modelo 'neuralmind/bert-base-portuguese-cased' carregado com sucesso!")
except Exception as e:
    print(f"Erro ao carregar neuralmind/bert-base-portuguese-cased: {e}")
    classifier = None

tts = TTS(
    model_name="tts_models/multilingual/multi-dataset/xtts_v2",
    progress_bar=True,
    gpu=False,
)


def extract_text(file):
    ext = os.path.splitext(file.name)[-1].lower()
    text = ""
    if ext == ".pdf":
        doc = fitz.open(file.name)
        for page in doc:
            text += page.get_text()
    elif ext == ".epub":
        book = epub.read_epub(file.name)
        for item in book.get_items():
            if item.get_type() == ITEM_DOCUMENT:
                soup = BeautifulSoup(item.get_content(), "html.parser")
                text += soup.get_text() + "\n\n"
    return text.strip()


def clean_text(text):
    """Remove as marcações de fonte do texto."""
    while "." in text:
        text = text.replace(".", "")
    while "  " in text:
        text = text.replace("  ", " ")
    return text.strip()


def segment_text(text, max_chars):
    """
    Segmenta o texto em frases e, em seguida, em blocos menores
    respeitando o limite de caracteres, sem cortar palavras e inserindo vírgulas.
    """
    sentences = nltk.sent_tokenize(text, language="portuguese")
    segments = []

    for sentence in sentences:
        if len(sentence) <= max_chars:
            segments.append(sentence)
        else:
            words = sentence.split(" ")
            temp_segment = ""
            for word in words:
                if (
                    len(temp_segment) + len(word) + (1 if temp_segment else 0)
                    <= max_chars
                ):
                    if temp_segment:
                        temp_segment += " " + word
                    else:
                        temp_segment = word
                else:
                    if temp_segment and not temp_segment.endswith((".", "!", "?", ",")):
                        temp_segment += ","
                    segments.append(temp_segment)
                    temp_segment = word
            if temp_segment:
                segments.append(temp_segment)

    final_segments = []
    for segment in segments:
        if len(segment) > max_chars:
            start_idx = 0
            while start_idx < len(segment):
                sub_segment = segment[start_idx : start_idx + max_chars]

                cut_point = len(sub_segment)
                if cut_point < len(segment):
                    last_space = sub_segment.rfind(" ")

                    last_punctuation = re.search(r"[.?!,;:]", sub_segment[::-1])

                    if last_punctuation:
                        cut_point = len(sub_segment) - last_punctuation.start()
                    elif last_space != -1:
                        cut_point = last_space

                    if cut_point < len(sub_segment) and sub_segment[cut_point] != " ":
                        pass

                    if cut_point < len(sub_segment) and not sub_segment[
                        cut_point - 1
                    ].endswith((".", "!", "?", ",")):
                        sub_segment_to_add = sub_segment[:cut_point].strip()
                        if sub_segment_to_add:
                            sub_segment_to_add += ","
                        final_segments.append(sub_segment_to_add)
                    else:
                        final_segments.append(sub_segment[:cut_point].strip())
                    start_idx += cut_point
                else:
                    final_segments.append(sub_segment.strip())
                    start_idx += max_chars

        else:
            final_segments.append(segment.strip())

    return [s for s in final_segments if s]


def classify_emotion(text, labels):
    """Classifica a emoção do texto usando o modelo zero-shot."""
    if not classifier:
        return "neutral"

    result = classifier(text, labels, multi_label=False)

    predicted_label = result["labels"][0]
    # print(f"Texto: {text} - Resultado: {result}")

    # if result['scores'][0] < 0.5:
    #    if "neutra" in labels and result['scores'][labels.index("neutra")] > result['scores'][0]:
    #        return "neutral"
    #    return "neutral"

    return predicted_label


def narrar(texto, progress=gr.Progress()):
    if not texto:
        return None, None

    arquivos_wav_temporarios = []
    blocos = segment_text(texto, MAX_CHARS_PER_SEGMENT)
    tmp_id = uuid.uuid4()
    mp3_path = f"{OUTPUT_DIR}/{tmp_id}.mp3"
    pausa_audio = AudioSegment.silent(duration=200)
    pausa_wav_path = os.path.join(OUTPUT_DIR, f"{tmp_id}_pausa.wav")
    pausa_audio.export(pausa_wav_path, format="wav")
    arquivos_wav_temporarios.append(pausa_wav_path)

    try:
        for i, bloco in enumerate(blocos):
            emocao = classify_emotion(bloco, EMOTION_LABELS)
            wav_temp = f"{OUTPUT_DIR}/{tmp_id}_parte_{i:04d}.wav"
            arquivos_wav_temporarios.append(wav_temp)

            tts.tts_to_file(
                text=clean_text(bloco),
                speaker_wav=SPEAKER_WAV,
                language=LANGUAGE,
                emotion=emocao,
                file_path=wav_temp,
            )

            progress(((i + 1) / len(blocos)))

        lista_input_path = os.path.join(OUTPUT_DIR, f"{tmp_id}_input.txt")
        with open(lista_input_path, "w") as f:
            for i, bloco in enumerate(blocos):
                wav_temp = f"{OUTPUT_DIR}/{tmp_id}_parte_{i:04d}.wav"
                f.write(f"file '{os.path.basename(wav_temp)}'\n")
                f.write(f"file '{os.path.basename(pausa_wav_path)}'\n")

        ffmpeg_command = [
            "ffmpeg",
            "-f",
            "concat",
            "-safe",
            "0",
            "-i",
            os.path.basename(lista_input_path),
            "-c:a",
            "libmp3lame",
            "-q:a",
            "0",
            "-y",
            os.path.basename(mp3_path),
        ]

        subprocess.run(ffmpeg_command, check=True, cwd=OUTPUT_DIR)

        return mp3_path, mp3_path

    except subprocess.CalledProcessError as e:
        print(f"Erro ao executar FFmpeg: {e}")
        print(f"Stdout: {e.stdout.decode()}")
        print(f"Stderr: {e.stderr.decode()}")
        return None, None
    except Exception as e:
        print(f"Ocorreu um erro: {e}")
        return None, None
    finally:
        for wav_file in arquivos_wav_temporarios:
            if os.path.exists(wav_file):
                os.remove(wav_file)
        if os.path.exists(lista_input_path):
            os.remove(lista_input_path)
        print("Arquivos temporários limpos.")


with gr.Blocks() as demo:
    gr.Markdown("## Narrador XTTS com sua voz preferida 🎤")

    with gr.Row():
        arquivo = gr.File(label="Carregar PDF ou EPUB", file_types=[".pdf", ".epub"])

    texto_extraido = gr.Textbox(label="Texto extraído", lines=10)
    carregar_btn = gr.Button("Extrair texto")

    carregar_btn.click(fn=extract_text, inputs=arquivo, outputs=texto_extraido)

    with gr.Row():
        narrar_btn = gr.Button("Narrar")
        audio_saida = gr.Audio(label="Ouvir áudio")
        download_mp3 = gr.File(label="Baixar em MP3")

    narrar_btn.click(
        fn=narrar, inputs=[texto_extraido], outputs=[audio_saida, download_mp3]
    )

demo.launch(debug=True, share=True, pwa=True, mcp_server=True)