# @title Interface
import os
import re
import subprocess
import uuid

import fitz  # PyMuPDF
import gradio as gr
import nltk
import torch
from bs4 import BeautifulSoup
from ebooklib import epub, ITEM_DOCUMENT
from pydub import AudioSegment
from transformers import pipeline
from TTS.api import TTS

nltk.download("punkt_tab")
nltk.download("punkt")

os.makedirs("audios", exist_ok=True)

SPEAKER_WAV = "voz/voz_clonada_sample.wav"
OUTPUT_DIR = "audios"
MAX_CHARS_PER_SEGMENT = 203
LANGUAGE = "pt"
EMOTION_LABELS = [
    "neutral",
    "happy",
    "sad",
    "angry",
    "excited",
    "sleepy",
    "whispering",
    "shouting",
]

os.environ["COQUI_TOS_AGREED"] = "1"

from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import XttsArgs, XttsAudioConfig
from TTS.config.shared_configs import BaseDatasetConfig

# May be needed on newer PyTorch versions that load checkpoints with weights_only=True:
# torch.serialization.add_safe_globals([XttsConfig, XttsAudioConfig, BaseDatasetConfig, XttsArgs])

print("Carregando modelo de classificação de texto (Zero-Shot)...")
try:
    # Note: zero-shot classification works best with NLI fine-tuned models;
    # a plain BERT base checkpoint may yield unreliable emotion labels.
    classifier = pipeline(
        "zero-shot-classification",
        model="neuralmind/bert-base-portuguese-cased",
        device=0 if torch.cuda.is_available() else -1,
    )
    print("Modelo 'neuralmind/bert-base-portuguese-cased' carregado com sucesso!")
except Exception as e:
    print(f"Erro ao carregar neuralmind/bert-base-portuguese-cased: {e}")
    classifier = None

tts = TTS(
    model_name="tts_models/multilingual/multi-dataset/xtts_v2",
    progress_bar=True,
    gpu=False,
)


def extract_text(file):
    """Extract plain text from an uploaded PDF or EPUB file."""
    ext = os.path.splitext(file.name)[-1].lower()
    text = ""
    if ext == ".pdf":
        doc = fitz.open(file.name)
        for page in doc:
            text += page.get_text()
    elif ext == ".epub":
        book = epub.read_epub(file.name)
        for item in book.get_items():
            if item.get_type() == ITEM_DOCUMENT:
                soup = BeautifulSoup(item.get_content(), "html.parser")
                text += soup.get_text() + "\n\n"
    return text.strip()


def clean_text(text):
    """Strip periods and collapse repeated whitespace before synthesis."""
    text = text.replace(".", "")
    while "  " in text:
        text = text.replace("  ", " ")
    return text.strip()


def segment_text(text, max_chars):
    """
    Split the text into sentences and then into smaller blocks that respect the
    character limit, without cutting words, inserting commas at the cut points.
    """
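    # Overview: the first pass keeps sentences that already fit within max_chars
    # and re-wraps longer sentences word by word; the second pass hard-wraps any
    # block that is still too long, cutting at the last punctuation mark or space
    # it can find. Commas are appended to cuts that do not end in punctuation so
    # the TTS model renders a short pause instead of running the pieces together.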
""" sentences = nltk.sent_tokenize(text, language="portuguese") segments = [] for sentence in sentences: if len(sentence) <= max_chars: segments.append(sentence) else: words = sentence.split(" ") temp_segment = "" for word in words: if ( len(temp_segment) + len(word) + (1 if temp_segment else 0) <= max_chars ): if temp_segment: temp_segment += " " + word else: temp_segment = word else: if temp_segment and not temp_segment.endswith((".", "!", "?", ",")): temp_segment += "," segments.append(temp_segment) temp_segment = word if temp_segment: segments.append(temp_segment) final_segments = [] for segment in segments: if len(segment) > max_chars: start_idx = 0 while start_idx < len(segment): sub_segment = segment[start_idx : start_idx + max_chars] cut_point = len(sub_segment) if cut_point < len(segment): last_space = sub_segment.rfind(" ") last_punctuation = re.search(r"[.?!,;:]", sub_segment[::-1]) if last_punctuation: cut_point = len(sub_segment) - last_punctuation.start() elif last_space != -1: cut_point = last_space if cut_point < len(sub_segment) and sub_segment[cut_point] != " ": pass if cut_point < len(sub_segment) and not sub_segment[ cut_point - 1 ].endswith((".", "!", "?", ",")): sub_segment_to_add = sub_segment[:cut_point].strip() if sub_segment_to_add: sub_segment_to_add += "," final_segments.append(sub_segment_to_add) else: final_segments.append(sub_segment[:cut_point].strip()) start_idx += cut_point else: final_segments.append(sub_segment.strip()) start_idx += max_chars else: final_segments.append(segment.strip()) return [s for s in final_segments if s] def classify_emotion(text, labels): """Classifica a emoção do texto usando o modelo zero-shot.""" if not classifier: return "neutral" result = classifier(text, labels, multi_label=False) predicted_label = result["labels"][0] # print(f"Texto: {text} - Resultado: {result}") # if result['scores'][0] < 0.5: # if "neutra" in labels and result['scores'][labels.index("neutra")] > result['scores'][0]: # return "neutral" # return "neutral" return predicted_label def narrar(texto, progress=gr.Progress()): if not texto: return None, None arquivos_wav_temporarios = [] blocos = segment_text(texto, MAX_CHARS_PER_SEGMENT) tmp_id = uuid.uuid4() mp3_path = f"{OUTPUT_DIR}/{tmp_id}.mp3" pausa_audio = AudioSegment.silent(duration=200) pausa_wav_path = os.path.join(OUTPUT_DIR, f"{tmp_id}_pausa.wav") pausa_audio.export(pausa_wav_path, format="wav") arquivos_wav_temporarios.append(pausa_wav_path) try: for i, bloco in enumerate(blocos): emocao = classify_emotion(bloco, EMOTION_LABELS) wav_temp = f"{OUTPUT_DIR}/{tmp_id}_parte_{i:04d}.wav" arquivos_wav_temporarios.append(wav_temp) tts.tts_to_file( text=clean_text(bloco), speaker_wav=SPEAKER_WAV, language=LANGUAGE, emotion=emocao, file_path=wav_temp, ) progress(((i + 1) / len(blocos))) lista_input_path = os.path.join(OUTPUT_DIR, f"{tmp_id}_input.txt") with open(lista_input_path, "w") as f: for i, bloco in enumerate(blocos): wav_temp = f"{OUTPUT_DIR}/{tmp_id}_parte_{i:04d}.wav" f.write(f"file '{os.path.basename(wav_temp)}'\n") f.write(f"file '{os.path.basename(pausa_wav_path)}'\n") ffmpeg_command = [ "ffmpeg", "-f", "concat", "-safe", "0", "-i", os.path.basename(lista_input_path), "-c:a", "libmp3lame", "-q:a", "0", "-y", os.path.basename(mp3_path), ] subprocess.run(ffmpeg_command, check=True, cwd=OUTPUT_DIR) return mp3_path, mp3_path except subprocess.CalledProcessError as e: print(f"Erro ao executar FFmpeg: {e}") print(f"Stdout: {e.stdout.decode()}") print(f"Stderr: {e.stderr.decode()}") return None, None 
    except Exception as e:
        print(f"Ocorreu um erro: {e}")
        return None, None
    finally:
        # Remove intermediate WAVs and the FFmpeg concat list.
        for wav_file in arquivos_wav_temporarios:
            if os.path.exists(wav_file):
                os.remove(wav_file)
        if os.path.exists(lista_input_path):
            os.remove(lista_input_path)
        print("Arquivos temporários limpos.")


with gr.Blocks() as demo:
    gr.Markdown("## Narrador XTTS com sua voz preferida 🎤")

    with gr.Row():
        arquivo = gr.File(label="Carregar PDF ou EPUB", file_types=[".pdf", ".epub"])
        texto_extraido = gr.Textbox(label="Texto extraído", lines=10)

    carregar_btn = gr.Button("Extrair texto")
    carregar_btn.click(fn=extract_text, inputs=arquivo, outputs=texto_extraido)

    with gr.Row():
        narrar_btn = gr.Button("Narrar")
        audio_saida = gr.Audio(label="Ouvir áudio")
        download_mp3 = gr.File(label="Baixar em MP3")

    narrar_btn.click(
        fn=narrar, inputs=[texto_extraido], outputs=[audio_saida, download_mp3]
    )

demo.launch(debug=True, share=True, pwa=True, mcp_server=True)
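# Runtime prerequisites (assumptions, not checked by the script): the `ffmpeg`
# binary must be available on PATH for the concat step, and SPEAKER_WAV must
# point to an existing reference recording ("voz/voz_clonada_sample.wav").
# Minimal sketch of exercising the pipeline without the Gradio UI (note that
# `narrar` uses a gr.Progress callback, which may require a Gradio request
# context to report progress):
#
#     mp3, _ = narrar("Olá! Este é um teste rápido de narração.")
#     print(mp3)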