import gradio as gr import soundfile as sf import tempfile import os import time import numpy as np import librosa import re from pathlib import Path from llama_cpp import Llama print("⏳ Đang khởi động VieNeu-TTS...") # --- CONSTANTS --- MAX_CHARS_PER_CHUNK = 256 SAMPLE_RATE = 24000 # Đường dẫn model BACKBONE_REPO = "pnnbao-ump/VieNeu-TTS-q4-gguf" CODEC_REPO = "neuphonic/neucodec-onnx-decoder" # Giọng mẫu - cập nhật đường dẫn theo thư mục của bạn VOICE_SAMPLES = { "Vĩnh (nam miền Nam)": { "audio": "./sample/Vĩnh (nam miền Nam).wav", "text": "./sample/Vĩnh (nam miền Nam).txt", "codes": "./sample/Vĩnh (nam miền Nam).pt" }, "Bình (nam miền Bắc)": { "audio": "./sample/Bình (nam miền Bắc).wav", "text": "./sample/Bình (nam miền Bắc).txt", "codes": "./sample/Bình (nam miền Bắc).pt" }, "Ngọc (nữ miền Bắc)": { "audio": "./sample/Ngọc (nữ miền Bắc).wav", "text": "./sample/Ngọc (nữ miền Bắc).txt", "codes": "./sample/Ngọc (nữ miền Bắc).pt" }, "Dung (nữ miền Nam)": { "audio": "./sample/Dung (nữ miền Nam).wav", "text": "./sample/Dung (nữ miền Nam).txt", "codes": "./sample/Dung (nữ miền Nam).pt" } } # --- CORE FUNCTIONS --- def phonemize_with_dict(text): """Placeholder - thay bằng function thực tế từ utils""" # Import function phonemize thực tế của bạn from utils.phonemize_text import phonemize_with_dict as phonemize_real return phonemize_real(text) def split_text_into_chunks(text, max_chars=256): """Chia text thành chunks""" sentences = re.split(r'([.!?,;])', text) chunks = [] current = "" for i in range(0, len(sentences), 2): sentence = sentences[i] punct = sentences[i+1] if i+1 < len(sentences) else "" segment = sentence + punct if len(current) + len(segment) <= max_chars: current += segment else: if current: chunks.append(current.strip()) current = segment if current: chunks.append(current.strip()) return chunks if chunks else [text] def encode_reference(audio_path, codec): """Encode reference audio""" wav, _ = librosa.load(audio_path, sr=16000, mono=True) wav_tensor = np.expand_dims(np.expand_dims(wav, 0), 0) # [1, 1, T] ref_codes = codec.encode_code(audio_or_path=wav_tensor).squeeze(0).squeeze(0) return ref_codes def decode_audio(codes_str, codec): """Decode speech tokens to audio""" speech_ids = [int(num) for num in re.findall(r"<\|speech_(\d+)\|>", codes_str)] if len(speech_ids) == 0: raise ValueError("No valid speech tokens found") codes = np.array(speech_ids, dtype=np.int32)[np.newaxis, np.newaxis, :] recon = codec.decode_code(codes) return recon[0, 0, :] # --- MODEL LOADING --- print("📦 Đang tải model Q4 GGUF...") try: from neucodec import NeuCodecOnnxDecoder backbone = Llama.from_pretrained( repo_id=BACKBONE_REPO, filename="*.gguf", verbose=False, n_gpu_layers=-1, # Dùng GPU nếu có n_ctx=2048, mlock=True, flash_attn=True, ) codec = NeuCodecOnnxDecoder.from_pretrained(CODEC_REPO) print("✅ Model đã tải thành công!") model_loaded = True except Exception as e: print(f"❌ Lỗi khi tải model: {e}") model_loaded = False # --- SYNTHESIS FUNCTION --- def synthesize_speech(text, voice_choice): """Main synthesis function""" if not model_loaded: yield None, "⚠️ Model chưa tải. Vui lòng khởi động lại!" return if not text or text.strip() == "": yield None, "⚠️ Vui lòng nhập văn bản!" return if voice_choice not in VOICE_SAMPLES: yield None, "⚠️ Vui lòng chọn giọng mẫu." return raw_text = text.strip() # Load reference ref_audio_path = VOICE_SAMPLES[voice_choice]["audio"] ref_text_path = VOICE_SAMPLES[voice_choice]["text"] if not os.path.exists(ref_audio_path): yield None, "❌ Không tìm thấy file audio mẫu." return with open(ref_text_path, "r", encoding="utf-8") as f: ref_text_raw = f.read() yield None, "📄 Đang xử lý Reference..." # Encode reference try: ref_codes = encode_reference(ref_audio_path, codec) if hasattr(ref_codes, 'numpy'): ref_codes = ref_codes.numpy() except Exception as e: yield None, f"❌ Lỗi xử lý reference: {e}" return # Split text text_chunks = split_text_into_chunks(raw_text, max_chars=MAX_CHARS_PER_CHUNK) total_chunks = len(text_chunks) yield None, f"🚀 Bắt đầu tổng hợp ({total_chunks} đoạn)..." all_audio_segments = [] silence_pad = np.zeros(int(SAMPLE_RATE * 0.15), dtype=np.float32) start_time = time.time() try: for i, chunk in enumerate(text_chunks): yield None, f"⏳ Đang xử lý đoạn {i+1}/{total_chunks}..." # Phonemize ref_text_phoneme = phonemize_with_dict(ref_text_raw) input_text_phoneme = phonemize_with_dict(chunk) # Create prompt codes_str = "".join([f"<|speech_{idx}|>" for idx in ref_codes]) prompt = ( f"user: Convert the text to speech:<|TEXT_PROMPT_START|>{ref_text_phoneme} {input_text_phoneme}" f"<|TEXT_PROMPT_END|>\nassistant:<|SPEECH_GENERATION_START|>{codes_str}" ) # Generate output = backbone( prompt, max_tokens=2048, temperature=1.0, top_k=50, stop=["<|SPEECH_GENERATION_END|>"], ) output_str = output["choices"][0]["text"] # Decode chunk_wav = decode_audio(codes_str + output_str, codec) if chunk_wav is not None and len(chunk_wav) > 0: all_audio_segments.append(chunk_wav) if i < total_chunks - 1: all_audio_segments.append(silence_pad) if not all_audio_segments: yield None, "❌ Không sinh được audio nào." return yield None, "💾 Đang ghép file và lưu..." final_wav = np.concatenate(all_audio_segments) with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp: sf.write(tmp.name, final_wav, SAMPLE_RATE) output_path = tmp.name process_time = time.time() - start_time yield output_path, f"✅ Hoàn tất! (Tổng thời gian: {process_time:.2f}s)" except Exception as e: import traceback traceback.print_exc() yield None, f"❌ Lỗi: {str(e)}" # --- UI SETUP --- theme = gr.themes.Soft( primary_hue="indigo", secondary_hue="cyan", font=[gr.themes.GoogleFont('Inter'), 'ui-sans-serif', 'system-ui'], ) css = """ .header-box { text-align: center; margin-bottom: 25px; padding: 25px; background: linear-gradient(135deg, #0f172a 0%, #1e293b 100%); border-radius: 12px; color: white; } .gradient-text { background: -webkit-linear-gradient(45deg, #60A5FA, #22D3EE); -webkit-background-clip: text; -webkit-text-fill-color: transparent; font-size: 2.5rem; font-weight: 800; } """ initial_status = "✅ Model Q4 GGUF đã sẵn sàng!" if model_loaded else "❌ Lỗi khi tải model" with gr.Blocks(title="VieNeu-TTS") as demo: gr.HTML("""
Chạy với Q4 GGUF + ONNX Codec