Spaces:

Arrcttacsrks
/

VieNeu-TTS-Run-On-CPU2

Sleeping

App Files Files Community

VieNeu-TTS-Run-On-CPU2 / app.py

pnnbao-ump

Update app.py

f957b20 verified 11 days ago

raw

history blame

9.26 kB

	import gradio as gr
	import soundfile as sf
	import tempfile
	import os
	import time
	import numpy as np
	import librosa
	import re
	from pathlib import Path
	from llama_cpp import Llama

	print("⏳ Đang khởi động VieNeu-TTS...")

	# --- CONSTANTS ---
	# Upper bound on the number of characters in one synthesis chunk.
	MAX_CHARS_PER_CHUNK = 256
	# Sample rate (Hz) used for the silence padding and for the written WAV file.
	SAMPLE_RATE = 24000

	# Model repositories handed to the respective from_pretrained() loaders.
	BACKBONE_REPO = "pnnbao-ump/VieNeu-TTS-q4-gguf"
	CODEC_REPO = "neuphonic/neucodec-onnx-decoder"

	# Reference voices - adjust the paths to match your own sample directory.
	# Each voice maps to three sibling files under ./sample that share its
	# display name and differ only in extension.
	VOICE_SAMPLES = {
	    voice: {
	        "audio": f"./sample/{voice}.wav",
	        "text": f"./sample/{voice}.txt",
	        "codes": f"./sample/{voice}.pt",
	    }
	    for voice in (
	        "Vĩnh (nam miền Nam)",
	        "Bình (nam miền Bắc)",
	        "Ngọc (nữ miền Bắc)",
	        "Dung (nữ miền Nam)",
	    )
	}

	# --- CORE FUNCTIONS ---
	def phonemize_with_dict(text):
	    """Phonemize ``text`` by delegating to ``utils.phonemize_text``.

	    The project's own ``phonemize_with_dict`` is imported inside the function,
	    so the import is resolved when the function is called rather than when
	    this module is loaded. The delegate's return value is passed through
	    unchanged.
	    """
	    from utils.phonemize_text import phonemize_with_dict as _delegate

	    result = _delegate(text)
	    return result

	def split_text_into_chunks(text, max_chars=256):
	    """Split ``text`` into chunks of at most ``max_chars`` characters.

	    The text is first cut after each of the punctuation marks ``. ! ? , ;``
	    (the mark stays attached to the text before it). Consecutive segments are
	    then packed greedily into chunks. A segment that is itself longer than
	    ``max_chars`` is broken up further, at whitespace where possible, so that
	    no returned chunk exceeds the limit.

	    Args:
	        text: The text to split.
	        max_chars: Maximum length of a chunk; must be at least 1.

	    Returns:
	        A non-empty list of chunks with surrounding whitespace stripped.
	        Chunks that would be empty are dropped; if nothing remains, the list
	        holds the stripped input as its single element.

	    Raises:
	        ValueError: If ``max_chars`` is smaller than 1.
	    """
	    if max_chars < 1:
	        raise ValueError("max_chars must be at least 1")

	    # With one capturing group, re.split alternates text and punctuation and
	    # always ends on a text item, so the punctuation list is one shorter.
	    pieces = re.split(r'([.!?,;])', text)
	    segments = []
	    for body, punct in zip(pieces[::2], pieces[1::2] + [""]):
	        segments.extend(_split_oversized(body + punct, max_chars))

	    chunks = []
	    current = ""
	    for segment in segments:
	        if len(current) + len(segment) <= max_chars:
	            current += segment
	            continue
	        finished = current.strip()
	        if finished:  # never emit a whitespace-only chunk
	            chunks.append(finished)
	        current = segment

	    finished = current.strip()
	    if finished:
	        chunks.append(finished)

	    return chunks or [text.strip()]


	def _split_oversized(segment, max_chars):
	    """Break ``segment`` into pieces no longer than ``max_chars``, preferring whitespace boundaries."""
	    if len(segment) <= max_chars:
	        return [segment]

	    pieces = []
	    current = ""
	    # Tokens alternate between runs of whitespace and runs of non-whitespace,
	    # so concatenating the pieces reproduces the segment exactly.
	    for token in re.findall(r"\s+|\S+", segment):
	        if len(current) + len(token) <= max_chars:
	            current += token
	            continue
	        if current:
	            pieces.append(current)
	        # A single token longer than the limit has no natural break point.
	        while len(token) > max_chars:
	            pieces.append(token[:max_chars])
	            token = token[max_chars:]
	        current = token
	    if current:
	        pieces.append(current)
	    return pieces

	def encode_reference(audio_path, codec):
	    """Encode a reference recording into codec codes.

	    The file at ``audio_path`` is loaded with librosa as 16 kHz mono audio,
	    given two leading singleton axes and passed to ``codec.encode_code``.
	    The first two axes of the encoder output are squeezed away before the
	    result is returned.
	    """
	    samples, _rate = librosa.load(audio_path, sr=16000, mono=True)
	    batched = samples[np.newaxis, np.newaxis, ...]  # [1, 1, T]
	    encoded = codec.encode_code(audio_or_path=batched)
	    return encoded.squeeze(0).squeeze(0)

	def decode_audio(codes_str, codec):
	    """Decode the speech tokens embedded in ``codes_str`` into a waveform.

	    Args:
	        codes_str: Text containing ``<|speech_N|>`` tokens, where ``N`` is a
	            decimal code. Anything that is not such a token is ignored.
	        codec: Object exposing ``decode_code(codes)``. It is called with an
	            int32 array of shape ``[1, 1, num_tokens]``.

	    Returns:
	        ``recon[0, 0, :]`` of the array returned by ``codec.decode_code``.

	    Raises:
	        ValueError: If ``codes_str`` contains no speech token.
	    """
	    # The pipes must be escaped: a bare "|" is regex alternation, which made
	    # the previous pattern match "<\", "speech_N\" or ">" separately and feed
	    # empty captures to int(). An optional backslash before each pipe is
	    # tolerated so tokens written as "<\|speech_N\|>" are still recognised.
	    token_pattern = r"<\\?\|speech_(\d+)\\?\|>"
	    speech_ids = [int(num) for num in re.findall(token_pattern, codes_str)]

	    if not speech_ids:
	        raise ValueError("No valid speech tokens found")

	    codes = np.array(speech_ids, dtype=np.int32)[np.newaxis, np.newaxis, :]
	    recon = codec.decode_code(codes)
	    return recon[0, 0, :]

	# --- MODEL LOADING ---
	print("📦 Đang tải model Q4 GGUF...")

	# Stays False unless every loading step below completes.
	model_loaded = False
	try:
	    from neucodec import NeuCodecOnnxDecoder

	    # GGUF backbone, loaded through llama-cpp-python.
	    backbone = Llama.from_pretrained(
	        repo_id=BACKBONE_REPO,
	        filename="*.gguf",
	        n_ctx=2048,
	        n_gpu_layers=-1,  # use the GPU if one is available
	        flash_attn=True,
	        mlock=True,
	        verbose=False,
	    )

	    # Codec used to turn speech codes back into audio.
	    codec = NeuCodecOnnxDecoder.from_pretrained(CODEC_REPO)

	    print("✅ Model đã tải thành công!")
	    model_loaded = True
	except Exception as load_error:
	    # Loading problems are reported on stdout; the app still starts, with
	    # model_loaded left False.
	    print(f"❌ Lỗi khi tải model: {load_error}")

	# --- SYNTHESIS FUNCTION ---
	def synthesize_speech(text, voice_choice):
	    """Synthesize ``text`` in the selected reference voice, streaming progress.

	    This generator is wired to the Gradio button as its event handler. Every
	    item it yields is an ``(audio_path, status_message)`` pair: ``audio_path``
	    is ``None`` for progress and error updates and holds the path of the
	    generated WAV file only in the final, successful update.

	    Args:
	        text: Text to synthesize. Blank input is rejected with a status message.
	        voice_choice: Key of ``VOICE_SAMPLES`` selecting the reference voice.

	    Yields:
	        ``(audio_path_or_None, status_message)`` tuples.

	    Failures are reported through the status message instead of being raised.
	    """
	    if not model_loaded:
	        yield None, "⚠️ Model chưa tải. Vui lòng khởi động lại!"
	        return

	    if not text or not text.strip():
	        yield None, "⚠️ Vui lòng nhập văn bản!"
	        return

	    if voice_choice not in VOICE_SAMPLES:
	        yield None, "⚠️ Vui lòng chọn giọng mẫu."
	        return

	    raw_text = text.strip()

	    # Locate the reference recording and its transcript.
	    ref_audio_path = VOICE_SAMPLES[voice_choice]["audio"]
	    ref_text_path = VOICE_SAMPLES[voice_choice]["text"]

	    if not os.path.exists(ref_audio_path):
	        yield None, "❌ Không tìm thấy file audio mẫu."
	        return

	    # A missing or unreadable transcript is reported like any other reference
	    # problem instead of escaping from the generator.
	    try:
	        with open(ref_text_path, "r", encoding="utf-8") as f:
	            ref_text_raw = f.read()
	    except OSError as e:
	        yield None, f"❌ Lỗi xử lý reference: {e}"
	        return

	    yield None, "📄 Đang xử lý Reference..."

	    # Encode the reference recording into codec codes.
	    # NOTE(review): `codec` is loaded as NeuCodecOnnxDecoder, which by its name
	    # looks decoder-only - confirm it implements encode_code(); otherwise the
	    # precomputed "codes" files listed in VOICE_SAMPLES are the likely source.
	    try:
	        ref_codes = encode_reference(ref_audio_path, codec)
	        if hasattr(ref_codes, 'numpy'):
	            ref_codes = ref_codes.numpy()
	    except Exception as e:
	        yield None, f"❌ Lỗi xử lý reference: {e}"
	        return

	    # Split the input; empty chunks carry nothing to synthesize.
	    text_chunks = [
	        chunk
	        for chunk in split_text_into_chunks(raw_text, max_chars=MAX_CHARS_PER_CHUNK)
	        if chunk
	    ]
	    total_chunks = len(text_chunks)

	    yield None, f"🚀 Bắt đầu tổng hợp ({total_chunks} đoạn)..."

	    all_audio_segments = []
	    # 0.15 s of silence inserted between consecutive chunks.
	    silence_pad = np.zeros(int(SAMPLE_RATE * 0.15), dtype=np.float32)

	    start_time = time.time()

	    try:
	        # The reference half of the prompt is identical for every chunk, so it
	        # is built once. Special tokens are spelled "<|...|>" with no backslash:
	        # "\|" is not a Python escape and would put a literal backslash into
	        # the prompt and the stop string.
	        ref_text_phoneme = phonemize_with_dict(ref_text_raw)
	        codes_str = "".join(f"<|speech_{idx}|>" for idx in ref_codes)

	        for i, chunk in enumerate(text_chunks):
	            yield None, f"⏳ Đang xử lý đoạn {i+1}/{total_chunks}..."

	            input_text_phoneme = phonemize_with_dict(chunk)

	            prompt = (
	                f"user: Convert the text to speech:<|TEXT_PROMPT_START|>{ref_text_phoneme} {input_text_phoneme}"
	                f"<|TEXT_PROMPT_END|>\nassistant:<|SPEECH_GENERATION_START|>{codes_str}"
	            )

	            output = backbone(
	                prompt,
	                max_tokens=2048,
	                temperature=1.0,
	                top_k=50,
	                stop=["<|SPEECH_GENERATION_END|>"],
	            )
	            output_str = output["choices"][0]["text"]

	            # NOTE(review): the reference codes are decoded together with the
	            # generated ones, so each chunk's audio presumably starts with the
	            # reference speech - confirm this is intended.
	            chunk_wav = decode_audio(codes_str + output_str, codec)

	            if chunk_wav is not None and len(chunk_wav) > 0:
	                all_audio_segments.append(chunk_wav)
	                if i < total_chunks - 1:
	                    all_audio_segments.append(silence_pad)

	        if not all_audio_segments:
	            yield None, "❌ Không sinh được audio nào."
	            return

	        yield None, "💾 Đang ghép file và lưu..."

	        final_wav = np.concatenate(all_audio_segments)
	        # Reserve a persistent temp path, then write once the handle is closed
	        # so the file is never open twice at the same time.
	        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
	            output_path = tmp.name
	        sf.write(output_path, final_wav, SAMPLE_RATE)

	        process_time = time.time() - start_time
	        yield output_path, f"✅ Hoàn tất! (Tổng thời gian: {process_time:.2f}s)"

	    except Exception as e:
	        # Boundary of the UI handler: log the traceback and show the message.
	        import traceback
	        traceback.print_exc()
	        yield None, f"❌ Lỗi: {str(e)}"

	# --- UI SETUP ---
	# Soft Gradio theme with an indigo/cyan palette; Inter is the preferred font,
	# followed by generic sans-serif fallbacks.
	theme = gr.themes.Soft(
	    font=[gr.themes.GoogleFont('Inter'), 'ui-sans-serif', 'system-ui'],
	    secondary_hue="cyan",
	    primary_hue="indigo",
	)

	# Styles for the page banner (.header-box) and its gradient title (.gradient-text).
	css = """
	.header-box {
	text-align: center;
	margin-bottom: 25px;
	padding: 25px;
	background: linear-gradient(135deg, #0f172a 0%, #1e293b 100%);
	border-radius: 12px;
	color: white;
	}
	.gradient-text {
	background: -webkit-linear-gradient(45deg, #60A5FA, #22D3EE);
	-webkit-background-clip: text;
	-webkit-text-fill-color: transparent;
	font-size: 2.5rem;
	font-weight: 800;
	}
	"""

	# Status line shown on the page, reflecting whether model loading succeeded.
	if model_loaded:
	    initial_status = "✅ Model Q4 GGUF đã sẵn sàng!"
	else:
	    initial_status = "❌ Lỗi khi tải model"

	with gr.Blocks(title="VieNeu-TTS") as demo:
	    # Page banner followed by the model status line.
	    gr.HTML("""
	<div class="header-box">
	<h1 class="gradient-text">🦜 VieNeu-TTS Studio</h1>
	<p>Chạy với Q4 GGUF + ONNX Codec</p>
	</div>
	""")

	    gr.Markdown(initial_status)

	    with gr.Row():
	        # Left column: text input, voice selection and the trigger button.
	        with gr.Column(scale=3):
	            text_input = gr.Textbox(
	                value="Hà Nội, trái tim của Việt Nam, là một thành phố ngàn năm văn hiến với bề dày lịch sử và văn hóa độc đáo.",
	                lines=6,
	                label=f"Văn bản (tối đa {MAX_CHARS_PER_CHUNK} ký tự/chunk)",
	            )

	            voice_select = gr.Dropdown(
	                label="Chọn giọng mẫu",
	                choices=list(VOICE_SAMPLES),
	                value=list(VOICE_SAMPLES)[0],
	            )

	            # The button is only clickable when the models were loaded.
	            btn_generate = gr.Button(
	                "🎵 Bắt đầu tổng hợp",
	                size="lg",
	                variant="primary",
	                interactive=model_loaded,
	            )

	        # Right column: generated audio and the running status text.
	        with gr.Column(scale=2):
	            audio_output = gr.Audio(label="Kết quả", autoplay=True, type="filepath")
	            status_output = gr.Textbox(label="Trạng thái")

	    # Each (audio, status) pair yielded by the handler updates both outputs.
	    btn_generate.click(
	        fn=synthesize_speech,
	        outputs=[audio_output, status_output],
	        inputs=[text_input, voice_select],
	    )

	if __name__ == "__main__":
	    # Turn on the request queue, then serve on all interfaces at port 7860
	    # with the theme and stylesheet defined above.
	    queued_app = demo.queue()
	    queued_app.launch(
	        css=css,
	        theme=theme,
	        server_port=7860,
	        server_name="0.0.0.0",
	    )