import spaces
from pathlib import Path
import os
import sys
# Add current directory to Python path for HuggingFace Spaces
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from chatterbox.tts import ChatterboxTTS
import torchaudio
import torch
import random
import re
import numpy as np
import gradio as gr
import tempfile
import chatterbox_dhivehi
import warnings
warnings.filterwarnings("ignore")
chatterbox_dhivehi.extend_dhivehi()
# Global variables
MODEL = None
SAMPLE_RATE = 24000  # output sample rate of the Chatterbox vocoder (Hz)
_target = Path.home() / ".chatterbox-tts-dhivehi"
def download_model():
"""Download model files from HuggingFace if not already present"""
try:
from huggingface_hub import snapshot_download
print("=" * 60)
print("Checking model files...")
print(f"Target directory: {_target}")
if not (_target.exists() and any(_target.rglob("*"))):
print("Model files not found. Starting download...")
print("This may take a few minutes on first run.")
print("=" * 60)
            snapshot_download(
                repo_id="alakxender/chatterbox-tts-dhivehi",
                local_dir=str(_target),
                local_dir_use_symlinks=False,
                # force_download is omitted: it conflicts with resuming and would
                # re-fetch files the existence check above already found missing
                resume_download=True,
                allow_patterns=["*.safetensors", "*.json", "*.pt"]
            )
print("=" * 60)
print("Model files downloaded successfully!")
print("=" * 60)
else:
print("Model files already present.")
print("=" * 60)
except Exception as e:
print("=" * 60)
print(f"Warning: Could not download model files: {e}")
print("=" * 60)
def load_model(checkpoint="kn_cbox", device="cuda"):
"""Load the TTS model"""
global MODEL
try:
checkpoint_path = f"{_target}/{checkpoint}"
print(f"Loading model with checkpoint: {checkpoint_path}")
print(f"Target device: {device}")
MODEL = ChatterboxTTS.from_dhivehi(
ckpt_dir=Path(checkpoint_path),
device=device
)
print(f"Model loaded successfully on {device}!")
except Exception as e:
print(f"Error loading model: {e}")
        raise
def set_seed(seed: int):
"""Set random seed for reproducibility"""
torch.manual_seed(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
random.seed(seed)
np.random.seed(seed)
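# Illustrative check of set_seed (not executed here): identical seeds give
# repeatable sampling, e.g.
#   set_seed(42); a = torch.rand(3)
#   set_seed(42); b = torch.rand(3)
#   torch.equal(a, b)  # -> True
# Fully deterministic GPU runs may additionally need
# torch.backends.cudnn.deterministic = True, which this app does not set.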
# Internal implementation without decorator
def _generate_speech_impl(text,
reference_audio,
exaggeration=0.5,
temperature=0.1,
cfg_weight=0.5,
seed=42):
"""Internal implementation of generate speech"""
global MODEL
# Clean the input text
text = clean_text(text)
if not text:
return None, "Please enter some text to generate speech."
if MODEL is None:
return None, "Model not loaded. Please check your model paths."
try:
# Set seed for reproducibility
set_seed(seed)
# Handle reference audio - validate it exists
audio_prompt_path = None
if reference_audio and isinstance(reference_audio, str) and reference_audio.strip():
# Check if file actually exists
if os.path.exists(reference_audio):
audio_prompt_path = reference_audio
print(f"Using reference audio: {audio_prompt_path}")
else:
print(f"Reference audio path not found, ignoring: {reference_audio}")
if not audio_prompt_path:
print("Generating without reference audio")
print(f"Generating audio for: {text[:50]}...")
# Generate audio - handle optional reference audio
if audio_prompt_path:
audio = MODEL.generate(
text=text,
audio_prompt_path=audio_prompt_path,
exaggeration=exaggeration,
temperature=temperature,
cfg_weight=cfg_weight,
)
else:
# Try without reference audio
try:
audio = MODEL.generate(
text=text,
exaggeration=exaggeration,
temperature=temperature,
cfg_weight=cfg_weight,
)
except TypeError:
# If the model requires audio_prompt_path, try with empty string
audio = MODEL.generate(
text=text,
audio_prompt_path="",
exaggeration=exaggeration,
temperature=temperature,
cfg_weight=cfg_weight,
)
# Save to temporary file
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
output_path = tmp_file.name
            torchaudio.save(output_path, audio, SAMPLE_RATE)
        return output_path, f"Successfully generated speech! Audio length: {audio.shape[1]/SAMPLE_RATE:.2f} seconds"
except Exception as e:
error_msg = f"Error generating speech: {str(e)}"
print(error_msg)
return None, error_msg
# GPU version with decorator
@spaces.GPU
def _generate_speech_gpu(text, reference_audio, exaggeration=0.5, temperature=0.1, cfg_weight=0.5, seed=42):
"""GPU version of generate speech"""
return _generate_speech_impl(text, reference_audio, exaggeration, temperature, cfg_weight, seed)
# CPU version without decorator
def _generate_speech_cpu(text, reference_audio, exaggeration=0.5, temperature=0.1, cfg_weight=0.5, seed=42):
"""CPU version of generate speech"""
return _generate_speech_impl(text, reference_audio, exaggeration, temperature, cfg_weight, seed)
# Router function
def generate_speech(text, reference_audio, exaggeration=0.5, temperature=0.1, cfg_weight=0.5, seed=42, use_gpu=True):
"""Generate speech from text using voice cloning"""
if use_gpu:
return _generate_speech_gpu(text, reference_audio, exaggeration, temperature, cfg_weight, seed)
else:
return _generate_speech_cpu(text, reference_audio, exaggeration, temperature, cfg_weight, seed)
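# Illustrative usage of the router (assumes the model is loaded and the
# reference file exists): returns (wav_path_or_None, status_message), e.g.
#   path, status = generate_speech("dhivehi text here", "m2.wav", seed=42)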
def clean_text(text):
    """Collapse all whitespace runs (spaces, tabs, newlines) into single spaces and strip the ends"""
    # Replace any run of whitespace with a single space
    text = re.sub(r'\s+', ' ', text)
    # Strip leading and trailing spaces
    return text.strip()
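# Example of clean_text (illustrative):
#   clean_text("  line one\n\nline two  ")  ->  "line one line two"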
def split_sentences(text):
    """Split text on periods, then merge the pieces into chunks of at least 150 characters (the final chunk may be shorter)"""
# Clean the input text first
text = clean_text(text)
# First, split by periods normally
initial_sentences = []
current_sentence = ""
for char in text:
current_sentence += char
if char == '.':
# Add sentence if it's not empty after stripping spaces from both sides
stripped_sentence = current_sentence.strip()
if stripped_sentence:
initial_sentences.append(stripped_sentence)
current_sentence = ""
# Add remaining text if any (without period), stripped of spaces from both sides
stripped_remaining = current_sentence.strip()
if stripped_remaining:
initial_sentences.append(stripped_remaining)
# If we only have one sentence, return it
if len(initial_sentences) <= 1:
return initial_sentences
# Now combine sentences until each is at least 150 characters
final_sentences = []
combined_sentence = ""
for sentence in initial_sentences:
if combined_sentence:
combined_sentence += " " + sentence
else:
combined_sentence = sentence
# If combined sentence is >= 150 chars, add it to final list
if len(combined_sentence) >= 150:
final_sentences.append(combined_sentence.strip())
combined_sentence = ""
# Add any remaining combined sentence (even if < 150 chars)
if combined_sentence.strip():
final_sentences.append(combined_sentence.strip())
return final_sentences
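# Example of split_sentences (illustrative): short pieces are merged until a
# chunk reaches 150 characters, and a short trailing chunk is kept as-is:
#   split_sentences("ab. cd. ef")  ->  ["ab. cd. ef"]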
# Internal implementation without decorator
def _generate_speech_multi_sentence_impl(text,
reference_audio,
exaggeration=0.5,
temperature=0.1,
cfg_weight=0.5,
seed=42,
use_gpu=True):
"""Internal implementation of multi-sentence speech generation"""
global MODEL
# Clean the input text
text = clean_text(text)
if not text:
yield None, "Please enter some text to generate speech."
return
if MODEL is None:
yield None, "Model not loaded. Please check your model paths."
return
# Split text into sentences
sentences = split_sentences(text)
# If only one sentence or no periods, use regular method
if len(sentences) <= 1:
yield None, "Generating single sentence..."
result_audio, result_status = generate_speech(text, reference_audio, exaggeration, temperature, cfg_weight, seed, use_gpu)
yield result_audio, result_status
return
try:
# Set seed for reproducibility
set_seed(seed)
# Handle reference audio - validate it exists
audio_prompt_path = None
if reference_audio and isinstance(reference_audio, str) and reference_audio.strip():
# Check if file actually exists
if os.path.exists(reference_audio):
audio_prompt_path = reference_audio
print(f"Using reference audio: {audio_prompt_path}")
else:
print(f"Reference audio path not found, ignoring: {reference_audio}")
if not audio_prompt_path:
print("Generating without reference audio")
yield None, f"Starting generation for {len(sentences)} sentences..."
print(f"Processing {len(sentences)} sentences...")
all_audio_segments = []
total_duration = 0
for i, sentence in enumerate(sentences):
# Calculate progress percentage
progress_percent = int((i / len(sentences)) * 90) # Reserve last 10% for combining
yield None, f"Generating sentence {i+1}/{len(sentences)} ({progress_percent}%): {sentence[:50]}..."
print(f"Generating audio for sentence {i+1}/{len(sentences)}: {sentence[:50]}...")
# Generate audio for this sentence
try:
if audio_prompt_path:
audio = MODEL.generate(
text=sentence,
audio_prompt_path=audio_prompt_path,
exaggeration=exaggeration,
temperature=temperature,
cfg_weight=cfg_weight,
)
else:
# Try without reference audio
try:
audio = MODEL.generate(
text=sentence,
exaggeration=exaggeration,
temperature=temperature,
cfg_weight=cfg_weight,
)
except TypeError:
# If the model requires audio_prompt_path, try with empty string
audio = MODEL.generate(
text=sentence,
audio_prompt_path="",
exaggeration=exaggeration,
temperature=temperature,
cfg_weight=cfg_weight,
)
except Exception as model_error:
# If the model fails due to missing reference audio, try with default behavior
if "reference_voice.wav not found" in str(model_error) or "No reference audio provided" in str(model_error):
print("Attempting generation without reference audio...")
# Try different approaches for models that don't support None reference audio
try:
# Some models might accept an empty string
audio = MODEL.generate(
text=sentence,
audio_prompt_path="",
exaggeration=exaggeration,
temperature=temperature,
cfg_weight=cfg_weight,
)
                    except Exception:
# If that fails, try without the audio_prompt_path parameter entirely
audio = MODEL.generate(
text=sentence,
exaggeration=exaggeration,
temperature=temperature,
cfg_weight=cfg_weight,
)
else:
                    raise
all_audio_segments.append(audio)
            total_duration += audio.shape[1] / SAMPLE_RATE
# Concatenate all audio segments
yield None, "Combining audio segments (95%)..."
print("Combining audio segments...")
combined_audio = torch.cat(all_audio_segments, dim=1)
# Save to temporary file
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
output_path = tmp_file.name
            torchaudio.save(output_path, combined_audio, SAMPLE_RATE)
print("Multi-sentence processing complete!")
yield output_path, f"Successfully generated speech from {len(sentences)} sentences! Total audio length: {total_duration:.2f} seconds"
except Exception as e:
error_msg = f"Error generating multi-sentence speech: {str(e)}"
print(error_msg)
yield None, error_msg
# GPU version with decorator
@spaces.GPU
def _generate_speech_multi_sentence_gpu(text, reference_audio, exaggeration=0.5, temperature=0.1, cfg_weight=0.5, seed=42):
"""GPU version of multi-sentence speech generation"""
for result in _generate_speech_multi_sentence_impl(text, reference_audio, exaggeration, temperature, cfg_weight, seed, use_gpu=True):
yield result
# CPU version without decorator
def _generate_speech_multi_sentence_cpu(text, reference_audio, exaggeration=0.5, temperature=0.1, cfg_weight=0.5, seed=42):
"""CPU version of multi-sentence speech generation"""
for result in _generate_speech_multi_sentence_impl(text, reference_audio, exaggeration, temperature, cfg_weight, seed, use_gpu=False):
yield result
# Router function
def generate_speech_multi_sentence(text, reference_audio, exaggeration=0.5, temperature=0.1, cfg_weight=0.5, seed=42, use_gpu=True):
"""Generate speech from text with multi-sentence support and progress tracking"""
if use_gpu:
for result in _generate_speech_multi_sentence_gpu(text, reference_audio, exaggeration, temperature, cfg_weight, seed):
yield result
else:
for result in _generate_speech_multi_sentence_cpu(text, reference_audio, exaggeration, temperature, cfg_weight, seed):
yield result
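# Illustrative usage: the router is a generator, so iterate to receive the
# progress messages; the final yield carries the output wav path (or None):
#   for path, status in generate_speech_multi_sentence(text, "m2.wav"):
#       print(status)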
def create_interface():
"""Create the Gradio interface"""
# Sample texts in Dhivehi
sample_texts = [
"ކާޑު ނުލައި ފައިސާ ދެއްކޭ ނެޝަނަލް ކިއުއާރް ކޯޑް އެމްއެމްއޭ އިން ތައާރަފްކުރަނީ",
"""ފުޓްބޯޅަ ސްކޫލްގެ ބިމާއި ގުދަންބަރި ބިމުގައި އިމާރާތް ކުރުމުގެ މަސައްކަތް ހުއްޓާލަން އަންގައިފި...
Construction work on football school land and warehouse land has been ordered to stop""",
"ސިވިލް ސާވިސްގެ ހިދުމަތުގެ މުއްދަތު ގުނުމުގައި ކުންފުނިތަކާއި އިދާރާތަކަށް ހިދުމަތްކުރި މުއްދަތު ހިމަނަނީ",
"""އެ ރަށުގެ ބިން ހިއްކުމާއި ބަނދަރުގެ ނެރު ބަދަލުކުރުމާއި ގޮނޑުދޮށް ހިމާޔަތް ކުރުމުގެ މަސައްކަތް އެމްޓީސީސީއާ މިނިސްޓްރީން ހަވާލުކުރީ މިދިޔަ މަހު ރައީސް އެ ރަށަށް ކުރެއްވި ދަތުރުފުޅުގައި.
The ministry handed over the land reclamation, replacement of the port canal and beach protection to MTCC during the President's visit to the village last month"""
]
with gr.Blocks(title="ChatterboxTTS - Dhivehi Text-to-Speech", css="""
.textbox1 textarea {
font-size: 18px !important;
font-family: 'MV_Faseyha', 'Faruma', 'A_Faruma' !important;
line-height: 1.8 !important;
direction: rtl !important;
text-align: right !important;
}
""") as app:
gr.Markdown("# 🎤 ChatterboxTTS - Dhivehi Text-to-Speech with Voice Cloning")
gr.Markdown("Generate natural-sounding Dhivehi speech with voice cloning capabilities.")
# Row 1: Text input and Reference audio
with gr.Row():
text_input = gr.Textbox(
label="Text to Convert",
placeholder="Enter Dhivehi text here...",
lines=6,
value="""އައްޑޫގައި ވަކިވަކި ކައުންސިލްތައް ހަދަން ފެނޭތޯ ބަލަން ނަގާ ތާރީހީ، ފެންނަ ނުފެންނަ ވޯޓާ ގުޅޭ ބަހުސެއް މާދަމާ ރޭ "މިހާރު" އިން ބާއްވަން ނިންމައިފި.
އައްޑޫ ސިޓީ ކައުންސިލުގެ ދަށުން އިދާރީ ގޮތުން ހުޅުދު އާއި މީދޫ އަދި ފޭދޫ ވަކިކޮށް. އެ ތިން ރަށުގައި ވަކިވަކި ކައުންސިލުތައް ހެދުމަށް ފެނޭތޯ ބެލުމަށް ތިން ރަށުގެ ރައްޔިތުންގެ މެދުގައި ފެންނަ ނުފެންނަ ވޯޓެއް ނަގަނީ އަންނަ ހޮނިހިރު ދުވަހު.
ރައްޔިތުންގެ ހިޔާލު ހޯދުމުގެ އާންމު ވޯޓު ނެގުމުގެ ގާނޫނުގެ ދަށުން ނަގާ ފުރަތަމަ ވޯޓާ ގުޅޭގޮތުން "މިހާރު" އިން ބަހުސެއް ބާއްވަން ނިންމާފައިވާއިރު. އެ ބަހުސްގައި ބައިވެރިވެވަޑައިގަންނަވާނީ އައްޑޫގެ އެކި ދާއިރާތަކުގައި ތަޖުރިބާކާރު ބޭފުޅުން.
""",
rtl=True,
elem_classes=["textbox1"]
)
reference_audio = gr.Audio(
label="Reference Voice Audio (optional - for voice cloning)",
type="filepath",
sources=["upload", "microphone"],
value="m2.wav"
)
# Row 2: Example buttons
gr.Markdown("**Quick Examples:**")
with gr.Row():
sample_btn1 = gr.Button("Sample 1", size="sm")
sample_btn2 = gr.Button("Sample 2", size="sm")
sample_btn3 = gr.Button("Sample 3", size="sm")
sample_btn4 = gr.Button("Sample 4", size="sm")
# Row 2b: Reference Audio buttons
gr.Markdown("**Reference Audio:**")
with gr.Row():
ref_btn1 = gr.Button("Female 1 (f1.wav)", size="sm")
ref_btn2 = gr.Button("Female 2 (f2.wav)", size="sm")
ref_btn3 = gr.Button("Male 1 (m1.wav)", size="sm")
ref_btn4 = gr.Button("Male 2 (m2.wav)", size="sm")
# Row 3: Advanced settings
with gr.Accordion("Advanced Settings", open=False):
with gr.Row():
exaggeration = gr.Slider(
minimum=0.0,
maximum=5.0,
value=0.5,
step=0.1,
label="Exaggeration",
info="Controls expressiveness"
)
temperature = gr.Slider(
minimum=0.01,
maximum=1.0,
value=0.8,
step=0.01,
label="Temperature",
info="Controls randomness"
)
cfg_weight = gr.Slider(
minimum=0.0,
maximum=5.0,
value=0.5,
step=0.1,
label="CFG Weight",
info="Classifier-free guidance weight"
)
seed = gr.Slider(
minimum=0,
maximum=9999,
value=42,
step=1,
label="Seed",
info="For reproducible results"
)
with gr.Row():
model_select = gr.Dropdown(
                    #choices=["kn_cbox", "f01_cbox"],  # re-enable once the correct f01 checkpoint is uploaded
choices=["kn_cbox"],
value="kn_cbox",
label="Model",
info="Select TTS model"
)
device_select = gr.Dropdown(
choices=["GPU", "CPU"],
value="GPU",
label="Device",
info="Select computation device"
)
reload_btn = gr.Button("🔄 Reload Model", size="sm")
reload_status = gr.Textbox(label="Model Status", value="✅ Model 'kn_cbox' loaded on GPU", interactive=False)
gr.Markdown("**Note:** This fine-tune is minimal, so some words may drop or sentences might not complete perfectly. You can experiment with the Advanced Settings to find what works best for your reference audio and to reduce any output issues. This Space uses ZeroGPU for processing, so if your text is long, the GPU might be released before completion, which could cause a timeout. For longer inputs, switch to CPU mode from the Advanced Settings and wait for it to finish. It will run a bit slower, but it should still complete reliably.")
# Row 4: Generate button
generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
# Row 5: Output section
with gr.Row():
with gr.Column():
output_audio = gr.Audio(label="Generated Speech", type="filepath")
status_message = gr.Textbox(label="Status", interactive=False)
# Event handlers
# Default values for advanced settings
DEFAULT_EXAGGERATION = 0.5
DEFAULT_TEMPERATURE = 0.8
DEFAULT_CFG_WEIGHT = 0.5
DEFAULT_SEED = 42
def set_sample_text(sample_idx):
return sample_texts[sample_idx], DEFAULT_EXAGGERATION, DEFAULT_TEMPERATURE, DEFAULT_CFG_WEIGHT, DEFAULT_SEED
def set_reference_audio(audio_file):
return audio_file, DEFAULT_EXAGGERATION, DEFAULT_TEMPERATURE, DEFAULT_CFG_WEIGHT, DEFAULT_SEED
def reload_model_handler(model_name, device_name):
"""Reload model with selected checkpoint and device"""
try:
device = "cuda" if device_name == "GPU" else "cpu"
load_model(checkpoint=model_name, device=device)
return f"✅ Model '{model_name}' loaded successfully on {device_name}!"
except Exception as e:
return f"❌ Error loading model: {str(e)}"
sample_btn1.click(lambda: set_sample_text(0), outputs=[text_input, exaggeration, temperature, cfg_weight, seed])
sample_btn2.click(lambda: set_sample_text(1), outputs=[text_input, exaggeration, temperature, cfg_weight, seed])
sample_btn3.click(lambda: set_sample_text(2), outputs=[text_input, exaggeration, temperature, cfg_weight, seed])
sample_btn4.click(lambda: set_sample_text(3), outputs=[text_input, exaggeration, temperature, cfg_weight, seed])
ref_btn1.click(lambda: set_reference_audio("f1.wav"), outputs=[reference_audio, exaggeration, temperature, cfg_weight, seed])
ref_btn2.click(lambda: set_reference_audio("f2.wav"), outputs=[reference_audio, exaggeration, temperature, cfg_weight, seed])
ref_btn3.click(lambda: set_reference_audio("m1.wav"), outputs=[reference_audio, exaggeration, temperature, cfg_weight, seed])
ref_btn4.click(lambda: set_reference_audio("m2.wav"), outputs=[reference_audio, exaggeration, temperature, cfg_weight, seed])
reload_btn.click(
fn=reload_model_handler,
inputs=[model_select, device_select],
outputs=[reload_status]
)
def generate_with_progress(text, reference_audio, exaggeration, temperature, cfg_weight, seed, device_name):
"""Generate speech with streaming progress updates"""
use_gpu = (device_name == "GPU")
# Use the streaming generator
for result_audio, result_status in generate_speech_multi_sentence(
text, reference_audio, exaggeration, temperature, cfg_weight, seed, use_gpu
):
yield result_audio, result_status
generate_btn.click(
fn=generate_with_progress,
inputs=[text_input, reference_audio, exaggeration, temperature, cfg_weight, seed, device_select],
outputs=[output_audio, status_message]
)
# Parameter Examples Section
gr.Markdown("### Examples")
gr.Markdown("Click any example below to load pre-configured settings:")
gr.Examples(
examples=[
# [text, reference_audio, exaggeration, temperature, cfg_weight, seed, device]
["""އެއް ދުވަހަކު ސަލާންޖަހާ މީހަކު ޖުހާގެ ގޭގެ ދޮރުމައްޗަށް އަރާ ސަލާން ގޮވާލައިފިއެވެ.
ސަލާންޖަހާ މީހާ އައިސް ސަލާން ގޮވާލި އިރު ޖުހާ އުޅެނީ ގޭގެ މަތީ ބުރީގައެވެ.
“ކާކު؟ ކީއްކުރަން؟” ޖުހާ ގޭތެރެއިން ގޮވާލައިފިއެވެ.
“އައިސްފާނަންތަ ތިރިއަށް؟” ސަލާންޖަހާ މީހާ ބުންޏެވެ.
އޭނާ އެހެން ބުނުމުން ޖުހާ ތިރިއަށް ގޮސް ސަލާން ޖަހާ މީހާ އާ ބައްދަލު ކޮށްފިއެވެ.
“ކިހިނެއްވީ؟” ސަލާންޖަހާ މީހާ ކުރެން ޖުހާ އަހައިފިއެވެ.
“އަހަންނަކީ ވަރަށް ބޮޑު ނިކަމެއްޗެއް، ސަދަގާތެއްގެ ގޮތުން އަހަންނަށް އެހީއެއް ދޭތޯ!” ސަލާންޖަހާ މީހާ ބުންޏެވެ.
“އާދޭ އެތެރެއަށް.” ޖުހާ ބުންޏެވެ.
ޖުހާ އެހެން ބުނުމުން ސަލާންޖަހާ މީހާ ގޭތެރެއަށް ވަދެއްޖެއެވެ. ގޭތެރެއަށް ވަނުމުން މަށާއެކީ އަންނާށޭ ކިޔާ ޖުހާ ގޭގެ ސިޑިން މައްޗަށް އަރަން ފަށައިފިއެވެ. ސަލާން ޖަހާ މީހާ ވެސް ޖުހާގެ ފަހަތުން މައްޗަށް ދެއެވެ. މި ހެން ގޮސް އެމީހާ ގޮވައިގެން ގޮސް ގޭގެ ފުރާޅު މައްޗަށް އަރައިފިއެވެ.
ފުރާޅު މައްޗަށް އެރުމާއެކު ޖުހާ ބުނެފިއެވެ. “މަގޭ އަތަކު ދޭނެ އެއްޗެއް ނެތް.”
ސަލާންޖާހާމީހާ މިހާ ހިސާބަށް މައްޗަށް އެރުވުމަށް ފަހު ދޭނެ އެއްޗެއް ނެތޭ ޖުހާ ބުނުމުން އޭނާ ވަރަށް ހިތްހަމަ ނުޖެހިއްޖެއެވެ. “ކީއްވެ ތިހެން ތިހެދީ؟ އަހަރެން ދޮރުމަތީގައި ހުއްޓާވެސް ތިޔަހެން ބުނެލެވުނީހެއްނު! ކީއްކުރަން މިހާ ހިސާބަށް އަރުވާފައި ތިހެން ތިބުނީ؟”
“އެހެން ވިއްޔާ ކީއްވެ’ އަހަރެން ތިރިއަށް ނުބާލާ، ތިޔަ ހޯދަން އުޅުނު އެހީއެއްގެ ވާހަކަ ނޭހީ؟ އެހެން ނަމަ މަށަށްވެސް އެއްޗެއް ނެތޭ ބުނެ ތިރިޔަށް ނުފައިބާ ފަރުޖެއްސުނީހެއްނު” ޖުހާ ސަލާންޖަހާ މީހާ އަށް ޖަވާބު ދިނެވެ.""", "m2.wav", 0.5, 0.8, 0.5, 42, "GPU"],
["""ގެދޮރުވެރިޔާ މަޝްރޫއުގެ ދަށުން ހުޅުމާލޭގައި ފަހި ދިރިއުޅުން ކޯޕަރޭޝަން އިން އިމާރާތްކޮށްފައިވާ ފްލެޓްތައް ހަވާލުކުރަން ފެށުމާ އެކު ބޮޑު އަގުގައި އެތަންތަން ކުއްޔަށް ދޭން އިޝްތިހާރު ޖަހަން ފަށައިފި""", "f1.wav", 0.5, 0.8, 0.5, 42, "GPU"],
["""ކަސްޓަމްސްގެ އިސް އޮފިޝަލަކު "މިހާރު" އަށް މިއަދު ވިދާޅުވި ގޮތުގައި، ކަސްޓަމްސް އިން ހިފެހެއްޓުމަށް ފަހު، އެ ދެ ކޮންޓެއިނަރު ހުޅުމާލެ ބަނދަރުގައި ބެހެއްޓީ އެ ބަނދަރު ބަލަހައްޓާ އެމްޕީއެލްގެ ހަވާލުގަ. އެމްޕީއެލްއާ އެ ހަވާލުކުރީ އޮންނަ އުސޫލުގެ ތެރެއިން، ލިޔެކިޔުންތައް ފުރިހަމަކޮށްފައި ކަމަށާއި އެ ސިނގިރެޓްތައް ނައްތާނުލައި ހުރީ ތަހުގީގު ނުނިމޭތީ ކަމަށް އޮފިޝަލް ވިދާޅުވި.
A senior customs official told Miharu today that the two containers were seized by customs and placed in the custody of MPL. The cigarettes were handed over to MPL after completing the documents and the investigation was not completed the official said.""", "f2.wav", 0.2, 0.35, 0.4, 42, "GPU"],
["""ޤައުމަށްޓަކާ ދީނަށްޓަކާ ކެރިގެން ޖިހާދު ކުރާނަމޭ.
ފައުޅާއި ސިއްރާ އެއްގޮތަށް ހުރިހާ ކަމެއް ގެންދާނަމޭ،
އަހުރެންގެ މޭ ޤައުމަށްޓަކައި އައްޑަނައަކަށް ދިއްކޮށްލުމީ.
ފަހުނޭވަޔަށް ދަންދެން މޮޓޯ ކަމުގަައި ޚިޔާރުކުރާނަމޭ،
އަންނާނެ ތީރެއް އުންޑައެއް ފެނިގެން އަމާޒުވެގެން މެއަށް.
ގަންނާ ކުރެއްވި ބިރުން ފިލަން ދާމީހަކަށް މަ ނުވާނަމޭ،
އެޅިފައިވި މަސްއޫލިއްޔަތެއް އުފުލަންދިމާވީމާ ދެނެއް.
ފެޅިގެން ދެފަޅިއަށް ދިޔަޔަކަސް އެއަކުން މަށެއް ނުރެކޭނަމޭ،
ކަމަކަށް ގޮވާލީމާ މިޤައުމުގެ ޢިއްޒަތާ އަބުރަށްޓަކައި.
އަމަށުން ހުރީވިއްޔާ އެތާ އެކަކަށް މަވެސް ހުންނާނަމޭ،
މިނިވަންކަމާ އެކުވެރިކަމާ ހަމަހަމަ ކަމަށް ތަރުހީބުދީ.
ހިނިތުންވެ ތިބެ ދީނީ އުޚުއްވަތް ފެތުރުމަށް މަ ގޮވާނަމޭ،
އަޚުނާއި އުޚުތުންނަށް އެދޭނީ ލާބަޔާ މަންފާތަކޭ.
ބަޚުތާއިމެދު ނުރުހުންވެގެން ޝަކުވާތަކެއް ނުކުރާނަމޭ،
އަނެކުންގެ ކުށްތައް ހޯދުމީ ނަފުސުގެ މަތިން ނައްތައި ހަނދާން.""", "m3.mp3", 0.5, 0.8, 0.5, 42, "GPU"],
["""ކަސްޓަމްސްގެ އިސް އޮފިޝަލަކު "މިހާރު" އަށް މިއަދު ވިދާޅުވި ގޮތުގައި، ކަސްޓަމްސް އިން ހިފެހެއްޓުމަށް ފަހު، އެ ދެ ކޮންޓެއިނަރު ހުޅުމާލެ ބަނދަރުގައި ބެހެއްޓީ އެ ބަނދަރު ބަލަހައްޓާ އެމްޕީއެލްގެ ހަވާލުގަ. އެމްޕީއެލްއާ އެ ހަވާލުކުރީ އޮންނަ އުސޫލުގެ ތެރެއިން، ލިޔެކިޔުންތައް ފުރިހަމަކޮށްފައި ކަމަށާއި އެ ސިނގިރެޓްތައް ނައްތާނުލައި ހުރީ ތަހުގީގު ނުނިމޭތީ ކަމަށް އޮފިޝަލް ވިދާޅުވި.
A senior customs official told Mihaaru today that the two containers were seized by customs and placed in the custody of MPL. The cigarettes were handed over to MPL after completing the documents and the investigation was not completed the official said.""", "m1.wav", 0.2, 0.35, 0.4, 42, "GPU"],
["""ގެދޮރުވެރިޔާ މަޝްރޫއުގެ ދަށުން ހުޅުމާލޭގައި ފަހި ދިރިއުޅުން ކޯޕަރޭޝަން އިން އިމާރާތްކޮށްފައިވާ ފްލެޓްތައް ހަވާލުކުރަން ފެށުމާ އެކު ބޮޑު އަގުގައި އެތަންތަން ކުއްޔަށް ދޭން އިޝްތިހާރު ޖަހަން ފަށައިފި""", "m2.wav", 0.5, 0.8, 0.5, 42, "GPU"],
],
inputs=[text_input, reference_audio, exaggeration, temperature, cfg_weight, seed, device_select],
outputs=[output_audio, status_message],
fn=generate_with_progress,
label="Preset Configurations",
examples_per_page=8,
cache_examples="lazy"
)
# Instructions
with gr.Accordion("Tips", open=False):
gr.Markdown("""
### General Use (TTS and Voice Agents):
- The default settings (exaggeration=0.5, cfg=0.5) work well for most prompts.
- If the reference speaker has a fast speaking style, lowering cfg to around 0.3 can improve pacing.
### Expressive or Dramatic Speech:
- Try lower cfg values (e.g. ~0.3) and increase exaggeration to around 0.7 or higher.
- Higher exaggeration tends to speed up speech; reducing cfg helps compensate with slower, more deliberate pacing.
### Language Transfer Notes:
- Ensure that the reference clip matches the specified language tag. Otherwise, language transfer outputs may inherit the accent of the reference clip's language.
- To mitigate this, set the CFG weight to 0.
### Additional Tips:
- For best voice cloning results, use clear audio with minimal background noise
- The reference audio should be 3-10 seconds long
- Use the same seed value for reproducible results
""")
return app
if __name__ == "__main__":
# Step 1: Download model files first
print("\nStarting ChatterboxTTS Dhivehi Application")
print("=" * 60)
download_model()
# Step 2: Load the default model with GPU
print("\nLoading default model...")
print("=" * 60)
try:
load_model(checkpoint="kn_cbox", device="cuda")
print("Default model loaded successfully!")
except Exception as e:
print(f"Warning: Could not load default model: {e}")
print("You can manually load the model using the 'Reload Model' button in the interface.")
print("=" * 60)
# Step 3: Create and launch the interface
print("\nCreating Gradio interface...")
app = create_interface()
# Step 4: Launch with public sharing and authentication if needed
print("Launching application...")
print("=" * 60)
app.launch()