import spaces
from pathlib import Path
import os
import sys
# Add current directory to Python path for HuggingFace Spaces
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from chatterbox.tts import ChatterboxTTS
import torchaudio
import torch
import random
import re
import numpy as np
import gradio as gr
import tempfile
import chatterbox_dhivehi
import warnings
warnings.filterwarnings("ignore")
chatterbox_dhivehi.extend_dhivehi()
# Global variables
MODEL = None
SAMPLE_RATE = 24000  # output sample rate of the Chatterbox vocoder (Hz)
_target = Path.home() / ".chatterbox-tts-dhivehi"
def download_model():
"""Download model files from HuggingFace if not already present"""
try:
from huggingface_hub import snapshot_download
print("=" * 60)
print("Checking model files...")
print(f"Target directory: {_target}")
if not (_target.exists() and any(_target.rglob("*"))):
print("Model files not found. Starting download...")
print("This may take a few minutes on first run.")
print("=" * 60)
            snapshot_download(
                repo_id="alakxender/chatterbox-tts-dhivehi",
                local_dir=str(_target),
                local_dir_use_symlinks=False,
                # force_download is omitted: it conflicts with resuming and would
                # re-fetch files the existence check above already found missing
                resume_download=True,
                allow_patterns=["*.safetensors", "*.json", "*.pt"]
            )
print("=" * 60)
print("Model files downloaded successfully!")
print("=" * 60)
else:
print("Model files already present.")
print("=" * 60)
except Exception as e:
print("=" * 60)
print(f"Warning: Could not download model files: {e}")
print("=" * 60)
def load_model(checkpoint="kn_cbox", device="cuda"):
"""Load the TTS model"""
global MODEL
try:
checkpoint_path = f"{_target}/{checkpoint}"
print(f"Loading model with checkpoint: {checkpoint_path}")
print(f"Target device: {device}")
MODEL = ChatterboxTTS.from_dhivehi(
ckpt_dir=Path(checkpoint_path),
device=device
)
print(f"Model loaded successfully on {device}!")
except Exception as e:
print(f"Error loading model: {e}")
        raise
def set_seed(seed: int):
"""Set random seed for reproducibility"""
torch.manual_seed(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
random.seed(seed)
np.random.seed(seed)
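# Illustrative check of set_seed (not executed here): identical seeds give
# repeatable sampling, e.g.
#   set_seed(42); a = torch.rand(3)
#   set_seed(42); b = torch.rand(3)
#   torch.equal(a, b)  # -> True
# Fully deterministic GPU runs may additionally need
# torch.backends.cudnn.deterministic = True, which this app does not set.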
# Internal implementation without decorator
def _generate_speech_impl(text,
reference_audio,
exaggeration=0.5,
temperature=0.1,
cfg_weight=0.5,
seed=42):
"""Internal implementation of generate speech"""
global MODEL
# Clean the input text
text = clean_text(text)
if not text:
return None, "Please enter some text to generate speech."
if MODEL is None:
return None, "Model not loaded. Please check your model paths."
try:
# Set seed for reproducibility
set_seed(seed)
# Handle reference audio - validate it exists
audio_prompt_path = None
if reference_audio and isinstance(reference_audio, str) and reference_audio.strip():
# Check if file actually exists
if os.path.exists(reference_audio):
audio_prompt_path = reference_audio
print(f"Using reference audio: {audio_prompt_path}")
else:
print(f"Reference audio path not found, ignoring: {reference_audio}")
if not audio_prompt_path:
print("Generating without reference audio")
print(f"Generating audio for: {text[:50]}...")
# Generate audio - handle optional reference audio
if audio_prompt_path:
audio = MODEL.generate(
text=text,
audio_prompt_path=audio_prompt_path,
exaggeration=exaggeration,
temperature=temperature,
cfg_weight=cfg_weight,
)
else:
# Try without reference audio
try:
audio = MODEL.generate(
text=text,
exaggeration=exaggeration,
temperature=temperature,
cfg_weight=cfg_weight,
)
except TypeError:
# If the model requires audio_prompt_path, try with empty string
audio = MODEL.generate(
text=text,
audio_prompt_path="",
exaggeration=exaggeration,
temperature=temperature,
cfg_weight=cfg_weight,
)
# Save to temporary file
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
output_path = tmp_file.name
            torchaudio.save(output_path, audio, SAMPLE_RATE)
        return output_path, f"Successfully generated speech! Audio length: {audio.shape[1]/SAMPLE_RATE:.2f} seconds"
except Exception as e:
error_msg = f"Error generating speech: {str(e)}"
print(error_msg)
return None, error_msg
# GPU version with decorator
@spaces.GPU
def _generate_speech_gpu(text, reference_audio, exaggeration=0.5, temperature=0.1, cfg_weight=0.5, seed=42):
"""GPU version of generate speech"""
return _generate_speech_impl(text, reference_audio, exaggeration, temperature, cfg_weight, seed)
# CPU version without decorator
def _generate_speech_cpu(text, reference_audio, exaggeration=0.5, temperature=0.1, cfg_weight=0.5, seed=42):
"""CPU version of generate speech"""
return _generate_speech_impl(text, reference_audio, exaggeration, temperature, cfg_weight, seed)
# Router function
def generate_speech(text, reference_audio, exaggeration=0.5, temperature=0.1, cfg_weight=0.5, seed=42, use_gpu=True):
"""Generate speech from text using voice cloning"""
if use_gpu:
return _generate_speech_gpu(text, reference_audio, exaggeration, temperature, cfg_weight, seed)
else:
return _generate_speech_cpu(text, reference_audio, exaggeration, temperature, cfg_weight, seed)
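# Illustrative usage of the router (assumes the model is loaded and the
# reference file exists): returns (wav_path_or_None, status_message), e.g.
#   path, status = generate_speech("dhivehi text here", "m2.wav", seed=42)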
def clean_text(text):
    """Collapse all whitespace runs (spaces, tabs, newlines) into single spaces and strip the ends"""
    # Replace any run of whitespace with a single space
    text = re.sub(r'\s+', ' ', text)
    # Strip leading and trailing spaces
    return text.strip()
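# Example of clean_text (illustrative):
#   clean_text("  line one\n\nline two  ")  ->  "line one line two"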
def split_sentences(text):
    """Split text on periods, then merge the pieces into chunks of at least 150 characters (the final chunk may be shorter)"""
# Clean the input text first
text = clean_text(text)
# First, split by periods normally
initial_sentences = []
current_sentence = ""
for char in text:
current_sentence += char
if char == '.':
# Add sentence if it's not empty after stripping spaces from both sides
stripped_sentence = current_sentence.strip()
if stripped_sentence:
initial_sentences.append(stripped_sentence)
current_sentence = ""
# Add remaining text if any (without period), stripped of spaces from both sides
stripped_remaining = current_sentence.strip()
if stripped_remaining:
initial_sentences.append(stripped_remaining)
# If we only have one sentence, return it
if len(initial_sentences) <= 1:
return initial_sentences
# Now combine sentences until each is at least 150 characters
final_sentences = []
combined_sentence = ""
for sentence in initial_sentences:
if combined_sentence:
combined_sentence += " " + sentence
else:
combined_sentence = sentence
# If combined sentence is >= 150 chars, add it to final list
if len(combined_sentence) >= 150:
final_sentences.append(combined_sentence.strip())
combined_sentence = ""
# Add any remaining combined sentence (even if < 150 chars)
if combined_sentence.strip():
final_sentences.append(combined_sentence.strip())
return final_sentences
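# Example of split_sentences (illustrative): short pieces are merged until a
# chunk reaches 150 characters, and a short trailing chunk is kept as-is:
#   split_sentences("ab. cd. ef")  ->  ["ab. cd. ef"]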
# Internal implementation without decorator
def _generate_speech_multi_sentence_impl(text,
reference_audio,
exaggeration=0.5,
temperature=0.1,
cfg_weight=0.5,
seed=42,
use_gpu=True):
"""Internal implementation of multi-sentence speech generation"""
global MODEL
# Clean the input text
text = clean_text(text)
if not text:
yield None, "Please enter some text to generate speech."
return
if MODEL is None:
yield None, "Model not loaded. Please check your model paths."
return
# Split text into sentences
sentences = split_sentences(text)
# If only one sentence or no periods, use regular method
if len(sentences) <= 1:
yield None, "Generating single sentence..."
result_audio, result_status = generate_speech(text, reference_audio, exaggeration, temperature, cfg_weight, seed, use_gpu)
yield result_audio, result_status
return
try:
# Set seed for reproducibility
set_seed(seed)
# Handle reference audio - validate it exists
audio_prompt_path = None
if reference_audio and isinstance(reference_audio, str) and reference_audio.strip():
# Check if file actually exists
if os.path.exists(reference_audio):
audio_prompt_path = reference_audio
print(f"Using reference audio: {audio_prompt_path}")
else:
print(f"Reference audio path not found, ignoring: {reference_audio}")
if not audio_prompt_path:
print("Generating without reference audio")
yield None, f"Starting generation for {len(sentences)} sentences..."
print(f"Processing {len(sentences)} sentences...")
all_audio_segments = []
total_duration = 0
for i, sentence in enumerate(sentences):
# Calculate progress percentage
progress_percent = int((i / len(sentences)) * 90) # Reserve last 10% for combining
yield None, f"Generating sentence {i+1}/{len(sentences)} ({progress_percent}%): {sentence[:50]}..."
print(f"Generating audio for sentence {i+1}/{len(sentences)}: {sentence[:50]}...")
# Generate audio for this sentence
try:
if audio_prompt_path:
audio = MODEL.generate(
text=sentence,
audio_prompt_path=audio_prompt_path,
exaggeration=exaggeration,
temperature=temperature,
cfg_weight=cfg_weight,
)
else:
# Try without reference audio
try:
audio = MODEL.generate(
text=sentence,
exaggeration=exaggeration,
temperature=temperature,
cfg_weight=cfg_weight,
)
except TypeError:
# If the model requires audio_prompt_path, try with empty string
audio = MODEL.generate(
text=sentence,
audio_prompt_path="",
exaggeration=exaggeration,
temperature=temperature,
cfg_weight=cfg_weight,
)
except Exception as model_error:
# If the model fails due to missing reference audio, try with default behavior
if "reference_voice.wav not found" in str(model_error) or "No reference audio provided" in str(model_error):
print("Attempting generation without reference audio...")
# Try different approaches for models that don't support None reference audio
try:
# Some models might accept an empty string
audio = MODEL.generate(
text=sentence,
audio_prompt_path="",
exaggeration=exaggeration,
temperature=temperature,
cfg_weight=cfg_weight,
)
                    except Exception:
# If that fails, try without the audio_prompt_path parameter entirely
audio = MODEL.generate(
text=sentence,
exaggeration=exaggeration,
temperature=temperature,
cfg_weight=cfg_weight,
)
else:
                    raise
all_audio_segments.append(audio)
            total_duration += audio.shape[1] / SAMPLE_RATE
# Concatenate all audio segments
yield None, "Combining audio segments (95%)..."
print("Combining audio segments...")
combined_audio = torch.cat(all_audio_segments, dim=1)
# Save to temporary file
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
output_path = tmp_file.name
            torchaudio.save(output_path, combined_audio, SAMPLE_RATE)
print("Multi-sentence processing complete!")
yield output_path, f"Successfully generated speech from {len(sentences)} sentences! Total audio length: {total_duration:.2f} seconds"
except Exception as e:
error_msg = f"Error generating multi-sentence speech: {str(e)}"
print(error_msg)
yield None, error_msg
# GPU version with decorator
@spaces.GPU
def _generate_speech_multi_sentence_gpu(text, reference_audio, exaggeration=0.5, temperature=0.1, cfg_weight=0.5, seed=42):
"""GPU version of multi-sentence speech generation"""
for result in _generate_speech_multi_sentence_impl(text, reference_audio, exaggeration, temperature, cfg_weight, seed, use_gpu=True):
yield result
# CPU version without decorator
def _generate_speech_multi_sentence_cpu(text, reference_audio, exaggeration=0.5, temperature=0.1, cfg_weight=0.5, seed=42):
"""CPU version of multi-sentence speech generation"""
for result in _generate_speech_multi_sentence_impl(text, reference_audio, exaggeration, temperature, cfg_weight, seed, use_gpu=False):
yield result
# Router function
def generate_speech_multi_sentence(text, reference_audio, exaggeration=0.5, temperature=0.1, cfg_weight=0.5, seed=42, use_gpu=True):
"""Generate speech from text with multi-sentence support and progress tracking"""
if use_gpu:
for result in _generate_speech_multi_sentence_gpu(text, reference_audio, exaggeration, temperature, cfg_weight, seed):
yield result
else:
for result in _generate_speech_multi_sentence_cpu(text, reference_audio, exaggeration, temperature, cfg_weight, seed):
yield result
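# Illustrative usage: the router is a generator, so iterate to receive the
# progress messages; the final yield carries the output wav path (or None):
#   for path, status in generate_speech_multi_sentence(text, "m2.wav"):
#       print(status)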
def create_interface():
"""Create the Gradio interface"""
# Sample texts in Dhivehi
sample_texts = [
"ކާޑު ނުލައި ފައިސާ ދެއްކޭ ނެޝަނަލް ކިއުއާރް ކޯޑް އެމްއެމްއޭ އިން ތައާރަފްކުރަނީ",
"""ފުޓްބޯޅަ ސްކޫލްގެ ބިމާއި ގުދަންބަރި ބިމުގައި އިމާރާތް ކުރުމުގެ މަސައްކަތް ހުއްޓާލަން އަންގައިފި...
Construction work on football school land and warehouse land has been ordered to stop""",
"ސިވިލް ސާވިސްގެ ހިދުމަތުގެ މުއްދަތު ގުނުމުގައި ކުންފުނިތަކާއި އިދާރާތަކަށް ހިދުމަތްކުރި މުއްދަތު ހިމަނަނީ",
"""އެ ރަށުގެ ބިން ހިއްކުމާއި ބަނދަރުގެ ނެރު ބަދަލުކުރުމާއި ގޮނޑުދޮށް ހިމާޔަތް ކުރުމުގެ މަސައްކަތް އެމްޓީސީސީއާ މިނިސްޓްރީން ހަވާލުކުރީ މިދިޔަ މަހު ރައީސް އެ ރަށަށް ކުރެއްވި ދަތުރުފުޅުގައި.
The ministry handed over the land reclamation, replacement of the port canal and beach protection to MTCC during the President's visit to the village last month"""
]
with gr.Blocks(title="ChatterboxTTS - Dhivehi Text-to-Speech", css="""
.textbox1 textarea {
font-size: 18px !important;
font-family: 'MV_Faseyha', 'Faruma', 'A_Faruma' !important;
line-height: 1.8 !important;
direction: rtl !important;
text-align: right !important;
}
""") as app:
gr.Markdown("# 🎤 ChatterboxTTS - Dhivehi Text-to-Speech with Voice Cloning")
gr.Markdown("Generate natural-sounding Dhivehi speech with voice cloning capabilities.")
# Row 1: Text input and Reference audio
with gr.Row():
text_input = gr.Textbox(
label="Text to Convert",
placeholder="Enter Dhivehi text here...",
lines=6,
value="""އައްޑޫގައި ވަކިވަކި ކައުންސިލްތައް ހަދަން ފެނޭތޯ ބަލަން ނަގާ ތާރީހީ، ފެންނަ ނުފެންނަ ވޯޓާ ގުޅޭ ބަހުސެއް މާދަމާ ރޭ "މިހާރު" އިން ބާއްވަން ނިންމައިފި.
އައްޑޫ ސިޓީ ކައުންސިލުގެ ދަށުން އިދާރީ ގޮތުން ހުޅުދު އާއި މީދޫ އަދި ފޭދޫ ވަކިކޮށް. އެ ތިން ރަށުގައި ވަކިވަކި ކައުންސިލުތައް ހެދުމަށް ފެނޭތޯ ބެލުމަށް ތިން ރަށުގެ ރައްޔިތުންގެ މެދުގައި ފެންނަ ނުފެންނަ ވޯޓެއް ނަގަނީ އަންނަ ހޮނިހިރު ދުވަހު.
ރައްޔިތުންގެ ހިޔާލު ހޯދުމުގެ އާންމު ވޯޓު ނެގުމުގެ ގާނޫނުގެ ދަށުން ނަގާ ފުރަތަމަ ވޯޓާ ގުޅޭގޮތުން "މިހާރު" އިން ބަހުސެއް ބާއްވަން ނިންމާފައިވާއިރު. އެ ބަހުސްގައި ބައިވެރިވެވަޑައިގަންނަވާނީ އައްޑޫގެ އެކި ދާއިރާތަކުގައި ތަޖުރިބާކާރު ބޭފުޅުން.
""",
rtl=True,
elem_classes=["textbox1"]
)
reference_audio = gr.Audio(
label="Reference Voice Audio (optional - for voice cloning)",
type="filepath",
sources=["upload", "microphone"],
value="m2.wav"
)
# Row 2: Example buttons
gr.Markdown("**Quick Examples:**")
with gr.Row():
sample_btn1 = gr.Button("Sample 1", size="sm")
sample_btn2 = gr.Button("Sample 2", size="sm")
sample_btn3 = gr.Button("Sample 3", size="sm")
sample_btn4 = gr.Button("Sample 4", size="sm")
# Row 2b: Reference Audio buttons
gr.Markdown("**Reference Audio:**")
with gr.Row():
ref_btn1 = gr.Button("Female 1 (f1.wav)", size="sm")
ref_btn2 = gr.Button("Female 2 (f2.wav)", size="sm")
ref_btn3 = gr.Button("Male 1 (m1.wav)", size="sm")
ref_btn4 = gr.Button("Male 2 (m2.wav)", size="sm")
# Row 3: Advanced settings
with gr.Accordion("Advanced Settings", open=False):
with gr.Row():
exaggeration = gr.Slider(
minimum=0.0,
maximum=5.0,
value=0.5,
step=0.1,
label="Exaggeration",
info="Controls expressiveness"
)
temperature = gr.Slider(
minimum=0.01,
maximum=1.0,
value=0.8,
step=0.01,
label="Temperature",
info="Controls randomness"
)
cfg_weight = gr.Slider(
minimum=0.0,
maximum=5.0,
value=0.5,
step=0.1,
label="CFG Weight",
info="Classifier-free guidance weight"
)
seed = gr.Slider(
minimum=0,
maximum=9999,
value=42,
step=1,
label="Seed",
info="For reproducible results"
)
with gr.Row():
model_select = gr.Dropdown(
                    #choices=["kn_cbox", "f01_cbox"],  # re-enable once the correct f01 checkpoint is uploaded
choices=["kn_cbox"],
value="kn_cbox",
label="Model",
info="Select TTS model"
)
device_select = gr.Dropdown(
choices=["GPU", "CPU"],
value="GPU",
label="Device",
info="Select computation device"
)
reload_btn = gr.Button("🔄 Reload Model", size="sm")
reload_status = gr.Textbox(label="Model Status", value="✅ Model 'kn_cbox' loaded on GPU", interactive=False)
gr.Markdown("**Note:** This fine-tune is minimal, so some words may drop or sentences might not complete perfectly. You can experiment with the Advanced Settings to find what works best for your reference audio and to reduce any output issues. This Space uses ZeroGPU for processing, so if your text is long, the GPU might be released before completion, which could cause a timeout. For longer inputs, switch to CPU mode from the Advanced Settings and wait for it to finish. It will run a bit slower, but it should still complete reliably.")
# Row 4: Generate button
generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
# Row 5: Output section
with gr.Row():
with gr.Column():
output_audio = gr.Audio(label="Generated Speech", type="filepath")
status_message = gr.Textbox(label="Status", interactive=False)
# Event handlers
# Default values for advanced settings
DEFAULT_EXAGGERATION = 0.5
DEFAULT_TEMPERATURE = 0.8
DEFAULT_CFG_WEIGHT = 0.5
DEFAULT_SEED = 42
def set_sample_text(sample_idx):
return sample_texts[sample_idx], DEFAULT_EXAGGERATION, DEFAULT_TEMPERATURE, DEFAULT_CFG_WEIGHT, DEFAULT_SEED
def set_reference_audio(audio_file):
return audio_file, DEFAULT_EXAGGERATION, DEFAULT_TEMPERATURE, DEFAULT_CFG_WEIGHT, DEFAULT_SEED
def reload_model_handler(model_name, device_name):
"""Reload model with selected checkpoint and device"""
try:
device = "cuda" if device_name == "GPU" else "cpu"
load_model(checkpoint=model_name, device=device)
return f"✅ Model '{model_name}' loaded successfully on {device_name}!"
except Exception as e:
return f"❌ Error loading model: {str(e)}"
sample_btn1.click(lambda: set_sample_text(0), outputs=[text_input, exaggeration, temperature, cfg_weight, seed])
sample_btn2.click(lambda: set_sample_text(1), outputs=[text_input, exaggeration, temperature, cfg_weight, seed])
sample_btn3.click(lambda: set_sample_text(2), outputs=[text_input, exaggeration, temperature, cfg_weight, seed])
sample_btn4.click(lambda: set_sample_text(3), outputs=[text_input, exaggeration, temperature, cfg_weight, seed])
ref_btn1.click(lambda: set_reference_audio("f1.wav"), outputs=[reference_audio, exaggeration, temperature, cfg_weight, seed])
ref_btn2.click(lambda: set_reference_audio("f2.wav"), outputs=[reference_audio, exaggeration, temperature, cfg_weight, seed])
ref_btn3.click(lambda: set_reference_audio("m1.wav"), outputs=[reference_audio, exaggeration, temperature, cfg_weight, seed])
ref_btn4.click(lambda: set_reference_audio("m2.wav"), outputs=[reference_audio, exaggeration, temperature, cfg_weight, seed])
reload_btn.click(
fn=reload_model_handler,
inputs=[model_select, device_select],
outputs=[reload_status]
)
def generate_with_progress(text, reference_audio, exaggeration, temperature, cfg_weight, seed, device_name):
"""Generate speech with streaming progress updates"""
use_gpu = (device_name == "GPU")
# Use the streaming generator
for result_audio, result_status in generate_speech_multi_sentence(
text, reference_audio, exaggeration, temperature, cfg_weight, seed, use_gpu
):
yield result_audio, result_status
generate_btn.click(
fn=generate_with_progress,
inputs=[text_input, reference_audio, exaggeration, temperature, cfg_weight, seed, device_select],
outputs=[output_audio, status_message]
)
# Parameter Examples Section
gr.Markdown("### Examples")
gr.Markdown("Click any example below to load pre-configured settings:")
gr.Examples(
examples=[
# [text, reference_audio, exaggeration, temperature, cfg_weight, seed, device]
["""އެއް ދުވަހަކު ސަލާންޖަހާ މީހަކު ޖުހާގެ ގޭގެ ދޮރުމައްޗަށް އަރާ ސަލާން ގޮވާލައިފިއެވެ.
ސަލާންޖަހާ މީހާ އައިސް ސަލާން ގޮވާލި އިރު ޖުހާ އުޅެނީ ގޭގެ މަތީ ބުރީގައެވެ.
“ކާކު؟ ކީއްކުރަން؟” ޖުހާ ގޭތެރެއިން ގޮވާލައިފިއެވެ.
“އައިސްފާނަންތަ ތިރިއަށް؟” ސަލާންޖަހާ މީހާ ބުންޏެވެ.
އޭނާ އެހެން ބުނުމުން ޖުހާ ތިރިއަށް ގޮސް ސަލާން ޖަހާ މީހާ އާ ބައްދަލު ކޮށްފިއެވެ.
“ކިހިނެއްވީ؟” ސަލާންޖަހާ މީހާ ކުރެން ޖުހާ އަހައިފިއެވެ.
“އަހަންނަކީ ވަރަށް ބޮޑު ނިކަމެއްޗެއް، ސަދަގާތެއްގެ ގޮތުން އަހަންނަށް އެހީއެއް ދޭތޯ!” ސަލާންޖަހާ މީހާ ބުންޏެވެ.
“އާދޭ އެތެރެއަށް.” ޖުހާ ބުންޏެވެ.
ޖުހާ އެހެން ބުނުމުން ސަލާންޖަހާ މީހާ ގޭތެރެއަށް ވަދެއްޖެއެވެ. ގޭތެރެއަށް ވަނުމުން މަށާއެކީ އަންނާށޭ ކިޔާ ޖުހާ ގޭގެ ސިޑިން މައްޗަށް އަރަން ފަށައިފިއެވެ. ސަލާން ޖަހާ މީހާ ވެސް ޖުހާގެ ފަހަތުން މައްޗަށް ދެއެވެ. މި ހެން ގޮސް އެމީހާ ގޮވައިގެން ގޮސް ގޭގެ ފުރާޅު މައްޗަށް އަރައިފިއެވެ.
ފުރާޅު މައްޗަށް އެރުމާއެކު ޖުހާ ބުނެފިއެވެ. “މަގޭ އަތަކު ދޭނެ އެއްޗެއް ނެތް.”
ސަލާންޖާހާމީހާ މިހާ ހިސާބަށް މައްޗަށް އެރުވުމަށް ފަހު ދޭނެ އެއްޗެއް ނެތޭ ޖުހާ ބުނުމުން އޭނާ ވަރަށް ހިތްހަމަ ނުޖެހިއްޖެއެވެ. “ކީއްވެ ތިހެން ތިހެދީ؟ އަހަރެން ދޮރުމަތީގައި ހުއްޓާވެސް ތިޔަހެން ބުނެލެވުނީހެއްނު! ކީއްކުރަން މިހާ ހިސާބަށް އަރުވާފައި ތިހެން ތިބުނީ؟”
“އެހެން ވިއްޔާ ކީއްވެ’ އަހަރެން ތިރިއަށް ނުބާލާ، ތިޔަ ހޯދަން އުޅުނު އެހީއެއްގެ ވާހަކަ ނޭހީ؟ އެހެން ނަމަ މަށަށްވެސް އެއްޗެއް ނެތޭ ބުނެ ތިރިޔަށް ނުފައިބާ ފަރުޖެއްސުނީހެއްނު” ޖުހާ ސަލާންޖަހާ މީހާ އަށް ޖަވާބު ދިނެވެ.""", "m2.wav", 0.5, 0.8, 0.5, 42, "GPU"],
["""ގެދޮރުވެރިޔާ މަޝްރޫއުގެ ދަށުން ހުޅުމާލޭގައި ފަހި ދިރިއުޅުން ކޯޕަރޭޝަން އިން އިމާރާތްކޮށްފައިވާ ފްލެޓްތައް ހަވާލުކުރަން ފެށުމާ އެކު ބޮޑު އަގުގައި އެތަންތަން ކުއްޔަށް ދޭން އިޝްތިހާރު ޖަހަން ފަށައިފި""", "f1.wav", 0.5, 0.8, 0.5, 42, "GPU"],
["""ކަސްޓަމްސްގެ އިސް އޮފިޝަލަކު "މިހާރު" އަށް މިއަދު ވިދާޅުވި ގޮތުގައި، ކަސްޓަމްސް އިން ހިފެހެއްޓުމަށް ފަހު، އެ ދެ ކޮންޓެއިނަރު ހުޅުމާލެ ބަނދަރުގައި ބެހެއްޓީ އެ ބަނދަރު ބަލަހައްޓާ އެމްޕީއެލްގެ ހަވާލުގަ. އެމްޕީއެލްއާ އެ ހަވާލުކުރީ އޮންނަ އުސޫލުގެ ތެރެއިން، ލިޔެކިޔުންތައް ފުރިހަމަކޮށްފައި ކަމަށާއި އެ ސިނގިރެޓްތައް ނައްތާނުލައި ހުރީ ތަހުގީގު ނުނިމޭތީ ކަމަށް އޮފިޝަލް ވިދާޅުވި.
A senior customs official told Miharu today that the two containers were seized by customs and placed in the custody of MPL. The cigarettes were handed over to MPL after completing the documents and the investigation was not completed the official said.""", "f2.wav", 0.2, 0.35, 0.4, 42, "GPU"],
["""ޤައުމަށްޓަކާ ދީނަށްޓަކާ ކެރިގެން ޖިހާދު ކުރާނަމޭ.
ފައުޅާއި ސިއްރާ އެއްގޮތަށް ހުރިހާ ކަމެއް ގެންދާނަމޭ،
އަހުރެންގެ މޭ ޤައުމަށްޓަކައި އައްޑަނައަކަށް ދިއްކޮށްލުމީ.
ފަހުނޭވަޔަށް ދަންދެން މޮޓޯ ކަމުގަައި ޚިޔާރުކުރާނަމޭ،
އަންނާނެ ތީރެއް އުންޑައެއް ފެނިގެން އަމާޒުވެގެން މެއަށް.
ގަންނާ ކުރެއްވި ބިރުން ފިލަން ދާމީހަކަށް މަ ނުވާނަމޭ،
އެޅިފައިވި މަސްއޫލިއްޔަތެއް އުފުލަންދިމާވީމާ ދެނެއް.
ފެޅިގެން ދެފަޅިއަށް ދިޔަޔަކަސް އެއަކުން މަށެއް ނުރެކޭނަމޭ،
ކަމަކަށް ގޮވާލީމާ މިޤައުމުގެ ޢިއްޒަތާ އަބުރަށްޓަކައި.
އަމަށުން ހުރީވިއްޔާ އެތާ އެކަކަށް މަވެސް ހުންނާނަމޭ،
މިނިވަންކަމާ އެކުވެރިކަމާ ހަމަހަމަ ކަމަށް ތަރުހީބުދީ.
ހިނިތުންވެ ތިބެ ދީނީ އުޚުއްވަތް ފެތުރުމަށް މަ ގޮވާނަމޭ،
އަޚުނާއި އުޚުތުންނަށް އެދޭނީ ލާބަޔާ މަންފާތަކޭ.
ބަޚުތާއިމެދު ނުރުހުންވެގެން ޝަކުވާތަކެއް ނުކުރާނަމޭ،
އަނެކުންގެ ކުށްތައް ހޯދުމީ ނަފުސުގެ މަތިން ނައްތައި ހަނދާން.""", "m3.mp3", 0.5, 0.8, 0.5, 42, "GPU"],
["""ކަސްޓަމްސްގެ އިސް އޮފިޝަލަކު "މިހާރު" އަށް މިއަދު ވިދާޅުވި ގޮތުގައި، ކަސްޓަމްސް އިން ހިފެހެއްޓުމަށް ފަހު، އެ ދެ ކޮންޓެއިނަރު ހުޅުމާލެ ބަނދަރުގައި ބެހެއްޓީ އެ ބަނދަރު ބަލަހައްޓާ އެމްޕީއެލްގެ ހަވާލުގަ. އެމްޕީއެލްއާ އެ ހަވާލުކުރީ އޮންނަ އުސޫލުގެ ތެރެއިން، ލިޔެކިޔުންތައް ފުރިހަމަކޮށްފައި ކަމަށާއި އެ ސިނގިރެޓްތައް ނައްތާނުލައި ހުރީ ތަހުގީގު ނުނިމޭތީ ކަމަށް އޮފިޝަލް ވިދާޅުވި.
A senior customs official told Mihaaru today that the two containers were seized by customs and placed in the custody of MPL. The cigarettes were handed over to MPL after completing the documents and the investigation was not completed the official said.""", "m1.wav", 0.2, 0.35, 0.4, 42, "GPU"],
["""ގެދޮރުވެރިޔާ މަޝްރޫއުގެ ދަށުން ހުޅުމާލޭގައި ފަހި ދިރިއުޅުން ކޯޕަރޭޝަން އިން އިމާރާތްކޮށްފައިވާ ފްލެޓްތައް ހަވާލުކުރަން ފެށުމާ އެކު ބޮޑު އަގުގައި އެތަންތަން ކުއްޔަށް ދޭން އިޝްތިހާރު ޖަހަން ފަށައިފި""", "m2.wav", 0.5, 0.8, 0.5, 42, "GPU"],
],
inputs=[text_input, reference_audio, exaggeration, temperature, cfg_weight, seed, device_select],
outputs=[output_audio, status_message],
fn=generate_with_progress,
label="Preset Configurations",
examples_per_page=8,
cache_examples="lazy"
)
# Instructions
with gr.Accordion("Tips", open=False):
gr.Markdown("""
### General Use (TTS and Voice Agents):
- The default settings (exaggeration=0.5, cfg=0.5) work well for most prompts.
- If the reference speaker has a fast speaking style, lowering cfg to around 0.3 can improve pacing.
### Expressive or Dramatic Speech:
- Try lower cfg values (e.g. ~0.3) and increase exaggeration to around 0.7 or higher.
- Higher exaggeration tends to speed up speech; reducing cfg helps compensate with slower, more deliberate pacing.
### Language Transfer Notes:
- Ensure that the reference clip matches the specified language tag. Otherwise, language transfer outputs may inherit the accent of the reference clip's language.
- To mitigate this, set the CFG weight to 0.
### Additional Tips:
- For best voice cloning results, use clear audio with minimal background noise
- The reference audio should be 3-10 seconds long
- Use the same seed value for reproducible results
""")
return app
if __name__ == "__main__":
# Step 1: Download model files first
print("\nStarting ChatterboxTTS Dhivehi Application")
print("=" * 60)
download_model()
# Step 2: Load the default model with GPU
print("\nLoading default model...")
print("=" * 60)
try:
load_model(checkpoint="kn_cbox", device="cuda")
print("Default model loaded successfully!")
except Exception as e:
print(f"Warning: Could not load default model: {e}")
print("You can manually load the model using the 'Reload Model' button in the interface.")
print("=" * 60)
# Step 3: Create and launch the interface
print("\nCreating Gradio interface...")
app = create_interface()
# Step 4: Launch with public sharing and authentication if needed
print("Launching application...")
print("=" * 60)
app.launch()