Spaces: Running on Zero
| import spaces | |
| from pathlib import Path | |
| import os | |
| import sys | |
| # Add current directory to Python path for HuggingFace Spaces | |
| sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) | |
| from chatterbox.tts import ChatterboxTTS | |
| import torchaudio | |
| import torch | |
| import random | |
| import numpy as np | |
| import gradio as gr | |
| import tempfile | |
| import chatterbox_dhivehi | |
| import warnings | |
# Suppress every warning category for the whole process (no filter on
# category or module is given).
warnings.filterwarnings("ignore")
# NOTE(review): presumably registers the Dhivehi extensions that make
# ChatterboxTTS.from_dhivehi (used by load_model) available - confirm in
# chatterbox_dhivehi.
chatterbox_dhivehi.extend_dhivehi()
# Global variables
# Currently loaded TTS model; stays None until load_model() assigns it.
MODEL = None
# Directory the checkpoint files are downloaded to and loaded from.
_target = Path.home() / ".chatterbox-tts-dhivehi"
def download_model():
    """Fetch the checkpoint files from the Hugging Face Hub when missing.

    The download is skipped when the target directory already exists and
    contains at least one entry. Any failure (including a missing
    ``huggingface_hub`` package) is reported as a warning on stdout and is
    not re-raised, so the caller can continue and let model loading decide.
    """
    banner = "=" * 60
    try:
        # Imported lazily so an ImportError is handled like any other failure.
        from huggingface_hub import snapshot_download

        print(banner)
        print("Checking model files...")
        print(f"Target directory: {_target}")

        already_present = _target.exists() and any(_target.rglob("*"))
        if already_present:
            print("Model files already present.")
            print(banner)
            return

        print("Model files not found. Starting download...")
        print("This may take a few minutes on first run.")
        print(banner)
        download_options = {
            "repo_id": "alakxender/chatterbox-tts-dhivehi",
            "local_dir": str(_target),
            "local_dir_use_symlinks": False,
            "resume_download": True,
            "force_download": True,
            "allow_patterns": ["*.safetensors", "*.json", "*.pt"],
        }
        snapshot_download(**download_options)
        print(banner)
        print("Model files downloaded successfully!")
        print(banner)
    except Exception as download_error:
        print(banner)
        print(f"Warning: Could not download model files: {download_error}")
        print(banner)
def load_model(checkpoint="kn_cbox", device="cuda"):
    """Load a checkpoint into the module-level ``MODEL``.

    Args:
        checkpoint: Name of the checkpoint sub-directory inside ``_target``.
        device: Device string handed to ``ChatterboxTTS.from_dhivehi``.

    Raises:
        Exception: Whatever the loader raised; it is logged and re-raised so
            callers can report the failure themselves.
    """
    global MODEL
    try:
        checkpoint_path = f"{_target}/{checkpoint}"
        print(f"Loading model with checkpoint: {checkpoint_path}")
        print(f"Target device: {device}")
        ckpt_dir = Path(checkpoint_path)
        MODEL = ChatterboxTTS.from_dhivehi(ckpt_dir=ckpt_dir, device=device)
        print(f"Model loaded successfully on {device}!")
    except Exception as load_error:
        print(f"Error loading model: {load_error}")
        raise
def set_seed(seed: int):
    """Seed the Python, NumPy and PyTorch random generators.

    Args:
        seed: Seed value. It is coerced with ``int()`` first, so an
            integral float such as ``42.0`` (e.g. coming from a UI slider)
            seeds exactly like ``42`` instead of making ``np.random.seed``
            raise.
    """
    seed = int(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        # manual_seed_all seeds every visible CUDA device, including the
        # current one.
        torch.cuda.manual_seed_all(seed)
    random.seed(seed)
    np.random.seed(seed)
# Internal implementation without decorator
def _generate_speech_impl(text,
                          reference_audio,
                          exaggeration=0.5,
                          temperature=0.1,
                          cfg_weight=0.5,
                          seed=42):
    """Synthesise ``text`` in one model call and write the result to a WAV file.

    Args:
        text: Input text; it is normalised with ``clean_text`` first.
        reference_audio: Optional path to a reference voice clip. It is only
            used when it is a non-blank string naming an existing file.
        exaggeration, temperature, cfg_weight: Forwarded to ``MODEL.generate``.
        seed: Passed to ``set_seed`` before generation.

    Returns:
        ``(wav_path, status_message)`` on success, ``(None, message)`` when the
        text is empty, no model is loaded, or generation raised.
    """
    text = clean_text(text)
    if not text:
        return None, "Please enter some text to generate speech."
    if MODEL is None:
        return None, "Model not loaded. Please check your model paths."

    sample_rate = 24000
    try:
        set_seed(seed)

        # Accept the reference clip only if it points at a real file.
        voice_prompt = None
        if isinstance(reference_audio, str) and reference_audio.strip():
            if os.path.exists(reference_audio):
                voice_prompt = reference_audio
                print(f"Using reference audio: {voice_prompt}")
            else:
                print(f"Reference audio path not found, ignoring: {reference_audio}")
        if voice_prompt is None:
            print("Generating without reference audio")

        print(f"Generating audio for: {text[:50]}...")

        synth_options = {
            "text": text,
            "exaggeration": exaggeration,
            "temperature": temperature,
            "cfg_weight": cfg_weight,
        }
        if voice_prompt is not None:
            audio = MODEL.generate(audio_prompt_path=voice_prompt, **synth_options)
        else:
            try:
                audio = MODEL.generate(**synth_options)
            except TypeError:
                # The model insists on the argument: retry with an empty path.
                audio = MODEL.generate(audio_prompt_path="", **synth_options)

        # Reserve a temp file name, then let torchaudio write to that path.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as wav_file:
            output_path = wav_file.name
        torchaudio.save(output_path, audio, sample_rate)

        seconds = audio.shape[1] / sample_rate
        return output_path, f"Successfully generated speech! Audio length: {seconds:.2f} seconds"
    except Exception as e:
        error_msg = f"Error generating speech: {str(e)}"
        print(error_msg)
        return None, error_msg
# GPU version with decorator
@spaces.GPU
def _generate_speech_gpu(text, reference_audio, exaggeration=0.5, temperature=0.1, cfg_weight=0.5, seed=42):
    """Run ``_generate_speech_impl`` inside a ZeroGPU allocation.

    ``spaces.GPU`` is the Hugging Face ZeroGPU decorator that requests a GPU
    for the duration of the call; without it this function would be identical
    to the CPU variant. Arguments and the ``(wav_path, status)`` return value
    are those of ``_generate_speech_impl``.
    """
    return _generate_speech_impl(text, reference_audio, exaggeration, temperature, cfg_weight, seed)
# CPU version without decorator
def _generate_speech_cpu(text, reference_audio, exaggeration=0.5, temperature=0.1, cfg_weight=0.5, seed=42):
    """Plain (undecorated) entry point that forwards to ``_generate_speech_impl``."""
    forwarded = (text, reference_audio, exaggeration, temperature, cfg_weight, seed)
    return _generate_speech_impl(*forwarded)
# Router function
def generate_speech(text, reference_audio, exaggeration=0.5, temperature=0.1, cfg_weight=0.5, seed=42, use_gpu=True):
    """Dispatch a single-shot generation to the GPU or CPU entry point.

    Returns the ``(wav_path, status_message)`` pair produced by the chosen
    entry point.
    """
    handler = _generate_speech_gpu if use_gpu else _generate_speech_cpu
    return handler(text, reference_audio, exaggeration, temperature, cfg_weight, seed)
def clean_text(text):
    """Collapse all whitespace runs to single spaces and trim both ends.

    Args:
        text: Raw input string, or ``None``.

    Returns:
        The normalised string. ``None`` yields ``""`` so callers can treat a
        missing value like empty input instead of hitting an AttributeError.
    """
    import re

    if text is None:
        return ""
    # Newlines and tabs are whitespace too, so one substitution followed by a
    # strip also removes leading/trailing line breaks.
    return re.sub(r'\s+', ' ', text).strip()
def split_sentences(text, min_length=150):
    """Split text into chunks of whole sentences for piecewise synthesis.

    The text is normalised with ``clean_text`` and then broken only where a
    period is followed by whitespace. Ellipses ("..."), decimals ("3.5") and
    other periods inside a token therefore stay intact, and joining the
    returned chunks with single spaces reproduces the cleaned text exactly.
    Consecutive sentences are merged until a chunk reaches ``min_length``
    characters.

    Args:
        text: Raw input text.
        min_length: Minimum chunk length in characters (default 150). The
            final chunk may be shorter because it holds whatever is left.

    Returns:
        A list of chunks; empty for empty input, and a single-element list
        when the text contains at most one sentence.
    """
    import re

    text = clean_text(text)
    sentences = [piece for piece in re.split(r'(?<=\.)\s+', text) if piece]

    # Nothing to merge when there is at most one sentence.
    if len(sentences) <= 1:
        return sentences

    chunks = []
    current = ""
    for sentence in sentences:
        current = f"{current} {sentence}" if current else sentence
        if len(current) >= min_length:
            chunks.append(current)
            current = ""
    # Keep the remainder even if it is shorter than min_length.
    if current:
        chunks.append(current)
    return chunks
# Internal implementation without decorator
def _generate_speech_multi_sentence_impl(text,
                                         reference_audio,
                                         exaggeration=0.5,
                                         temperature=0.1,
                                         cfg_weight=0.5,
                                         seed=42,
                                         use_gpu=True):
    """Generate speech chunk by chunk, yielding progress along the way.

    This is a generator. Every item is an ``(audio_path, status_message)``
    pair; ``audio_path`` is ``None`` for progress and error updates and is the
    path of the final WAV file in the last item of a successful run.

    Args:
        text: Input text; normalised with ``clean_text`` and divided with
            ``split_sentences``.
        reference_audio: Optional path to a reference voice clip. It is only
            used when it is a non-blank string naming an existing file.
        exaggeration, temperature, cfg_weight: Forwarded to ``MODEL.generate``.
        seed: Passed to ``set_seed`` once before the first chunk.
        use_gpu: Only used on the single-chunk path, where the work is
            delegated to ``generate_speech``.
    """
    text = clean_text(text)
    if not text:
        yield None, "Please enter some text to generate speech."
        return
    if MODEL is None:
        yield None, "Model not loaded. Please check your model paths."
        return

    sentences = split_sentences(text)

    # A single chunk does not need the concatenation machinery.
    if len(sentences) <= 1:
        yield None, "Generating single sentence..."
        result_audio, result_status = generate_speech(text, reference_audio, exaggeration, temperature, cfg_weight, seed, use_gpu)
        yield result_audio, result_status
        return

    try:
        set_seed(seed)

        # Accept the reference clip only if it points at a real file.
        audio_prompt_path = None
        if reference_audio and isinstance(reference_audio, str) and reference_audio.strip():
            if os.path.exists(reference_audio):
                audio_prompt_path = reference_audio
                print(f"Using reference audio: {audio_prompt_path}")
            else:
                print(f"Reference audio path not found, ignoring: {reference_audio}")
        if not audio_prompt_path:
            print("Generating without reference audio")

        yield None, f"Starting generation for {len(sentences)} sentences..."
        print(f"Processing {len(sentences)} sentences...")

        generation_kwargs = {
            "exaggeration": exaggeration,
            "temperature": temperature,
            "cfg_weight": cfg_weight,
        }
        all_audio_segments = []
        total_duration = 0
        for i, sentence in enumerate(sentences):
            # Generation accounts for 0-90%; the rest is reserved for combining.
            progress_percent = int((i / len(sentences)) * 90)
            yield None, f"Generating sentence {i+1}/{len(sentences)} ({progress_percent}%): {sentence[:50]}..."
            print(f"Generating audio for sentence {i+1}/{len(sentences)}: {sentence[:50]}...")

            audio = _synthesize_sentence(sentence, audio_prompt_path, generation_kwargs)
            all_audio_segments.append(audio)
            total_duration += audio.shape[1] / 24000

        yield None, "Combining audio segments (95%)..."
        print("Combining audio segments...")
        # Segments are joined along dim 1, the same axis used for the duration.
        combined_audio = torch.cat(all_audio_segments, dim=1)

        # Reserve a temp file name, then let torchaudio write to that path.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            output_path = tmp_file.name
        torchaudio.save(output_path, combined_audio, 24000)

        print("Multi-sentence processing complete!")
        yield output_path, f"Successfully generated speech from {len(sentences)} sentences! Total audio length: {total_duration:.2f} seconds"
    except Exception as e:
        error_msg = f"Error generating multi-sentence speech: {str(e)}"
        print(error_msg)
        yield None, error_msg


def _synthesize_sentence(sentence, audio_prompt_path, generation_kwargs):
    """Run ``MODEL.generate`` for one chunk, with the no-reference fallbacks.

    With a reference clip the model is called once. Without one, the call is
    first tried without ``audio_prompt_path`` and, on ``TypeError``, retried
    with an empty path. If the model reports a missing reference voice, both
    variants are retried in the opposite order; any other error propagates.
    """
    try:
        if audio_prompt_path:
            return MODEL.generate(text=sentence, audio_prompt_path=audio_prompt_path, **generation_kwargs)
        try:
            return MODEL.generate(text=sentence, **generation_kwargs)
        except TypeError:
            # The model insists on the argument: retry with an empty path.
            return MODEL.generate(text=sentence, audio_prompt_path="", **generation_kwargs)
    except Exception as model_error:
        message = str(model_error)
        if "reference_voice.wav not found" not in message and "No reference audio provided" not in message:
            raise
        print("Attempting generation without reference audio...")
        try:
            # Some models might accept an empty string.
            return MODEL.generate(text=sentence, audio_prompt_path="", **generation_kwargs)
        except Exception:
            # Last resort: omit the parameter entirely. Only Exception is
            # caught so KeyboardInterrupt/SystemExit are never swallowed.
            return MODEL.generate(text=sentence, **generation_kwargs)
# GPU version with decorator
@spaces.GPU
def _generate_speech_multi_sentence_gpu(text, reference_audio, exaggeration=0.5, temperature=0.1, cfg_weight=0.5, seed=42):
    """Stream multi-sentence generation inside a ZeroGPU allocation.

    ``spaces.GPU`` is the Hugging Face ZeroGPU decorator that requests a GPU
    for the duration of the call; without it this generator would be identical
    to the CPU variant. Yields the ``(audio_path, status_message)`` pairs of
    ``_generate_speech_multi_sentence_impl`` unchanged.
    """
    yield from _generate_speech_multi_sentence_impl(text, reference_audio, exaggeration, temperature, cfg_weight, seed, use_gpu=True)
# CPU version without decorator
def _generate_speech_multi_sentence_cpu(text, reference_audio, exaggeration=0.5, temperature=0.1, cfg_weight=0.5, seed=42):
    """Relay every update of the multi-sentence implementation with ``use_gpu=False``."""
    updates = _generate_speech_multi_sentence_impl(
        text, reference_audio, exaggeration, temperature, cfg_weight, seed, use_gpu=False
    )
    yield from updates
# Router function
def generate_speech_multi_sentence(text, reference_audio, exaggeration=0.5, temperature=0.1, cfg_weight=0.5, seed=42, use_gpu=True):
    """Stream ``(audio_path, status_message)`` updates from the GPU or CPU generator.

    ``use_gpu`` selects which of the two streaming entry points is consumed;
    their updates are passed through untouched.
    """
    streamer = _generate_speech_multi_sentence_gpu if use_gpu else _generate_speech_multi_sentence_cpu
    yield from streamer(text, reference_audio, exaggeration, temperature, cfg_weight, seed)
def create_interface():
    """Create the Gradio interface.

    Builds a ``gr.Blocks`` app containing, in creation order: the text box and
    reference-audio input, sample-text buttons, reference-voice buttons, an
    "Advanced Settings" accordion (generation sliders, model/device dropdowns
    and a reload button), the generate button, the audio/status outputs, a
    table of preset examples and a "Tips" accordion.

    Returns:
        The constructed ``gr.Blocks`` app. It is not launched here.
    """
    # Sample texts in Dhivehi
    sample_texts = [
        "ކާޑު ނުލައި ފައިސާ ދެއްކޭ ނެޝަނަލް ކިއުއާރް ކޯޑް އެމްއެމްއޭ އިން ތައާރަފްކުރަނީ",
        """ފުޓްބޯޅަ ސްކޫލްގެ ބިމާއި ގުދަންބަރި ބިމުގައި އިމާރާތް ކުރުމުގެ މަސައްކަތް ހުއްޓާލަން އަންގައިފި...
Construction work on football school land and warehouse land has been ordered to stop""",
        "ސިވިލް ސާވިސްގެ ހިދުމަތުގެ މުއްދަތު ގުނުމުގައި ކުންފުނިތަކާއި އިދާރާތަކަށް ހިދުމަތްކުރި މުއްދަތު ހިމަނަނީ",
        """އެ ރަށުގެ ބިން ހިއްކުމާއި ބަނދަރުގެ ނެރު ބަދަލުކުރުމާއި ގޮނޑުދޮށް ހިމާޔަތް ކުރުމުގެ މަސައްކަތް އެމްޓީސީސީއާ މިނިސްޓްރީން ހަވާލުކުރީ މިދިޔަ މަހު ރައީސް އެ ރަށަށް ކުރެއްވި ދަތުރުފުޅުގައި.
The ministry handed over the land reclamation, replacement of the port canal and beach protection to MTCC during the President's visit to the village last month"""
    ]
    # The CSS targets the "textbox1" class given to the text input below and
    # renders it right-to-left.
    with gr.Blocks(title="ChatterboxTTS - Dhivehi Text-to-Speech", css="""
.textbox1 textarea {
font-size: 18px !important;
font-family: 'MV_Faseyha', 'Faruma', 'A_Faruma' !important;
line-height: 1.8 !important;
direction: rtl !important;
text-align: right !important;
}
""") as app:
        gr.Markdown("# 🎤 ChatterboxTTS - Dhivehi Text-to-Speech with Voice Cloning")
        gr.Markdown("Generate natural-sounding Dhivehi speech with voice cloning capabilities.")
        # Row 1: Text input and Reference audio
        with gr.Row():
            text_input = gr.Textbox(
                label="Text to Convert",
                placeholder="Enter Dhivehi text here...",
                lines=6,
                value="""އައްޑޫގައި ވަކިވަކި ކައުންސިލްތައް ހަދަން ފެނޭތޯ ބަލަން ނަގާ ތާރީހީ، ފެންނަ ނުފެންނަ ވޯޓާ ގުޅޭ ބަހުސެއް މާދަމާ ރޭ "މިހާރު" އިން ބާއްވަން ނިންމައިފި.
އައްޑޫ ސިޓީ ކައުންސިލުގެ ދަށުން އިދާރީ ގޮތުން ހުޅުދު އާއި މީދޫ އަދި ފޭދޫ ވަކިކޮށް. އެ ތިން ރަށުގައި ވަކިވަކި ކައުންސިލުތައް ހެދުމަށް ފެނޭތޯ ބެލުމަށް ތިން ރަށުގެ ރައްޔިތުންގެ މެދުގައި ފެންނަ ނުފެންނަ ވޯޓެއް ނަގަނީ އަންނަ ހޮނިހިރު ދުވަހު.
ރައްޔިތުންގެ ހިޔާލު ހޯދުމުގެ އާންމު ވޯޓު ނެގުމުގެ ގާނޫނުގެ ދަށުން ނަގާ ފުރަތަމަ ވޯޓާ ގުޅޭގޮތުން "މިހާރު" އިން ބަހުސެއް ބާއްވަން ނިންމާފައިވާއިރު. އެ ބަހުސްގައި ބައިވެރިވެވަޑައިގަންނަވާނީ އައްޑޫގެ އެކި ދާއިރާތަކުގައި ތަޖުރިބާކާރު ބޭފުޅުން.
""",
                rtl=True,
                elem_classes=["textbox1"]
            )
            # type="filepath": the handlers receive the clip as a path string.
            reference_audio = gr.Audio(
                label="Reference Voice Audio (optional - for voice cloning)",
                type="filepath",
                sources=["upload", "microphone"],
                value="m2.wav"
            )
        # Row 2: Example buttons
        gr.Markdown("**Quick Examples:**")
        with gr.Row():
            sample_btn1 = gr.Button("Sample 1", size="sm")
            sample_btn2 = gr.Button("Sample 2", size="sm")
            sample_btn3 = gr.Button("Sample 3", size="sm")
            sample_btn4 = gr.Button("Sample 4", size="sm")
        # Row 2b: Reference Audio buttons
        gr.Markdown("**Reference Audio:**")
        with gr.Row():
            ref_btn1 = gr.Button("Female 1 (f1.wav)", size="sm")
            ref_btn2 = gr.Button("Female 2 (f2.wav)", size="sm")
            ref_btn3 = gr.Button("Male 1 (m1.wav)", size="sm")
            ref_btn4 = gr.Button("Male 2 (m2.wav)", size="sm")
        # Row 3: Advanced settings
        with gr.Accordion("Advanced Settings", open=False):
            with gr.Row():
                exaggeration = gr.Slider(
                    minimum=0.0,
                    maximum=5.0,
                    value=0.5,
                    step=0.1,
                    label="Exaggeration",
                    info="Controls expressiveness"
                )
                temperature = gr.Slider(
                    minimum=0.01,
                    maximum=1.0,
                    value=0.8,
                    step=0.01,
                    label="Temperature",
                    info="Controls randomness"
                )
                cfg_weight = gr.Slider(
                    minimum=0.0,
                    maximum=5.0,
                    value=0.5,
                    step=0.1,
                    label="CFG Weight",
                    info="Classifier-free guidance weight"
                )
                seed = gr.Slider(
                    minimum=0,
                    maximum=9999,
                    value=42,
                    step=1,
                    label="Seed",
                    info="For reproducible results"
                )
            with gr.Row():
                model_select = gr.Dropdown(
                    #choices=["kn_cbox", "f01_cbox"], #f01 upload correct chkpnt
                    choices=["kn_cbox"],
                    value="kn_cbox",
                    label="Model",
                    info="Select TTS model"
                )
                device_select = gr.Dropdown(
                    choices=["GPU", "CPU"],
                    value="GPU",
                    label="Device",
                    info="Select computation device"
                )
                reload_btn = gr.Button("🔄 Reload Model", size="sm")
            # NOTE(review): this initial status is a fixed string; it is not
            # derived from MODEL, so it is shown even if the startup load failed.
            reload_status = gr.Textbox(label="Model Status", value="✅ Model 'kn_cbox' loaded on GPU", interactive=False)
        gr.Markdown("**Note:** This fine-tune is minimal, so some words may drop or sentences might not complete perfectly. You can experiment with the Advanced Settings to find what works best for your reference audio and to reduce any output issues. This Space uses ZeroGPU for processing, so if your text is long, the GPU might be released before completion, which could cause a timeout. For longer inputs, switch to CPU mode from the Advanced Settings and wait for it to finish. It will run a bit slower, but it should still complete reliably.")
        # Row 4: Generate button
        generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
        # Row 5: Output section
        with gr.Row():
            with gr.Column():
                output_audio = gr.Audio(label="Generated Speech", type="filepath")
                status_message = gr.Textbox(label="Status", interactive=False)
        # Event handlers
        # Default values for advanced settings
        # These match the initial values of the four sliders above.
        DEFAULT_EXAGGERATION = 0.5
        DEFAULT_TEMPERATURE = 0.8
        DEFAULT_CFG_WEIGHT = 0.5
        DEFAULT_SEED = 42
        def set_sample_text(sample_idx):
            # Load the chosen sample text and reset the sliders to defaults.
            return sample_texts[sample_idx], DEFAULT_EXAGGERATION, DEFAULT_TEMPERATURE, DEFAULT_CFG_WEIGHT, DEFAULT_SEED
        def set_reference_audio(audio_file):
            # Select the given reference clip and reset the sliders to defaults.
            return audio_file, DEFAULT_EXAGGERATION, DEFAULT_TEMPERATURE, DEFAULT_CFG_WEIGHT, DEFAULT_SEED
        def reload_model_handler(model_name, device_name):
            """Reload model with selected checkpoint and device"""
            try:
                # The dropdown shows "GPU"/"CPU"; load_model is given "cuda"/"cpu".
                device = "cuda" if device_name == "GPU" else "cpu"
                load_model(checkpoint=model_name, device=device)
                return f"✅ Model '{model_name}' loaded successfully on {device_name}!"
            except Exception as e:
                return f"❌ Error loading model: {str(e)}"
        # Each button takes no inputs; the lambda fixes which sample/clip it loads.
        sample_btn1.click(lambda: set_sample_text(0), outputs=[text_input, exaggeration, temperature, cfg_weight, seed])
        sample_btn2.click(lambda: set_sample_text(1), outputs=[text_input, exaggeration, temperature, cfg_weight, seed])
        sample_btn3.click(lambda: set_sample_text(2), outputs=[text_input, exaggeration, temperature, cfg_weight, seed])
        sample_btn4.click(lambda: set_sample_text(3), outputs=[text_input, exaggeration, temperature, cfg_weight, seed])
        ref_btn1.click(lambda: set_reference_audio("f1.wav"), outputs=[reference_audio, exaggeration, temperature, cfg_weight, seed])
        ref_btn2.click(lambda: set_reference_audio("f2.wav"), outputs=[reference_audio, exaggeration, temperature, cfg_weight, seed])
        ref_btn3.click(lambda: set_reference_audio("m1.wav"), outputs=[reference_audio, exaggeration, temperature, cfg_weight, seed])
        ref_btn4.click(lambda: set_reference_audio("m2.wav"), outputs=[reference_audio, exaggeration, temperature, cfg_weight, seed])
        reload_btn.click(
            fn=reload_model_handler,
            inputs=[model_select, device_select],
            outputs=[reload_status]
        )
        def generate_with_progress(text, reference_audio, exaggeration, temperature, cfg_weight, seed, device_name):
            """Generate speech with streaming progress updates"""
            # Here the device dropdown only selects the GPU or CPU code path;
            # the loaded model itself is changed only by reload_model_handler.
            use_gpu = (device_name == "GPU")
            # Use the streaming generator
            for result_audio, result_status in generate_speech_multi_sentence(
                text, reference_audio, exaggeration, temperature, cfg_weight, seed, use_gpu
            ):
                yield result_audio, result_status
        generate_btn.click(
            fn=generate_with_progress,
            inputs=[text_input, reference_audio, exaggeration, temperature, cfg_weight, seed, device_select],
            outputs=[output_audio, status_message]
        )
        # Parameter Examples Section
        gr.Markdown("### Examples")
        gr.Markdown("Click any example below to load pre-configured settings:")
        gr.Examples(
            examples=[
                # [text, reference_audio, exaggeration, temperature, cfg_weight, seed, device]
                ["""އެއް ދުވަހަކު ސަލާންޖަހާ މީހަކު ޖުހާގެ ގޭގެ ދޮރުމައްޗަށް އަރާ ސަލާން ގޮވާލައިފިއެވެ.
ސަލާންޖަހާ މީހާ އައިސް ސަލާން ގޮވާލި އިރު ޖުހާ އުޅެނީ ގޭގެ މަތީ ބުރީގައެވެ.
“ކާކު؟ ކީއްކުރަން؟” ޖުހާ ގޭތެރެއިން ގޮވާލައިފިއެވެ.
“އައިސްފާނަންތަ ތިރިއަށް؟” ސަލާންޖަހާ މީހާ ބުންޏެވެ.
އޭނާ އެހެން ބުނުމުން ޖުހާ ތިރިއަށް ގޮސް ސަލާން ޖަހާ މީހާ އާ ބައްދަލު ކޮށްފިއެވެ.
“ކިހިނެއްވީ؟” ސަލާންޖަހާ މީހާ ކުރެން ޖުހާ އަހައިފިއެވެ.
“އަހަންނަކީ ވަރަށް ބޮޑު ނިކަމެއްޗެއް، ސަދަގާތެއްގެ ގޮތުން އަހަންނަށް އެހީއެއް ދޭތޯ!” ސަލާންޖަހާ މީހާ ބުންޏެވެ.
“އާދޭ އެތެރެއަށް.” ޖުހާ ބުންޏެވެ.
ޖުހާ އެހެން ބުނުމުން ސަލާންޖަހާ މީހާ ގޭތެރެއަށް ވަދެއްޖެއެވެ. ގޭތެރެއަށް ވަނުމުން މަށާއެކީ އަންނާށޭ ކިޔާ ޖުހާ ގޭގެ ސިޑިން މައްޗަށް އަރަން ފަށައިފިއެވެ. ސަލާން ޖަހާ މީހާ ވެސް ޖުހާގެ ފަހަތުން މައްޗަށް ދެއެވެ. މި ހެން ގޮސް އެމީހާ ގޮވައިގެން ގޮސް ގޭގެ ފުރާޅު މައްޗަށް އަރައިފިއެވެ.
ފުރާޅު މައްޗަށް އެރުމާއެކު ޖުހާ ބުނެފިއެވެ. “މަގޭ އަތަކު ދޭނެ އެއްޗެއް ނެތް.”
ސަލާންޖާހާމީހާ މިހާ ހިސާބަށް މައްޗަށް އެރުވުމަށް ފަހު ދޭނެ އެއްޗެއް ނެތޭ ޖުހާ ބުނުމުން އޭނާ ވަރަށް ހިތްހަމަ ނުޖެހިއްޖެއެވެ. “ކީއްވެ ތިހެން ތިހެދީ؟ އަހަރެން ދޮރުމަތީގައި ހުއްޓާވެސް ތިޔަހެން ބުނެލެވުނީހެއްނު! ކީއްކުރަން މިހާ ހިސާބަށް އަރުވާފައި ތިހެން ތިބުނީ؟”
“އެހެން ވިއްޔާ ކީއްވެ’ އަހަރެން ތިރިއަށް ނުބާލާ، ތިޔަ ހޯދަން އުޅުނު އެހީއެއްގެ ވާހަކަ ނޭހީ؟ އެހެން ނަމަ މަށަށްވެސް އެއްޗެއް ނެތޭ ބުނެ ތިރިޔަށް ނުފައިބާ ފަރުޖެއްސުނީހެއްނު” ޖުހާ ސަލާންޖަހާ މީހާ އަށް ޖަވާބު ދިނެވެ.""", "m2.wav", 0.5, 0.8, 0.5, 42, "GPU"],
                ["""ގެދޮރުވެރިޔާ މަޝްރޫއުގެ ދަށުން ހުޅުމާލޭގައި ފަހި ދިރިއުޅުން ކޯޕަރޭޝަން އިން އިމާރާތްކޮށްފައިވާ ފްލެޓްތައް ހަވާލުކުރަން ފެށުމާ އެކު ބޮޑު އަގުގައި އެތަންތަން ކުއްޔަށް ދޭން އިޝްތިހާރު ޖަހަން ފަށައިފި""", "f1.wav", 0.5, 0.8, 0.5, 42, "GPU"],
                ["""ކަސްޓަމްސްގެ އިސް އޮފިޝަލަކު "މިހާރު" އަށް މިއަދު ވިދާޅުވި ގޮތުގައި، ކަސްޓަމްސް އިން ހިފެހެއްޓުމަށް ފަހު، އެ ދެ ކޮންޓެއިނަރު ހުޅުމާލެ ބަނދަރުގައި ބެހެއްޓީ އެ ބަނދަރު ބަލަހައްޓާ އެމްޕީއެލްގެ ހަވާލުގަ. އެމްޕީއެލްއާ އެ ހަވާލުކުރީ އޮންނަ އުސޫލުގެ ތެރެއިން، ލިޔެކިޔުންތައް ފުރިހަމަކޮށްފައި ކަމަށާއި އެ ސިނގިރެޓްތައް ނައްތާނުލައި ހުރީ ތަހުގީގު ނުނިމޭތީ ކަމަށް އޮފިޝަލް ވިދާޅުވި.
A senior customs official told Miharu today that the two containers were seized by customs and placed in the custody of MPL. The cigarettes were handed over to MPL after completing the documents and the investigation was not completed the official said.""", "f2.wav", 0.2, 0.35, 0.4, 42, "GPU"],
                ["""ޤައުމަށްޓަކާ ދީނަށްޓަކާ ކެރިގެން ޖިހާދު ކުރާނަމޭ.
ފައުޅާއި ސިއްރާ އެއްގޮތަށް ހުރިހާ ކަމެއް ގެންދާނަމޭ،
އަހުރެންގެ މޭ ޤައުމަށްޓަކައި އައްޑަނައަކަށް ދިއްކޮށްލުމީ.
ފަހުނޭވަޔަށް ދަންދެން މޮޓޯ ކަމުގަައި ޚިޔާރުކުރާނަމޭ،
އަންނާނެ ތީރެއް އުންޑައެއް ފެނިގެން އަމާޒުވެގެން މެއަށް.
ގަންނާ ކުރެއްވި ބިރުން ފިލަން ދާމީހަކަށް މަ ނުވާނަމޭ،
އެޅިފައިވި މަސްއޫލިއްޔަތެއް އުފުލަންދިމާވީމާ ދެނެއް.
ފެޅިގެން ދެފަޅިއަށް ދިޔަޔަކަސް އެއަކުން މަށެއް ނުރެކޭނަމޭ،
ކަމަކަށް ގޮވާލީމާ މިޤައުމުގެ ޢިއްޒަތާ އަބުރަށްޓަކައި.
އަމަށުން ހުރީވިއްޔާ އެތާ އެކަކަށް މަވެސް ހުންނާނަމޭ،
މިނިވަންކަމާ އެކުވެރިކަމާ ހަމަހަމަ ކަމަށް ތަރުހީބުދީ.
ހިނިތުންވެ ތިބެ ދީނީ އުޚުއްވަތް ފެތުރުމަށް މަ ގޮވާނަމޭ،
އަޚުނާއި އުޚުތުންނަށް އެދޭނީ ލާބަޔާ މަންފާތަކޭ.
ބަޚުތާއިމެދު ނުރުހުންވެގެން ޝަކުވާތަކެއް ނުކުރާނަމޭ،
އަނެކުންގެ ކުށްތައް ހޯދުމީ ނަފުސުގެ މަތިން ނައްތައި ހަނދާން.""", "m3.mp3", 0.5, 0.8, 0.5, 42, "GPU"],
                ["""ކަސްޓަމްސްގެ އިސް އޮފިޝަލަކު "މިހާރު" އަށް މިއަދު ވިދާޅުވި ގޮތުގައި، ކަސްޓަމްސް އިން ހިފެހެއްޓުމަށް ފަހު، އެ ދެ ކޮންޓެއިނަރު ހުޅުމާލެ ބަނދަރުގައި ބެހެއްޓީ އެ ބަނދަރު ބަލަހައްޓާ އެމްޕީއެލްގެ ހަވާލުގަ. އެމްޕީއެލްއާ އެ ހަވާލުކުރީ އޮންނަ އުސޫލުގެ ތެރެއިން، ލިޔެކިޔުންތައް ފުރިހަމަކޮށްފައި ކަމަށާއި އެ ސިނގިރެޓްތައް ނައްތާނުލައި ހުރީ ތަހުގީގު ނުނިމޭތީ ކަމަށް އޮފިޝަލް ވިދާޅުވި.
A senior customs official told Mihaaru today that the two containers were seized by customs and placed in the custody of MPL. The cigarettes were handed over to MPL after completing the documents and the investigation was not completed the official said.""", "m1.wav", 0.2, 0.35, 0.4, 42, "GPU"],
                ["""ގެދޮރުވެރިޔާ މަޝްރޫއުގެ ދަށުން ހުޅުމާލޭގައި ފަހި ދިރިއުޅުން ކޯޕަރޭޝަން އިން އިމާރާތްކޮށްފައިވާ ފްލެޓްތައް ހަވާލުކުރަން ފެށުމާ އެކު ބޮޑު އަގުގައި އެތަންތަން ކުއްޔަށް ދޭން އިޝްތިހާރު ޖަހަން ފަށައިފި""", "m2.wav", 0.5, 0.8, 0.5, 42, "GPU"],
            ],
            # Same inputs, outputs and handler as the generate button above.
            inputs=[text_input, reference_audio, exaggeration, temperature, cfg_weight, seed, device_select],
            outputs=[output_audio, status_message],
            fn=generate_with_progress,
            label="Preset Configurations",
            examples_per_page=8,
            cache_examples="lazy"
        )
        # Instructions
        with gr.Accordion("Tips", open=False):
            gr.Markdown("""
### General Use (TTS and Voice Agents):
- The default settings (exaggeration=0.5, cfg=0.5) work well for most prompts.
- If the reference speaker has a fast speaking style, lowering cfg to around 0.3 can improve pacing.
### Expressive or Dramatic Speech:
- Try lower cfg values (e.g. ~0.3) and increase exaggeration to around 0.7 or higher.
- Higher exaggeration tends to speed up speech; reducing cfg helps compensate with slower, more deliberate pacing.
### Language Transfer Notes:
- Ensure that the reference clip matches the specified language tag. Otherwise, language transfer outputs may inherit the accent of the reference clip's language.
- To mitigate this, set the CFG weight to 0.
### Additional Tips:
- For best voice cloning results, use clear audio with minimal background noise
- The reference audio should be 3-10 seconds long
- Use the same seed value for reproducible results
""")
    return app
if __name__ == "__main__":
    separator = "=" * 60

    # Step 1: make sure the checkpoint files are on disk.
    print("\nStarting ChatterboxTTS Dhivehi Application")
    print(separator)
    download_model()

    # Step 2: try to load the default checkpoint on the GPU. A failure is
    # only reported, so the UI still starts and the model can be reloaded
    # from the interface.
    print("\nLoading default model...")
    print(separator)
    try:
        load_model(checkpoint="kn_cbox", device="cuda")
    except Exception as e:
        print(f"Warning: Could not load default model: {e}")
        print("You can manually load the model using the 'Reload Model' button in the interface.")
    else:
        print("Default model loaded successfully!")
    print(separator)

    # Step 3: build the Gradio app.
    print("\nCreating Gradio interface...")
    app = create_interface()

    # Step 4: launch with Gradio's default launch settings.
    print("Launching application...")
    print(separator)
    app.launch()