alakxender committed
Commit d735744 · 1 Parent(s): 723c802
Files changed (5)
  1. README.md +1 -1
  2. app.py +449 -4
  3. cbox_test.py +79 -0
  4. chatterbox_dhivehi.py +210 -0
  5. requirements.txt +1 -0
README.md CHANGED
@@ -1,5 +1,5 @@
  ---
- title: Chatterbox Tts Dhivehi
+ title: Chatterbox TTS Dhivehi
  emoji: 📉
  colorFrom: red
  colorTo: blue
app.py CHANGED
@@ -1,7 +1,452 @@
+ from pathlib import Path
+
+ # Cache the Dhivehi checkpoints under the user's home directory.
+ # (Defined outside the try block: it is needed below even if the download fails.)
+ _target = Path.home() / ".chatterbox-tts-dhivehi"
+ try:
+     from huggingface_hub import snapshot_download
+     if not (_target.exists() and any(_target.rglob("*"))):
+         snapshot_download(
+             repo_id="alakxender/chatterbox-tts-dhivehi",
+             local_dir=str(_target),
+             local_dir_use_symlinks=False,
+             resume_download=True,
+         )
+ except Exception as _e:
+     # Non-fatal: model loading below will surface a clearer error if files are missing.
+     pass
+
+ from chatterbox.tts import ChatterboxTTS
+ import torchaudio
+ import torch
+ import random
+ import re
+ import numpy as np
  import gradio as gr
-
- def greet(name):
-     return "Hello " + name + "!!"
-
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
- demo.launch()
+ import tempfile
+ import chatterbox_dhivehi
+ import warnings
+
+ warnings.filterwarnings("ignore")
+
+ chatterbox_dhivehi.extend_dhivehi()
+
+ class TTSApp:
+     def __init__(self, checkpoint=f"{_target}/kn_cbox"):
+         self.checkpoint = checkpoint
+         self.model = None
+         self.load_model()
+
+     def load_model(self):
+         """Load the TTS model."""
+         try:
+             print(f"Loading model with checkpoint: {self.checkpoint}")
+             self.model = ChatterboxTTS.from_dhivehi(
+                 ckpt_dir=Path(self.checkpoint),
+                 device="cuda" if torch.cuda.is_available() else "cpu"
+             )
+             print("Model loaded successfully!")
+         except Exception as e:
+             print(f"Error loading model: {e}")
+             raise
+
+     def set_seed(self, seed: int):
+         """Set random seeds for reproducibility."""
+         torch.manual_seed(seed)
+         if torch.cuda.is_available():
+             torch.cuda.manual_seed(seed)
+             torch.cuda.manual_seed_all(seed)
+         random.seed(seed)
+         np.random.seed(seed)
+
+     def generate_speech(self, text, reference_audio, exaggeration=0.5,
+                         temperature=0.1, cfg_weight=0.5, seed=42):
+         """Generate speech from text, optionally cloning a reference voice."""
+         # Clean the input text
+         text = self.clean_text(text)
+
+         if not text:
+             return None, "Please enter some text to generate speech."
+
+         if self.model is None:
+             return None, "Model not loaded. Please check your model paths."
+
+         try:
+             # Set seed for reproducibility
+             self.set_seed(seed)
+
+             # Reference audio is optional
+             audio_prompt_path = reference_audio
+
+             print(f"Generating audio for: {text[:50]}...")
+             if audio_prompt_path:
+                 print(f"Using reference audio: {audio_prompt_path}")
+             else:
+                 print("Generating without reference audio")
+
+             # Generate audio, handling the optional reference audio
+             if audio_prompt_path:
+                 audio = self.model.generate(
+                     text=text,
+                     audio_prompt_path=audio_prompt_path,
+                     exaggeration=exaggeration,
+                     temperature=temperature,
+                     cfg_weight=cfg_weight,
+                 )
+             else:
+                 # Try without reference audio first
+                 try:
+                     audio = self.model.generate(
+                         text=text,
+                         exaggeration=exaggeration,
+                         temperature=temperature,
+                         cfg_weight=cfg_weight,
+                     )
+                 except TypeError:
+                     # If the model requires audio_prompt_path, retry with an empty string
+                     audio = self.model.generate(
+                         text=text,
+                         audio_prompt_path="",
+                         exaggeration=exaggeration,
+                         temperature=temperature,
+                         cfg_weight=cfg_weight,
+                     )
+
+             # Save to a temporary file
+             with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
+                 output_path = tmp_file.name
+             torchaudio.save(output_path, audio, 24000)
+
+             return output_path, f"Successfully generated speech! Audio length: {audio.shape[1]/24000:.2f} seconds"
+
+         except Exception as e:
+             error_msg = f"Error generating speech: {e}"
+             print(error_msg)
+             return None, error_msg
+
+     def clean_text(self, text):
+         """Collapse repeated whitespace and strip leading/trailing whitespace."""
+         # Replace any run of whitespace (spaces, tabs, newlines) with a single space
+         text = re.sub(r'\s+', ' ', text)
+         return text.strip()
+
+     def split_sentences(self, text):
+         """Split text on periods, then merge consecutive sentences into chunks of
+         at least 150 characters (the final chunk may be shorter)."""
+         text = self.clean_text(text)
+
+         # Split on periods, keeping each period with its sentence
+         initial_sentences = []
+         current_sentence = ""
+         for char in text:
+             current_sentence += char
+             if char == '.':
+                 stripped_sentence = current_sentence.strip()
+                 if stripped_sentence:
+                     initial_sentences.append(stripped_sentence)
+                 current_sentence = ""
+
+         # Keep any trailing text that lacks a final period
+         stripped_remaining = current_sentence.strip()
+         if stripped_remaining:
+             initial_sentences.append(stripped_remaining)
+
+         # With zero or one sentence there is nothing to merge
+         if len(initial_sentences) <= 1:
+             return initial_sentences
+
+         # Merge sentences until each chunk is at least 150 characters
+         final_sentences = []
+         combined_sentence = ""
+         for sentence in initial_sentences:
+             if combined_sentence:
+                 combined_sentence += " " + sentence
+             else:
+                 combined_sentence = sentence
+             if len(combined_sentence) >= 150:
+                 final_sentences.append(combined_sentence.strip())
+                 combined_sentence = ""
+
+         # Keep any remainder, even if it is shorter than 150 characters
+         if combined_sentence.strip():
+             final_sentences.append(combined_sentence.strip())
+
+         return final_sentences
+
+     def generate_speech_multi_sentence(self, text, reference_audio, exaggeration=0.5,
+                                        temperature=0.1, cfg_weight=0.5, seed=42):
+         """Generate speech chunk by chunk, yielding progress updates."""
+         text = self.clean_text(text)
+
+         if not text:
+             yield None, "Please enter some text to generate speech."
+             return
+
+         if self.model is None:
+             yield None, "Model not loaded. Please check your model paths."
+             return
+
+         # Split text into sentence chunks
+         sentences = self.split_sentences(text)
+
+         # With one chunk or no periods, fall back to the single-shot method
+         if len(sentences) <= 1:
+             yield None, "🎵 Generating single sentence..."
+             result_audio, result_status = self.generate_speech(
+                 text, reference_audio, exaggeration, temperature, cfg_weight, seed
+             )
+             yield result_audio, result_status
+             return
+
+         try:
+             # Set seed for reproducibility
+             self.set_seed(seed)
+
+             # Reference audio is optional
+             audio_prompt_path = reference_audio
+
+             yield None, f"🚀 Starting generation for {len(sentences)} sentences..."
+             print(f"Processing {len(sentences)} sentences...")
+
+             all_audio_segments = []
+             total_duration = 0
+
+             for i, sentence in enumerate(sentences):
+                 # Reserve the last 10% of the progress range for combining
+                 progress_percent = int((i / len(sentences)) * 90)
+                 yield None, f"🎵 Generating sentence {i+1}/{len(sentences)} ({progress_percent}%): {sentence[:50]}..."
+                 print(f"Generating audio for sentence {i+1}/{len(sentences)}: {sentence[:50]}...")
+
+                 # Generate audio for this sentence
+                 try:
+                     if audio_prompt_path:
+                         audio = self.model.generate(
+                             text=sentence,
+                             audio_prompt_path=audio_prompt_path,
+                             exaggeration=exaggeration,
+                             temperature=temperature,
+                             cfg_weight=cfg_weight,
+                         )
+                     else:
+                         # Try without reference audio first
+                         try:
+                             audio = self.model.generate(
+                                 text=sentence,
+                                 exaggeration=exaggeration,
+                                 temperature=temperature,
+                                 cfg_weight=cfg_weight,
+                             )
+                         except TypeError:
+                             # If the model requires audio_prompt_path, retry with an empty string
+                             audio = self.model.generate(
+                                 text=sentence,
+                                 audio_prompt_path="",
+                                 exaggeration=exaggeration,
+                                 temperature=temperature,
+                                 cfg_weight=cfg_weight,
+                             )
+                 except Exception as model_error:
+                     # Some checkpoints fail outright without reference audio; retry defensively
+                     if ("reference_voice.wav not found" in str(model_error)
+                             or "No reference audio provided" in str(model_error)):
+                         print("Attempting generation without reference audio...")
+                         try:
+                             # Some models accept an empty string
+                             audio = self.model.generate(
+                                 text=sentence,
+                                 audio_prompt_path="",
+                                 exaggeration=exaggeration,
+                                 temperature=temperature,
+                                 cfg_weight=cfg_weight,
+                             )
+                         except Exception:
+                             # Otherwise drop the audio_prompt_path parameter entirely
+                             audio = self.model.generate(
+                                 text=sentence,
+                                 exaggeration=exaggeration,
+                                 temperature=temperature,
+                                 cfg_weight=cfg_weight,
+                             )
+                     else:
+                         raise
+
+                 all_audio_segments.append(audio)
+                 total_duration += audio.shape[1] / 24000
+
+             # Concatenate all audio segments
+             yield None, "🔧 Combining audio segments (95%)..."
+             print("Combining audio segments...")
+             combined_audio = torch.cat(all_audio_segments, dim=1)
+
+             # Save to a temporary file
+             with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
+                 output_path = tmp_file.name
+             torchaudio.save(output_path, combined_audio, 24000)
+             print("Multi-sentence processing complete!")
+
+             yield output_path, f"✅ Successfully generated speech from {len(sentences)} sentences! Total audio length: {total_duration:.2f} seconds"
+
+         except Exception as e:
+             error_msg = f"❌ Error generating multi-sentence speech: {e}"
+             print(error_msg)
+             yield None, error_msg
+
+ def get_cbox_dv():
+     """Create the Gradio interface."""
+     # Initialize the TTS app
+     tts_app = TTSApp()
+
+     # Sample texts in Dhivehi
+     sample_texts = [
+         "ކާޑު ނުލައި ފައިސާ ދެއްކޭ ނެޝަނަލް ކިއުއާރް ކޯޑް އެމްއެމްއޭ އިން ތައާރަފްކުރަނީ",
+         """ފުޓްބޯޅަ ސްކޫލްގެ ބިމާއި ގުދަންބަރި ބިމުގައި އިމާރާތް ކުރުމުގެ މަސައްކަތް ހުއްޓާލަން އަންގައިފި...
+ Construction work on football school land and warehouse land has been ordered to stop""",
+         "ސިވިލް ސާވިސްގެ ހިދުމަތުގެ މުއްދަތު ގުނުމުގައި ކުންފުނިތަކާއި އިދާރާތަކަށް ހިދުމަތްކުރި މުއްދަތު ހިމަނަނީ",
+         """އެ ރަށުގެ ބިން ހިއްކުމާއި ބަނދަރުގެ ނެރު ބަދަލުކުރުމާއި ގޮނޑުދޮށް ހިމާޔަތް ކުރުމުގެ މަސައްކަތް އެމްޓީސީސީއާ މިނިސްޓްރީން ހަވާލުކުރީ މިދިޔަ މަހު ރައީސް އެ ރަށަށް ކުރެއްވި ދަތުރުފުޅުގައި.
+ The ministry handed over the land reclamation, replacement of the port canal and beach protection to MTCC during the President's visit to the village last month"""
+     ]
+
+     # Build the UI inside a Blocks context so `app` is defined for the return below
+     with gr.Blocks() as app:
+         with gr.Tab("🎤 ChatterboxTTS"):
+             gr.Markdown("# 🎤 ChatterboxTTS - Dhivehi Text-to-Speech with Voice Cloning")
+             gr.Markdown("Generate natural-sounding Dhivehi speech with voice cloning capabilities.")
+
+             # Row 1: Text input and reference audio
+             with gr.Row():
+                 text_input = gr.Textbox(
+                     label="Text to Convert",
+                     placeholder="Enter Dhivehi text here...",
+                     lines=6,
+                     value=sample_texts[0],
+                     rtl=True,
+                     elem_classes=["textbox1"]
+                 )
+                 reference_audio = gr.Audio(
+                     label="Reference Voice Audio (optional - for voice cloning)",
+                     type="filepath",
+                     sources=["upload", "microphone"],
+                 )
+
+             # Row 2: Example buttons
+             gr.Markdown("**Quick Examples:**")
+             with gr.Row():
+                 sample_btn1 = gr.Button("Sample 1", size="sm")
+                 sample_btn2 = gr.Button("Sample 2", size="sm")
+                 sample_btn3 = gr.Button("Sample 3", size="sm")
+                 sample_btn4 = gr.Button("Sample 4", size="sm")
+
+             # Row 3: Advanced settings
+             with gr.Accordion("Advanced Settings", open=False):
+                 with gr.Row():
+                     exaggeration = gr.Slider(
+                         minimum=0.0, maximum=2.0, value=0.5, step=0.1,
+                         label="Exaggeration", info="Controls expressiveness"
+                     )
+                     temperature = gr.Slider(
+                         minimum=0.01, maximum=1.0, value=0.35, step=0.01,
+                         label="Temperature", info="Controls randomness"
+                     )
+                     cfg_weight = gr.Slider(
+                         minimum=0.0, maximum=2.0, value=0.3, step=0.1,
+                         label="CFG Weight", info="Classifier-free guidance weight"
+                     )
+                     seed = gr.Slider(
+                         minimum=0, maximum=9999, value=42, step=1,
+                         label="Seed", info="For reproducible results"
+                     )
+
+             # Row 4: Generate button
+             generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
+
+             # Row 5: Output section
+             with gr.Row():
+                 with gr.Column():
+                     output_audio = gr.Audio(label="Generated Speech", type="filepath")
+                     status_message = gr.Textbox(label="Status", interactive=False)
+
+             # Event handlers
+             def set_sample_text(sample_idx):
+                 return sample_texts[sample_idx]
+
+             sample_btn1.click(lambda: set_sample_text(0), outputs=[text_input])
+             sample_btn2.click(lambda: set_sample_text(1), outputs=[text_input])
+             sample_btn3.click(lambda: set_sample_text(2), outputs=[text_input])
+             sample_btn4.click(lambda: set_sample_text(3), outputs=[text_input])
+
+             def generate_with_progress(text, reference_audio, exaggeration, temperature, cfg_weight, seed):
+                 """Stream progress updates from the multi-sentence generator."""
+                 for result_audio, result_status in tts_app.generate_speech_multi_sentence(
+                     text, reference_audio, exaggeration, temperature, cfg_weight, seed
+                 ):
+                     yield result_audio, result_status
+
+             generate_btn.click(
+                 fn=generate_with_progress,
+                 inputs=[text_input, reference_audio, exaggeration, temperature, cfg_weight, seed],
+                 outputs=[output_audio, status_message]
+             )
+
+             # Instructions
+             with gr.Accordion("Tips", open=False):
+                 gr.Markdown("""
+ ### General Use (TTS and Voice Agents):
+ - The default settings (exaggeration=0.5, cfg=0.5) work well for most prompts.
+ - If the reference speaker has a fast speaking style, lowering cfg to around 0.3 can improve pacing.
+
+ ### Expressive or Dramatic Speech:
+ - Try lower cfg values (e.g. ~0.3) and increase exaggeration to around 0.7 or higher.
+ - Higher exaggeration tends to speed up speech; reducing cfg helps compensate with slower, more deliberate pacing.
+
+ ### Language Transfer Notes:
+ - Ensure that the reference clip matches the specified language tag. Otherwise, language-transfer outputs may inherit the accent of the reference clip's language.
+ - To mitigate this, set the CFG weight to 0.
+
+ ### Additional Tips:
+ - For best voice-cloning results, use clear audio with minimal background noise.
+ - The reference audio should be 3-10 seconds long.
+ - Use the same seed value for reproducible results.
+                 """)
+
+     return app
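
The chunking rule implemented by the new `split_sentences` method (split on periods, then merge consecutive sentences until each chunk reaches 150 characters) is easy to check in isolation. A minimal standalone sketch of the same behavior, using a hypothetical `chunk_sentences` helper and no model:

```python
# Sketch of the merge-to-150-characters rule used by TTSApp.split_sentences.
def chunk_sentences(text: str, min_len: int = 150) -> list[str]:
    sentences = [s.strip() + "." for s in text.split(".") if s.strip()]
    chunks, buf = [], ""
    for s in sentences:
        buf = f"{buf} {s}" if buf else s
        if len(buf) >= min_len:
            chunks.append(buf)
            buf = ""
    if buf:
        chunks.append(buf)  # the final chunk may be shorter than min_len
    return chunks

print([len(c) for c in chunk_sentences("Short one. " * 20)])  # [153, 65]
```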
cbox_test.py ADDED
@@ -0,0 +1,79 @@
+ from pathlib import Path
+
+ # Cache the Dhivehi checkpoints under the user's home directory.
+ # (Defined outside the try block: CKPT_DIR below needs it even if the download fails.)
+ _target = Path.home() / ".chatterbox-tts-dhivehi"
+ try:
+     from huggingface_hub import snapshot_download
+     if not (_target.exists() and any(_target.rglob("*"))):
+         snapshot_download(
+             repo_id="alakxender/chatterbox-tts-dhivehi",
+             local_dir=str(_target),
+             local_dir_use_symlinks=False,
+             resume_download=True,
+         )
+ except Exception as _e:
+     pass
+
+ from chatterbox.tts import ChatterboxTTS
+ import chatterbox_dhivehi
+ import torchaudio
+ import torch
+ import numpy as np
+ import random
+
+ # ---- User settings (edit these) ----
+ CKPT_DIR = f"{_target}/kn_cbox"  # path to your finetuned checkpoint dir
+ REF_WAV = f"{_target}/samples/reference_audio.wav"  # optional 3–10s clean reference; "" to disable
+ # REF_WAV = ""
+ TEXT = "މި ރިޕޯޓާ ގުޅޭ ގޮތުން އެނިމަލް ވެލްފެއާ މިނިސްޓްރީން އަދި ވާހަކައެއް ނުދައްކާ"  # sample Dhivehi text
+ TEXT = f"{TEXT}, The Animal Welfare Ministry has not yet commented on the report"
+ EXAGGERATION = 0.4
+ TEMPERATURE = 0.3
+ CFG_WEIGHT = 0.7
+ SEED = 42
+ SAMPLE_RATE = 24000
+ OUT_PATH = "out.wav"
+ # ------------------------------------
+
+ # Extend Dhivehi support from the local module
+ chatterbox_dhivehi.extend_dhivehi()
+
+ # Seed for reproducibility
+ torch.manual_seed(SEED)
+ if torch.cuda.is_available():
+     torch.cuda.manual_seed(SEED)
+     torch.cuda.manual_seed_all(SEED)
+ random.seed(SEED)
+ np.random.seed(SEED)
+
+ # Load model
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ print(f"Loading ChatterboxTTS from: {CKPT_DIR} on {device}")
+ model = ChatterboxTTS.from_dhivehi(ckpt_dir=Path(CKPT_DIR), device=device)
+ print("Model loaded.")
+
+ # Generate (reference audio optional)
+ print(f"Generating audio... ref={'yes' if REF_WAV else 'no'}")
+ gen_kwargs = dict(
+     text=TEXT,
+     exaggeration=EXAGGERATION,
+     temperature=TEMPERATURE,
+     cfg_weight=CFG_WEIGHT,
+ )
+
+ try:
+     if REF_WAV:
+         gen_kwargs["audio_prompt_path"] = REF_WAV
+         audio = model.generate(**gen_kwargs)
+     else:
+         # Try without a reference first; if the backend requires audio_prompt_path, fall back to ""
+         try:
+             audio = model.generate(**gen_kwargs)
+         except TypeError:
+             gen_kwargs["audio_prompt_path"] = ""
+             audio = model.generate(**gen_kwargs)
+ except Exception as e:
+     raise RuntimeError(f"Generation failed: {e}") from e
+
+ # Save
+ torchaudio.save(OUT_PATH, audio, SAMPLE_RATE)
+ dur = audio.shape[1] / SAMPLE_RATE
+ print(f"Saved {OUT_PATH} ({dur:.2f}s)")
chatterbox_dhivehi.py ADDED
@@ -0,0 +1,210 @@
+ # chatterbox_dhivehi.py
+ """
+ Dhivehi extension for ChatterboxTTS.
+
+ Requires: chatterbox-tts 0.1.4 (not tested on any other version)
+
+ Adds:
+ - load_t3_with_vocab(state_dict, device, force_vocab_size): load T3 with a specific
+   vocab size, resizing both the embedding and the projection head, and padding
+   checkpoint weights if needed.
+ - from_dhivehi(...): classmethod for building a ChatterboxTTS from a checkpoint
+   directory, using load_t3_with_vocab under the hood (defaults to vocab=1199).
+ - extend_dhivehi(): attach the above to ChatterboxTTS (idempotent).
+
+ Usage in app.py:
+     import chatterbox_dhivehi
+     chatterbox_dhivehi.extend_dhivehi()
+
+     self.model = ChatterboxTTS.from_dhivehi(
+         ckpt_dir=Path(self.checkpoint),
+         device="cuda" if torch.cuda.is_available() else "cpu",
+         force_vocab_size=2000,
+     )
+ """
+
+ from __future__ import annotations
+ import logging
+ from pathlib import Path
+ from typing import Optional, Union
+
+ import torch
+ import torch.nn as nn
+ from safetensors.torch import load_file
+
+ # Core chatterbox imports
+ from chatterbox.tts import ChatterboxTTS, Conditionals
+ from chatterbox.models.t3 import T3
+ from chatterbox.models.s3gen import S3Gen
+ from chatterbox.models.tokenizers import EnTokenizer
+ from chatterbox.models.voice_encoder import VoiceEncoder
+
+
+ # Helpers
+
+ def _expand_or_trim_rows(t: torch.Tensor, new_rows: int, init_std: float = 0.02) -> torch.Tensor:
+     """
+     Return a tensor with its first dimension resized to `new_rows`.
+     If expanding, newly added rows are randomly initialized N(0, init_std).
+     """
+     old_rows = t.shape[0]
+     if new_rows == old_rows:
+         return t.clone()
+     if new_rows < old_rows:
+         return t[:new_rows].clone()
+     # expand
+     out = t.new_empty((new_rows,) + t.shape[1:])
+     out[:old_rows] = t
+     out[old_rows:].normal_(mean=0.0, std=init_std)
+     return out
+
+
+ def _prepare_resized_state_dict(sd: dict, new_vocab: int, init_std: float = 0.02) -> dict:
+     """
+     Create a modified copy of `sd` where text_emb/text_head weights (and bias) match `new_vocab`.
+     """
+     sd = sd.copy()
+
+     # text embedding: [vocab, dim]
+     if "text_emb.weight" in sd:
+         sd["text_emb.weight"] = _expand_or_trim_rows(sd["text_emb.weight"], new_vocab, init_std)
+
+     # text projection head: Linear(out=vocab, in=dim)
+     if "text_head.weight" in sd:
+         sd["text_head.weight"] = _expand_or_trim_rows(sd["text_head.weight"], new_vocab, init_std)
+     if "text_head.bias" in sd:
+         bias = sd["text_head.bias"]
+         if bias.ndim == 1:
+             sd["text_head.bias"] = _expand_or_trim_rows(bias.unsqueeze(1), new_vocab, init_std).squeeze(1)
+
+     return sd
+
+
+ def _resize_model_vocab_layers(model: T3, new_vocab: int, dim: Optional[int] = None) -> None:
+     """
+     Rebuild model.text_emb and model.text_head to match `new_vocab`.
+     The embedding dim is inferred from existing layers if not provided.
+     """
+     if dim is None:
+         if hasattr(model, "text_emb") and isinstance(model.text_emb, nn.Embedding):
+             dim = model.text_emb.embedding_dim
+         elif hasattr(model, "text_head") and isinstance(model.text_head, nn.Linear):
+             dim = model.text_head.in_features
+         else:
+             raise RuntimeError("Cannot infer text embedding dimension from T3 model.")
+     model.text_emb = nn.Embedding(new_vocab, dim)
+     model.text_head = nn.Linear(dim, new_vocab, bias=True)
+
+
+ # Public API
+
+ def load_t3_with_vocab(
+     t3_state_dict: dict,
+     device: str = "cpu",
+     *,
+     force_vocab_size: Optional[int] = None,
+     init_std: float = 0.02,
+ ) -> T3:
+     """
+     Load a T3 model with a specified vocabulary size.
+
+     - Removes a leading "t3." prefix on state_dict keys if present.
+     - Resizes BOTH `text_emb` and `text_head` to `force_vocab_size` (or to the
+       checkpoint vocab if not forced).
+     - Pads checkpoint weights when the target vocab is larger than the checkpoint's.
+
+     Args:
+         t3_state_dict: state dict loaded from t3_cfg.safetensors (or similar).
+         device: "cpu", "cuda", or "mps".
+         force_vocab_size: desired vocab size (e.g., 2000 for Dhivehi-extended models).
+         init_std: std for random init of padded rows.
+
+     Returns:
+         T3: model moved to `device` and set to eval().
+     """
+     logger = logging.getLogger(__name__)
+
+     # Strip "t3." prefix if present
+     if any(k.startswith("t3.") for k in t3_state_dict.keys()):
+         t3_state_dict = {k[len("t3."):]: v for k, v in t3_state_dict.items()}
+
+     # Derive the checkpoint vocab if available
+     ckpt_vocab_size = None
+     if "text_emb.weight" in t3_state_dict and t3_state_dict["text_emb.weight"].ndim == 2:
+         ckpt_vocab_size = int(t3_state_dict["text_emb.weight"].shape[0])
+     elif "text_head.weight" in t3_state_dict and t3_state_dict["text_head.weight"].ndim == 2:
+         ckpt_vocab_size = int(t3_state_dict["text_head.weight"].shape[0])
+
+     target_vocab = int(force_vocab_size) if force_vocab_size is not None else ckpt_vocab_size
+     if target_vocab is None:
+         raise RuntimeError("Could not determine vocab size. Provide force_vocab_size.")
+
+     logger.info(f"Loading T3 with vocab={target_vocab} (ckpt_vocab={ckpt_vocab_size})")
+
+     # Build a base model and resize its layers to accept the incoming state dict
+     t3 = T3()
+     _resize_model_vocab_layers(t3, target_vocab)
+
+     # Patch the checkpoint tensors to the target vocab
+     patched_sd = _prepare_resized_state_dict(t3_state_dict, target_vocab, init_std)
+
+     # Load (strict=False to tolerate benign extra/missing keys)
+     t3.load_state_dict(patched_sd, strict=False)
+     return t3.to(device).eval()
+
+
+ def from_dhivehi(
+     cls,
+     *,
+     ckpt_dir: Union[str, Path],
+     device: str = "cpu",
+     force_vocab_size: int = 1199,
+ ):
+     """
+     Construct a Dhivehi-extended ChatterboxTTS from a checkpoint directory.
+
+     Expected files in `ckpt_dir`:
+     - ve.safetensors
+     - t3_cfg.safetensors
+     - s3gen.safetensors
+     - tokenizer.json
+     - conds.pt (optional)
+     """
+     ckpt_dir = Path(ckpt_dir)
+
+     # Voice encoder
+     ve = VoiceEncoder()
+     ve.load_state_dict(load_file(ckpt_dir / "ve.safetensors"))
+     ve.to(device).eval()
+
+     # T3 with Dhivehi vocab extension
+     t3_state = load_file(ckpt_dir / "t3_cfg.safetensors")
+     t3 = load_t3_with_vocab(t3_state, device=device, force_vocab_size=force_vocab_size)
+
+     # S3Gen
+     s3gen = S3Gen()
+     s3gen.load_state_dict(load_file(ckpt_dir / "s3gen.safetensors"), strict=False)
+     s3gen.to(device).eval()
+
+     # Tokenizer
+     tokenizer = EnTokenizer(str(ckpt_dir / "tokenizer.json"))
+
+     # Optional conditionals
+     conds = None
+     conds_path = ckpt_dir / "conds.pt"
+     if conds_path.exists():
+         # Always safe-load to CPU first; move to `device` afterwards
+         conds = Conditionals.load(conds_path, map_location="cpu").to(device)
+
+     return cls(t3, s3gen, ve, tokenizer, device, conds=conds)
+
+
+ def extend_dhivehi():
+     """
+     Attach Dhivehi-specific helpers to ChatterboxTTS (idempotent).
+     - ChatterboxTTS.load_t3_with_vocab (staticmethod)
+     - ChatterboxTTS.from_dhivehi (classmethod)
+     """
+     if getattr(ChatterboxTTS, "_dhivehi_extended", False):
+         return
+     ChatterboxTTS.load_t3_with_vocab = staticmethod(load_t3_with_vocab)
+     ChatterboxTTS.from_dhivehi = classmethod(from_dhivehi)
+     ChatterboxTTS._dhivehi_extended = True
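
The row-resizing trick at the heart of `load_t3_with_vocab` is worth seeing on toy tensors. A minimal sketch of the same padding behavior as `_expand_or_trim_rows`, growing a 4-row "embedding" to 6 rows while preserving the original weights:

```python
import torch

emb = torch.randn(4, 8)                    # pretend [vocab=4, dim=8] checkpoint weight
out = emb.new_empty((6,) + emb.shape[1:])  # target vocab = 6
out[:4] = emb                              # original rows preserved
out[4:].normal_(mean=0.0, std=0.02)        # new rows ~ N(0, 0.02), the module's init_std

assert torch.equal(out[:4], emb)
print(out.shape)  # torch.Size([6, 8])
```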
requirements.txt ADDED
@@ -0,0 +1 @@
+ chatterbox-tts==0.1.4