Uploaded model
- Developed by: stgreenman
- License: apache-2.0
- Fine-tuned from model: unsloth/csm-1b
Usage
# Third-party dependencies: transformers (CSM TTS model), IPython (notebook
# audio playback), soundfile (WAV output), torch (tensor/dtype handling).
from transformers import CsmForConditionalGeneration, AutoProcessor
from IPython.display import Audio, display
import soundfile as sf
import torch
# Load with the correct model class
model = CsmForConditionalGeneration.from_pretrained("stgreenman/sesame-colab-v1")
processor = AutoProcessor.from_pretrained("stgreenman/sesame-colab-v1")
# Move model to CUDA
model = model.to("cuda")  # NOTE(review): assumes a CUDA GPU is present — raises otherwise
# TTS function: formats the speaker-tagged prompt and generates audio
def generate_speech(text, speaker_id=0):
    """Synthesize speech for *text* using the fine-tuned CSM voice.

    Args:
        text: The text to speak.
        speaker_id: Speaker index embedded in the prompt as ``[<id>]``.

    Returns:
        A float32 NumPy array of audio samples (played/saved at 24 kHz
        by the calls below).
    """
    # CSM expects prompts of the form "[speaker_id]text".
    inputs = processor(f"[{speaker_id}]{text}", add_special_tokens=True).to("cuda")
    audio_values = model.generate(
        **inputs,
        max_new_tokens=125,
        depth_decoder_temperature=0.6,
        depth_decoder_top_k=0,
        depth_decoder_top_p=0.9,
        temperature=0.8,
        top_k=50,
        top_p=1.0,
        output_audio=True,
    )
    # Move the generated waveform off the GPU and convert to float32 numpy
    # for saving/playback.
    audio = audio_values[0].to(torch.float32).cpu().numpy()
    return audio
# Test
text = "Hello, this is my fine-tuned voice!"
audio = generate_speech(text)
# Save as a 24 kHz WAV, then play inline in the notebook at the same rate.
sf.write("test_output.wav", audio, 24000)
display(Audio(audio, rate=24000))
This CSM model was trained 2x faster with Unsloth and Hugging Face's TRL library.
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support
