Update README.md
README.md (CHANGED)

@@ -63,6 +63,53 @@ This model does not perform watermarking for two reasons:

This model is provided primarily for the purpose of scientific comparisons on public benchmarks.
In particular, please check our pipeline for running TTS model evaluations on a number of benchmarks: [tts_longeval](https://github.com/kyutai-labs/tts_longeval).
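If you want to reproduce those comparisons, a minimal starting point is simply fetching that pipeline and following its own documentation; the sketch below assumes nothing beyond the clone URL linked above.

```bash
# Hypothetical starting point: fetch the evaluation pipeline and read its instructions.
git clone https://github.com/kyutai-labs/tts_longeval.git
cd tts_longeval
# The exact benchmark commands are documented in that repository's README.
```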
Here is an example. First, install `moshi`, for instance with:
```bash
pip install -U "git+https://git@github.com/kyutai-labs/moshi.git#egg=moshi&subdirectory=moshi"
```

```python
import torch
from moshi.models.loaders import CheckpointInfo
from moshi.models.tts import DEFAULT_DSM_TTS_VOICE_REPO, TTSModel

text = "Hey there! How are you? I had the craziest day today."
voice = "expresso/ex03-ex01_happy_001_channel1_334s.wav"

checkpoint_info = CheckpointInfo.from_hf_repo('kyutai/tts-0.75b-en-public')
# n_q: number of audio codebooks, temp: sampling temperature, cfg_coef: classifier-free guidance strength.
tts_model = TTSModel.from_checkpoint_info(
    checkpoint_info, n_q=16, temp=0.6, cfg_coef=3, device=torch.device("cuda")
)
entries = tts_model.prepare_script([text], padding_between=1)
# `voice` could also be a local wav file.
voice_path = tts_model.get_voice_path(voice)
prefix = tts_model.get_prefix(voice_path)

print("Generating audio...")
pcms = []
def _on_frame(frame):
    # Decode each generated frame to PCM as soon as all of its codebooks are valid.
    print("Step", len(pcms), end="\r")
    if (frame[:, 1:] != -1).all():
        pcm = tts_model.mimi.decode(frame[:, 1:, :]).cpu()
        pcms.append(pcm.clip(-1, 1))

# You could also generate multiple audios at once by extending the following lists.
all_entries = [entries]
prefixes = [prefix]
with tts_model.mimi.streaming(len(all_entries)):
    result = tts_model.generate(all_entries, [], on_frame=_on_frame, prefixes=prefixes)

print("Done generating.")
audios = torch.cat(pcms, dim=-1)

for audio, prefix in zip(audios, prefixes):
    # We need to skip the audio prefix.
    skip = int((tts_model.mimi.sample_rate * prefix.shape[-1]) / tts_model.mimi.frame_rate)
    audio = audio[..., skip:]
    # Now do something with this audio!
```
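The example above ends with a placeholder comment; a natural follow-up is writing each generated clip to disk. The sketch below is not part of the model card: it assumes the `soundfile` package (`pip install soundfile`) and mono output from Mimi, and it reuses `audios`, `prefixes`, and `tts_model` from the snippet above.

```python
# Hypothetical follow-up (not from the model card): save each trimmed clip as a WAV file.
import soundfile as sf

sample_rate = int(tts_model.mimi.sample_rate)
for i, (audio, prefix) in enumerate(zip(audios, prefixes)):
    # Trim the voice-prefix samples, as in the example above.
    skip = int((sample_rate * prefix.shape[-1]) / tts_model.mimi.frame_rate)
    audio = audio[..., skip:]
    # `audio` is (channels, samples); assuming mono, write the single channel.
    sf.write(f"tts_output_{i}.wav", audio.squeeze(0).numpy(), sample_rate)
```

Any other writer (for instance `torchaudio.save`) would work just as well; the only model-specific details are the sample rate and the prefix trimming shown above.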
## Model Card Authors
Neil Zeghidour, Eugene Kharitonov, Manu Orsini, Václav Volhejn, Gabriel de Marmiesse, Edouard Grave, Patrick Perez, Laurent Mazaré, Alexandre Défossez