|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | import os | 
					
						
						|  | import sys | 
					
						
						|  | ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) | 
					
						
						|  | sys.path.append('{}/third_party/Matcha-TTS'.format(ROOT_DIR)) | 
					
						
						|  |  | 
					
						
						|  | import argparse | 
					
						
						|  | import gradio as gr | 
					
						
						|  | import numpy as np | 
					
						
						|  | import torch | 
					
						
						|  | torch.set_num_threads(1) | 
					
						
						|  | import torchaudio | 
					
						
						|  | import random | 
					
						
						|  | import librosa | 
					
						
						|  | from transformers import pipeline | 
					
						
						|  | import subprocess | 
					
						
						|  | from scipy.signal import resample | 
					
						
						|  |  | 
					
						
						|  | import logging | 
					
						
						|  | logging.getLogger('matplotlib').setLevel(logging.WARNING) | 
					
						
						|  |  | 
					
						
						|  | from cosyvoice.cli.cosyvoice import CosyVoice | 
					
						
						|  | from cosyvoice.utils.file_utils import load_wav, speed_change | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						def generate_seed():
						    """Draw a new random seed and return it as an update dictionary.

						    The surrounding text was table-extraction residue (``|  | ... |`` cell
						    markers, stripped indentation); this is the same logic restored to
						    valid Python.

						    Returns:
						        dict: ``{"__type__": "update", "value": seed}`` where ``seed`` is
						        an ``int`` from ``random.randint(1, 100000000)`` (both ends
						        inclusive).
						    """
						    seed = random.randint(1, 100000000)
						    return {
						        "__type__": "update",
						        "value": seed,
						    }
					
						
						|  |  | 
					
						
						def set_all_random_seed(seed):
						    """Seed the Python, NumPy and PyTorch random number generators.

						    Args:
						        seed: Seed value. It is converted with ``int()`` before use
						            because ``np.random.seed`` raises ``TypeError`` for float
						            input such as ``42.0``.

						    Raises:
						        TypeError, ValueError: If ``seed`` cannot be converted by ``int()``.
						    """
						    seed = int(seed)
						    random.seed(seed)
						    np.random.seed(seed)
						    torch.manual_seed(seed)
						    torch.cuda.manual_seed_all(seed)
					
						
						|  |  | 
					
						
						# Upper bound on the absolute sample value that postprocess() lets through.
						max_val = 0.8


						def postprocess(speech, top_db=60, hop_length=220, win_length=440):
						    """Trim, peak-limit and zero-pad a waveform.

						    Args:
						        speech: Waveform handed to ``librosa.effects.trim``. The later
						            ``torch.concat(..., dim=1)`` with a ``(1, n)`` tensor means it
						            must be 2-D with a leading dimension of 1 after trimming.
						        top_db: Passed to ``librosa.effects.trim`` as ``top_db``.
						        hop_length: Passed to ``librosa.effects.trim`` as ``hop_length``.
						        win_length: Passed to ``librosa.effects.trim`` as ``frame_length``.

						    Returns:
						        The trimmed waveform, rescaled so its absolute peak does not exceed
						        ``max_val``, followed by ``int(target_sr * 0.2)`` zero samples.

						    Note:
						        Reads the module globals ``max_val`` and ``target_sr``;
						        ``target_sr`` is only assigned in the ``__main__`` section.
						    """
						    speech, _ = librosa.effects.trim(
						        speech,
						        top_db=top_db,
						        frame_length=win_length,
						        hop_length=hop_length,
						    )
						    # Compute the peak once instead of re-reducing the tensor for the divide.
						    peak = speech.abs().max()
						    if peak > max_val:
						        speech = speech / peak * max_val
						    # NOTE(review): the pad length is derived from target_sr although the
						    # only visible caller passes audio loaded at prompt_sr - confirm intent.
						    tail = torch.zeros(1, int(target_sr * 0.2))
						    speech = torch.concat([speech, tail], dim=1)
						    return speech
					
						
						|  |  | 
					
						
						def generate_audio(tts_text, prompt_text, prompt_wav_upload, prompt_wav_record, seed, select_which, speed_factor=1.0):
						    """Synthesize ``tts_text`` using the selected prompt recording.

						    Args:
						        tts_text: Text passed to ``cosyvoice.inference_zero_shot`` as the
						            text to synthesize.
						        prompt_text: Text passed alongside the prompt audio.
						        prompt_wav_upload: Prompt audio from the upload widget, or ``None``.
						        prompt_wav_record: Prompt audio from the microphone widget, or
						            ``None``.
						        seed: Value forwarded to ``set_all_random_seed`` before inference.
						        select_which: ``"上傳檔案"`` to use the upload, ``"麥克風"`` to use
						            the recording.
						        speed_factor: Length divisor applied to the result. With the
						            default ``1.0`` the audio is returned untouched; otherwise it
						            is resampled to ``int(len / speed_factor)`` samples.

						    Returns:
						        tuple: ``(target_sr, audio_data)`` where ``audio_data`` is the
						        flattened NumPy form of ``output['tts_speech']``.

						    Raises:
						        ValueError: If ``speed_factor`` is not positive.
						        gr.Error: If the selected source has no audio. The original code
						            passed ``None`` on to ``load_wav`` in that case.
						    """
						    if speed_factor <= 0:
						        raise ValueError("speed_factor must be positive")

						    if select_which == "上傳檔案" and prompt_wav_upload is not None:
						        prompt_wav = prompt_wav_upload
						    elif select_which == "麥克風" and prompt_wav_record is not None:
						        prompt_wav = prompt_wav_record
						    else:
						        # Same wording generate_text() uses for the identical condition.
						        raise gr.Error("No valid input detected.")

						    prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
						    set_all_random_seed(seed)
						    output = cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k)

						    # Flatten before measuring: the original took len() of the raw tensor
						    # (its first dimension, not the sample count) in the speed branch and
						    # passed the tensor itself to resample().
						    audio_data = output['tts_speech'].numpy().flatten()
						    if speed_factor != 1.0:
						        new_length = int(len(audio_data) / speed_factor)
						        # The sample count changes while the returned rate stays
						        # target_sr, so the playback duration changes.
						        audio_data = resample(audio_data, new_length)

						    return (target_sr, audio_data)
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						def generate_text(prompt_wav_upload, prompt_wav_record, select_which):
						    """Transcribe the selected prompt recording with ``asr_pipeline``.

						    Args:
						        prompt_wav_upload: Prompt audio from the upload widget, or ``None``.
						        prompt_wav_record: Prompt audio from the microphone widget, or
						            ``None``.
						        select_which: ``"上傳檔案"`` to use the upload, ``"麥克風"`` to use
						            the recording.

						    Returns:
						        str: The ``'text'`` entry of the ``asr_pipeline`` result, or
						        ``"No valid input detected."`` when the selected source is empty
						        or ``select_which`` matches neither option.
						    """
						    # The original also assigned a local LAST_UPLOADED in every branch; it
						    # was never read and, being local, could not carry state between calls.
						    if select_which == "上傳檔案" and prompt_wav_upload is not None:
						        prompt_wav = prompt_wav_upload
						    elif select_which == "麥克風" and prompt_wav_record is not None:
						        prompt_wav = prompt_wav_record
						    else:
						        prompt_wav = None
						    print(select_which)

						    if prompt_wav:
						        results = asr_pipeline(prompt_wav)
						        return results['text']
						    return "No valid input detected."
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						def demo_get_audio(tts_text, sample_wav='sample.wav'):
						    """Load a fixed audio file instead of synthesizing anything.

						    Args:
						        tts_text: Accepted but not used.
						        sample_wav: Path handed to ``torchaudio.load``. Defaults to
						            ``'sample.wav'``, the value that used to be hard-coded; a
						            relative path is resolved against the current working
						            directory.

						    Returns:
						        tuple: ``(sample_rate, speech)`` exactly as produced by
						        ``torchaudio.load`` (in swapped order).
						    """
						    # NOTE(review): speech is returned as loaded, not converted to NumPy
						    # the way generate_audio() does - confirm what the consumer expects.
						    speech, sample_rate = torchaudio.load(sample_wav)
						    return sample_rate, speech
					
						
						def main():
						    """Build the BreezyVoice Gradio page, wire its events and launch it.

						    Layout: a title, an intro row (flowchart image plus usage notes), then
						    one column with three steps - prompt audio and its transcript, the text
						    to synthesize, and the seed/generate controls with the audio output.

						    Event flow: a change on either audio widget sets the source radio to the
						    matching option; a change on the radio runs ``generate_text`` and writes
						    the result into the transcript box; the generate button runs
						    ``generate_audio``.

						    The nesting below is reconstructed from the statement order because the
						    extracted text had lost its indentation.
						    """
						    # Assembled line by line so the Markdown text does not depend on how
						    # this function is indented.
						    usage_notes = (
						        "#### 此沙盒使用 Huggingface CPU,請預期大於200 秒的推理時間,您可以考慮以下方法加速:\n"
						        "1. **強烈建議**複製這個 Space(Duplicate this space),以分散流量!\n"
						        "2. 複製至本地GPU執行(請參考[指南](https://huggingface.co/docs/hub/en/spaces-overview))或使用[kaggle](https://www.kaggle.com/code/a24998667/breezyvoice-playground)\n"
						        "3. 複製至本地CPU執行(請參考[指南](https://huggingface.co/docs/hub/en/spaces-overview))\n"
						        "\n"
						        "為了加快推理速度,g2pw注音標註並未被啟動。\n"
						        "\n"
						        "免責聲明:此沙盒在一次性容器地端執行,關閉後檔案將遭到刪除。此沙盒不屬於聯發創新基地,聯發創新基地無法獲得任何使用者輸入。"
						    )

						    with gr.Blocks(title="BreezyVoice 語音合成系統", theme="default") as demo:
						        gr.Markdown("# BreezyVoice 語音合成系統")
						        gr.Markdown(
						            """### 僅需5秒語音樣本,就可輸出擬真人聲。"""
						        )
						        with gr.Row():
						            gr.Image(value="https://huggingface.co/spaces/Splend1dchan/BreezyVoice-Playground/resolve/main/flowchart.png", interactive=False, scale=3)
						            gr.Markdown(usage_notes)

						        with gr.Column():
						            # Step 1: prompt audio and its transcript.
						            gr.Markdown("### 步驟 1. 音訊樣本輸入 & 音訊樣本文本輸入")
						            gr.Markdown("選擇prompt音訊檔案或錄製prompt音訊 (5~15秒),並手動校對自動產生的音訊樣本文本。")
						            prompt_wav_upload = gr.Audio(
						                sources='upload',
						                type='filepath',
						                label='選擇prompt音訊檔案(確保取樣率不低於16khz)'
						            )
						            prompt_wav_record = gr.Audio(
						                sources='microphone',
						                type='filepath',
						                label='錄製prompt音訊檔案'
						            )

						            # NOTE(review): nested gr.Blocks() kept as in the original;
						            # confirm whether a plain container was intended.
						            with gr.Blocks():
						                select_which = gr.Radio(["上傳檔案", "麥克風"], label="音訊來源", interactive=True )
						            with gr.Blocks():
						                prompt_text = gr.Textbox(
						                    label="音訊樣本文本輸入(此欄位應與音檔內容完全相同)",
						                    lines=2,
						                    placeholder="音訊樣本文本"
						                )

						            # A change on either audio widget selects the matching source.
						            prompt_wav_upload.change(
						                fn=lambda upload: "上傳檔案",
						                inputs=[prompt_wav_upload],
						                outputs=select_which
						            )
						            prompt_wav_record.change(
						                fn=lambda recording: "麥克風",
						                inputs=[prompt_wav_record],
						                outputs=select_which
						            )

						            # A change of source fills the transcript box.
						            select_which.change(
						                fn=generate_text,
						                inputs=[prompt_wav_upload, prompt_wav_record, select_which],
						                outputs=prompt_text
						            )

						            # Step 2: text to synthesize.
						            gr.Markdown("### 步驟 2.合成文本輸入")
						            tts_text = gr.Textbox(
						                label="輸入想要合成的文本",
						                lines=2,
						                placeholder="請輸入想要合成的文本...",
						                value="你好,歡迎光臨"
						            )

						            # Step 3: seed controls, generate button and output player.
						            gr.Markdown("### 步驟 3. 合成音訊")
						            with gr.Accordion("進階設定", open=False):
						                seed = gr.Number(value=0, label="隨機推理種子")
						                seed_button = gr.Button(value="\U0001F3B2生成隨機推理種子\U0001F3B2")

						            generate_button = gr.Button("生成音訊")
						            audio_output = gr.Audio(label="合成音訊")

						        seed_button.click(fn=generate_seed, inputs=[], outputs=seed)
						        generate_button.click(
						            fn=generate_audio,
						            inputs=[tts_text, prompt_text, prompt_wav_upload, prompt_wav_record, seed, select_which],
						            outputs=audio_output
						        )

						    demo.queue(max_size=10, default_concurrency_limit=1)
						    demo.launch()
					
						
						|  |  | 
					
						
						if __name__ == '__main__':
						    # Everything assigned here is a module global that the functions above
						    # read directly (cosyvoice, asr_pipeline, prompt_sr, target_sr), so they
						    # only work when this file is run as a script.
						    cosyvoice = CosyVoice('MediaTek-Research/BreezyVoice')
						    asr_pipeline = pipeline(
						        "automatic-speech-recognition",
						        model="openai/whisper-tiny",
						        tokenizer="openai/whisper-tiny",
						        # The original hard-coded device=0 (first CUDA device); fall back
						        # to -1 (CPU) so start-up does not depend on a GPU being present.
						        device=0 if torch.cuda.is_available() else -1,
						    )
						    sft_spk = cosyvoice.list_avaliable_spks()
						    # prompt_sr: rate passed to load_wav for the prompt audio;
						    # target_sr: rate returned with the synthesized audio.
						    prompt_sr, target_sr = 16000, 22050
						    default_data = np.zeros(target_sr)
						    main()
					
						
						|  |  |