Spaces:
Runtime error
Runtime error
| """ | |
| MediaTek BreezyVoice 真實語音克隆 Space | |
| 基於成功的本地測試實現真正的語音合成功能 | |
| v3.0: 簡化實現避免多進程問題 | |
| """ | |
| import gradio as gr | |
| import spaces | |
| import torch | |
| import torchaudio | |
| import tempfile | |
| import os | |
| import time | |
| import subprocess | |
| import sys | |
| from pathlib import Path | |
# Force single-threaded execution to avoid multi-process conflicts.
# PyTorch's intra-op thread pool is pinned first, then the environment
# variables consulted by OpenMP, MKL, the tokenizers library and the CUDA
# allocator are set in one step.
torch.set_num_threads(1)
os.environ.update(
    {
        'OMP_NUM_THREADS': '1',
        'MKL_NUM_THREADS': '1',
        'TOKENIZERS_PARALLELISM': 'false',
        'PYTORCH_CUDA_ALLOC_CONF': 'expandable_segments:True',
    }
)
# Module-level state shared by the setup and inference functions.
# `setup_completed` flips to True once the model has been loaded;
# `cosyvoice` then holds the loaded model object.
setup_completed = False
cosyvoice = None

# Default reference transcript (about 20 seconds when read aloud).
DEFAULT_REFERENCE_TEXT = (
    "台灣是個美麗的島嶼,擁有豐富的自然景觀和多元的文化特色。從北部的陽明山到南部的墾丁,每個地方都有獨特的魅力。四季分明的氣候讓這裡的生活充滿變化,春天櫻花盛開,夏天海灘戲水,秋天楓葉飄香,冬天溫泉暖身。"
)
def _clone_breezyvoice_repo(repo_path):
    """Shallow-clone the BreezyVoice repository into ``repo_path``.

    If the clone times out, cannot be started, or exits non-zero, whatever was
    written to ``repo_path`` is removed before the error propagates, so that a
    later attempt does not mistake a partial checkout for a finished one.

    Raises:
        subprocess.TimeoutExpired: the clone took longer than 300 seconds.
        OSError: the ``git`` executable could not be started.
        RuntimeError: ``git clone`` exited with a non-zero status.
    """
    import shutil  # local import: only needed on the first-time setup path

    try:
        result = subprocess.run(
            [
                "git", "clone", "--depth", "1",
                "https://github.com/mtkresearch/BreezyVoice.git",
                repo_path,
            ],
            capture_output=True,
            text=True,
            timeout=300,
        )
    except (subprocess.TimeoutExpired, OSError):
        shutil.rmtree(repo_path, ignore_errors=True)
        raise
    if result.returncode != 0:
        shutil.rmtree(repo_path, ignore_errors=True)
        raise RuntimeError(f"下載失敗: {result.stderr}")


def setup_breezyvoice():
    """Set up the BreezyVoice environment and load the model.

    On first use this clones the BreezyVoice repository to ``/tmp/BreezyVoice``
    (if that path does not exist yet), puts it on ``sys.path``, imports
    ``CustomCosyVoice`` from its ``single_inference`` module and instantiates
    it. The instance is stored in the module-level ``cosyvoice`` and
    ``setup_completed`` is set to True, so later calls return immediately.

    Returns:
        A human-readable status string. Success messages start with "✅",
        failure messages start with "❌". Callers test for "❌" to detect
        failure; this function never raises.
    """
    global cosyvoice, setup_completed
    if setup_completed:
        return "✅ BreezyVoice 已準備就緒"
    try:
        print("🔧 正在設置 BreezyVoice...")

        # 1. Fetch the BreezyVoice sources.
        repo_path = "/tmp/BreezyVoice"
        if not os.path.exists(repo_path):
            print("📥 下載 BreezyVoice repository...")
            _clone_breezyvoice_repo(repo_path)

        # 2. Make the repository importable; skip if a previous (failed)
        #    attempt already registered it, to avoid duplicate entries.
        if repo_path not in sys.path:
            sys.path.insert(0, repo_path)

        # 3. Import the BreezyVoice entry class.
        try:
            from single_inference import CustomCosyVoice
        except ImportError as e:
            raise RuntimeError(f"模組導入失敗: {e}") from e
        print("✅ BreezyVoice 模組導入成功")

        # 4. Load the model. The flag is only set after the load succeeded.
        print("🔄 載入 BreezyVoice 完整版模型...")
        cosyvoice = CustomCosyVoice("MediaTek-Research/BreezyVoice")
        setup_completed = True
        print("✅ BreezyVoice 設置完成!")

        # Report GPU memory currently allocated by PyTorch, when CUDA is usable.
        if torch.cuda.is_available():
            vram_used = torch.cuda.memory_allocated() / 1024**3
            return f"✅ BreezyVoice 設置完成!VRAM 使用: {vram_used:.2f}GB"
        return "✅ BreezyVoice 設置完成!"
    except Exception as e:
        # Top-level boundary for the UI: log the full traceback, then report
        # the failure as a status string instead of raising.
        import traceback
        traceback.print_exc()
        print(f"❌ 設置失敗: {str(e)}")
        return f"❌ 設置失敗: {str(e)}"
# Sample rate used for the synthesized speech returned to the UI.
_OUTPUT_SAMPLE_RATE = 22050


def _reference_waveform(audio_data):
    """Return the reference samples as a 2-D ``(channels, samples)`` tensor.

    A 1-D input is treated as mono and gets a leading channel axis. A 2-D
    input is assumed to be laid out ``(samples, channels)`` (the layout Gradio
    documents for numpy audio) and is transposed. Floating-point data is cast
    to float32; integer data keeps its dtype.

    Raises:
        ValueError: the input has no samples or is neither 1-D nor 2-D.
    """
    waveform = torch.as_tensor(audio_data)
    if waveform.dim() == 1:
        waveform = waveform.unsqueeze(0)
    elif waveform.dim() == 2:
        waveform = waveform.t().contiguous()
    else:
        raise ValueError(f"不支援的參考語音維度: {tuple(waveform.shape)}")
    if waveform.shape[1] == 0:
        raise ValueError("參考語音為空")
    if waveform.is_floating_point():
        waveform = waveform.to(torch.float32)
    return waveform


def _format_clone_status(reference_seconds, content_text, speaker_transcription,
                         synthesis_time, audio_duration):
    """Build the multi-line success report shown in the status textbox."""
    rtf = synthesis_time / audio_duration if audio_duration > 0 else float('inf')
    realtime_label = '(實時)' if rtf < 1.0 else '(非實時)'
    vram_info = ""
    if torch.cuda.is_available():
        vram_used = torch.cuda.memory_allocated() / 1024**3
        vram_info = f"💾 VRAM: {vram_used:.2f}GB"
    return "\n".join([
        "✅ 語音克隆成功!",
        f"🎙️ 參考語音: {reference_seconds:.1f}秒",
        f"📝 合成內容: {content_text}",
        f"📝 使用轉錄: {speaker_transcription[:30]}...",
        f"⏱️ 合成時間: {synthesis_time:.1f}秒",
        f"🎵 輸出長度: {audio_duration:.1f}秒",
        f"📊 RTF: {rtf:.3f} {realtime_label}",
        vram_info,
        "🤖 模型: MediaTek BreezyVoice 完整版 (簡化版)",
    ])


def breezy_voice_clone(speaker_audio, content_text, speaker_transcription=None):
    """Synthesize ``content_text`` in the voice of the reference recording.

    Args:
        speaker_audio: ``(sample_rate, samples)`` tuple as produced by a
            ``gr.Audio(type="numpy")`` input, or None when nothing was given.
        content_text: Text to speak. None, empty or whitespace-only text is
            rejected with an error status.
        speaker_transcription: Transcript of the reference recording. Falls
            back to ``DEFAULT_REFERENCE_TEXT`` when missing or blank.

    Returns:
        ``(audio, status)``. On success ``audio`` is
        ``(sample_rate, samples)`` with a 1-D numpy array and ``status`` is a
        multi-line report. On any failure ``audio`` is None and ``status``
        starts with "❌". This function does not raise.
    """
    # NOTE(review): this file imports `spaces` but nothing visible here is
    # decorated with `@spaces.GPU` - confirm whether ZeroGPU is meant to be used.
    if speaker_audio is None:
        return None, "❌ 請先上傳或錄製參考語音"
    if not content_text or not content_text.strip():
        return None, "❌ 請輸入要合成的文字"

    # Lazily initialize the model on first use.
    if not setup_completed or cosyvoice is None:
        setup_status = setup_breezyvoice()
        if "❌" in setup_status:
            return None, setup_status

    try:
        sample_rate, audio_data = speaker_audio
        reference = _reference_waveform(audio_data)
        reference_seconds = reference.shape[1] / sample_rate

        # Use the supplied transcript, or the default one when it is blank.
        if not speaker_transcription or not speaker_transcription.strip():
            speaker_transcription = DEFAULT_REFERENCE_TEXT
        print(f"🎤 合成文字: {content_text}")
        print(f"📝 參考轉錄: {speaker_transcription}")

        with tempfile.TemporaryDirectory() as temp_dir:
            # The reference goes through a WAV file because `load_wav` takes
            # a path and a target sample rate.
            input_audio_path = os.path.join(temp_dir, "speaker_voice.wav")
            torchaudio.save(input_audio_path, reference, sample_rate)

            synthesis_start = time.time()
            try:
                from cosyvoice.utils.file_utils import load_wav

                prompt_speech_16k = load_wav(input_audio_path, 16000)
                print("🔄 執行語音合成推論...")
                # Zero-shot inference without text normalization.
                output = cosyvoice.inference_zero_shot_no_normalize(
                    content_text,
                    speaker_transcription,
                    prompt_speech_16k,
                )
                synthesis_time = time.time() - synthesis_start

                if output is None or 'tts_speech' not in output:
                    return None, "❌ 語音合成失敗:未生成輸出檔案"

                # Move to CPU before converting so a GPU-resident result also
                # works; take the first channel as the 1-D output signal.
                speech = output['tts_speech'].detach().cpu().to(torch.float32)
                samples = speech.numpy()[0]
                audio_duration = samples.shape[0] / _OUTPUT_SAMPLE_RATE

                status = _format_clone_status(
                    reference_seconds,
                    content_text,
                    speaker_transcription,
                    synthesis_time,
                    audio_duration,
                )
                return (_OUTPUT_SAMPLE_RATE, samples), status
            except Exception as e:
                import traceback
                traceback.print_exc()
                return None, f"❌ 語音合成失敗: {str(e)}"
    except Exception as e:
        import traceback
        traceback.print_exc()
        return None, f"❌ 處理錯誤: {str(e)}"
def load_example_text():
    """Return the default reference transcript (`DEFAULT_REFERENCE_TEXT`)."""
    example_text = DEFAULT_REFERENCE_TEXT
    return example_text
# Text the synthesis box is pre-filled with.
_DEFAULT_CONTENT_TEXT = "歡迎來到我們的語音合成系統!這個技術可以模仿任何人的聲音,讓文字轉換成自然流暢的語音。"

# Markdown shown inside the collapsible usage guide.
_USAGE_GUIDE_MARKDOWN = f"""
## 🎯 最佳使用方式
1. **📖 朗讀範例**: 請照著範例文字清晰朗讀
2. **🎙️ 錄音要求**: 5-20 秒,環境安靜,發音清楚
3. **✨ 克隆效果**: 系統會用您的聲音說出任何文字
## 📝 範例文字內容
```
{DEFAULT_REFERENCE_TEXT}
```
## ⚡ 技術特色
- 🇹🇼 台灣繁體中文專門優化
- 🎯 零樣本克隆(無需訓練)
- ⚡ ZeroGPU 加速處理
- 🔊 MediaTek 先進語音合成技術
## 💡 版本說明
- **v3.0 簡化版**: 避免多進程問題,使用基本推論方法
- 參考語音與轉錄文字匹配度越高,克隆效果越好
- 建議使用提供的預設範例文字進行錄音
"""

# Build the Gradio interface. Components are created in display order.
with gr.Blocks(title="BreezyVoice 語音克隆", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎭 MediaTek BreezyVoice 語音克隆")
    gr.Markdown("**零樣本語音克隆系統** - 專為台灣繁體中文優化 (簡化版)")

    # Read-only box reporting the model setup state.
    setup_status = gr.Textbox(
        label="🔧 系統狀態",
        value="⏳ 準備初始化 BreezyVoice...",
        interactive=False,
    )
    # Button that triggers model setup explicitly.
    init_btn = gr.Button("🚀 初始化 BreezyVoice", variant="primary")

    with gr.Row():
        # Left column: inputs.
        with gr.Column(scale=1):
            gr.Markdown("### 🎙️ 步驟 1: 上傳參考語音")
            gr.Markdown("請照著下面的範例文字朗讀,上傳 5-20 秒清晰語音")

            # Suggested script for the reference recording.
            gr.Markdown("#### 📖 建議朗讀範例:")
            example_display = gr.Textbox(
                label="請照著這段文字朗讀 (約20秒)",
                value=DEFAULT_REFERENCE_TEXT,
                lines=4,
                interactive=False,
            )
            speaker_audio = gr.Audio(
                label="參考語音錄音 (照著上面文字念)",
                sources=["microphone", "upload"],
                type="numpy",
            )

            gr.Markdown("### 📝 步驟 2: 輸入合成文字")
            content_text = gr.Textbox(
                label="合成文字內容",
                placeholder="請輸入要用克隆聲音說出的內容...",
                value=_DEFAULT_CONTENT_TEXT,
                lines=3,
            )

            gr.Markdown("### 🔤 步驟 3: 參考語音轉錄")
            speaker_transcription = gr.Textbox(
                label="參考語音轉錄 (預設範例)",
                value=DEFAULT_REFERENCE_TEXT,
                lines=3,
            )
            # Restores the default transcript into the box above.
            load_example_btn = gr.Button("📄 載入預設範例", variant="secondary")
            clone_btn = gr.Button("🎭 開始語音克隆", variant="primary", size="lg")

        # Right column: results.
        with gr.Column(scale=1):
            gr.Markdown("### 🎵 克隆結果")
            result_audio = gr.Audio(
                label="克隆的語音",
                type="numpy",
            )
            result_status = gr.Textbox(
                label="📋 處理狀態",
                lines=12,
                max_lines=15,
                interactive=False,
            )

    # Collapsible usage guide.
    with gr.Accordion("📖 使用說明", open=False):
        gr.Markdown(_USAGE_GUIDE_MARKDOWN)

    # Event wiring.
    init_btn.click(
        fn=setup_breezyvoice,
        outputs=[setup_status],
    )
    load_example_btn.click(
        fn=load_example_text,
        outputs=[speaker_transcription],
    )
    clone_btn.click(
        fn=breezy_voice_clone,
        inputs=[speaker_audio, content_text, speaker_transcription],
        outputs=[result_audio, result_status],
    )

if __name__ == "__main__":
    demo.launch()