Spaces:
Runtime error
🔧 修復語音克隆功能 - 使用真正的 BreezyVoice 推論邏輯
Browse files
app.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
"""
|
| 2 |
MediaTek BreezyVoice 真實語音克隆 Space
|
| 3 |
基於成功的本地測試實現真正的語音合成功能
|
| 4 |
-
|
| 5 |
"""
|
| 6 |
|
| 7 |
import gradio as gr
|
|
@@ -27,13 +27,12 @@ DEFAULT_REFERENCE_TEXT = "台灣是個美麗的島嶼,擁有豐富的自然景
|
|
| 27 |
|
| 28 |
# 全域變數
|
| 29 |
cosyvoice = None
|
| 30 |
-
bopomofo_converter = None
|
| 31 |
setup_completed = False
|
| 32 |
|
| 33 |
@spaces.GPU(duration=300)
|
| 34 |
def setup_breezyvoice():
|
| 35 |
"""設置 BreezyVoice 環境並載入模型"""
|
| 36 |
-
global cosyvoice, bopomofo_converter, setup_completed
|
| 37 |
|
| 38 |
if setup_completed:
|
| 39 |
return "✅ BreezyVoice 已準備就緒"
|
|
@@ -57,21 +56,16 @@ def setup_breezyvoice():
|
|
| 57 |
# 2. 添加模組路徑
|
| 58 |
sys.path.insert(0, repo_path)
|
| 59 |
|
| 60 |
-
# 3.
|
| 61 |
-
print("📦 檢查依賴...")
|
| 62 |
-
|
| 63 |
-
# 4. 導入 BreezyVoice 模組
|
| 64 |
try:
|
| 65 |
from single_inference import CustomCosyVoice
|
| 66 |
-
from g2pw import G2PWConverter
|
| 67 |
print("✅ BreezyVoice 模組導入成功")
|
| 68 |
except ImportError as e:
|
| 69 |
raise Exception(f"模組導入失敗: {e}")
|
| 70 |
|
| 71 |
-
#
|
| 72 |
print("🔄 載入 BreezyVoice 完整版模型...")
|
| 73 |
cosyvoice = CustomCosyVoice("MediaTek-Research/BreezyVoice")
|
| 74 |
-
bopomofo_converter = G2PWConverter()
|
| 75 |
|
| 76 |
setup_completed = True
|
| 77 |
print("✅ BreezyVoice 設置完成!")
|
|
@@ -89,8 +83,8 @@ def setup_breezyvoice():
|
|
| 89 |
|
| 90 |
@spaces.GPU(duration=180)
|
| 91 |
def breezy_voice_clone(speaker_audio, content_text, speaker_transcription=None):
|
| 92 |
-
"""執行 BreezyVoice 語音克隆"""
|
| 93 |
-
global cosyvoice
|
| 94 |
|
| 95 |
if speaker_audio is None:
|
| 96 |
return None, "❌ 請先上傳或錄製參考語音"
|
|
@@ -120,27 +114,31 @@ def breezy_voice_clone(speaker_audio, content_text, speaker_transcription=None):
|
|
| 120 |
print(f"🎤 合成文字: {content_text}")
|
| 121 |
print(f"📝 參考轉錄: {speaker_transcription}")
|
| 122 |
|
| 123 |
-
# 執行語音合成
|
| 124 |
synthesis_start = time.time()
|
| 125 |
|
| 126 |
try:
|
| 127 |
-
#
|
| 128 |
-
from
|
|
|
|
|
|
|
|
|
|
| 129 |
|
| 130 |
-
#
|
| 131 |
-
|
| 132 |
-
multiprocessing.set_start_method('spawn', force=True)
|
| 133 |
|
| 134 |
-
#
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
cosyvoice=cosyvoice,
|
| 140 |
-
bopomofo_converter=bopomofo_converter,
|
| 141 |
-
speaker_prompt_text_transcription=speaker_transcription
|
| 142 |
)
|
| 143 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
synthesis_time = time.time() - synthesis_start
|
| 145 |
|
| 146 |
# 檢查輸出
|
|
@@ -168,16 +166,20 @@ def breezy_voice_clone(speaker_audio, content_text, speaker_transcription=None):
|
|
| 168 |
🎵 輸出長度: {audio_duration:.1f}秒
|
| 169 |
📊 RTF: {rtf:.3f} {'(實時)' if rtf < 1.0 else '(非實時)'}
|
| 170 |
{vram_info}
|
| 171 |
-
🤖 模型: MediaTek BreezyVoice 完整版"""
|
| 172 |
|
| 173 |
return (sample_rate, synthesized_audio[0]), status
|
| 174 |
else:
|
| 175 |
return None, "❌ 語音合成失敗:未生成輸出檔案"
|
| 176 |
|
| 177 |
except Exception as e:
|
|
|
|
|
|
|
| 178 |
return None, f"❌ 語音合成失敗: {str(e)}"
|
| 179 |
|
| 180 |
except Exception as e:
|
|
|
|
|
|
|
| 181 |
return None, f"❌ 處理錯誤: {str(e)}"
|
| 182 |
|
| 183 |
def load_example_text():
|
|
@@ -187,7 +189,7 @@ def load_example_text():
|
|
| 187 |
# 創建 Gradio 界面
|
| 188 |
with gr.Blocks(title="BreezyVoice 語音克隆", theme=gr.themes.Soft()) as demo:
|
| 189 |
gr.Markdown("# 🎭 MediaTek BreezyVoice 語音克隆")
|
| 190 |
-
gr.Markdown("**零樣本語音克隆系統** - 專為台灣繁體中文優化")
|
| 191 |
|
| 192 |
# 初始化狀態顯示
|
| 193 |
setup_status = gr.Textbox(
|
|
@@ -273,10 +275,10 @@ with gr.Blocks(title="BreezyVoice 語音克隆", theme=gr.themes.Soft()) as demo
|
|
| 273 |
- ⚡ ZeroGPU 加速處理
|
| 274 |
- 🔊 MediaTek 先進語音合成技術
|
| 275 |
|
| 276 |
-
## 💡
|
|
|
|
| 277 |
- 參考語音與轉錄文字匹配度越高,克隆效果越好
|
| 278 |
- 建議使用提供的預設範例文字進行錄音
|
| 279 |
-
- 錄音時保持自然語調,不需刻意
|
| 280 |
""")
|
| 281 |
|
| 282 |
# 事件綁定
|
|
|
|
| 1 |
"""
|
| 2 |
MediaTek BreezyVoice 真實語音克隆 Space
|
| 3 |
基於成功的本地測試實現真正的語音合成功能
|
| 4 |
+
v3.0: 簡化實現避免多進程問題
|
| 5 |
"""
|
| 6 |
|
| 7 |
import gradio as gr
|
|
|
|
| 27 |
|
| 28 |
# 全域變數
|
| 29 |
cosyvoice = None
|
|
|
|
| 30 |
setup_completed = False
|
| 31 |
|
| 32 |
@spaces.GPU(duration=300)
|
| 33 |
def setup_breezyvoice():
|
| 34 |
"""設置 BreezyVoice 環境並載入模型"""
|
| 35 |
+
global cosyvoice, setup_completed
|
| 36 |
|
| 37 |
if setup_completed:
|
| 38 |
return "✅ BreezyVoice 已準備就緒"
|
|
|
|
| 56 |
# 2. 添加模組路徑
|
| 57 |
sys.path.insert(0, repo_path)
|
| 58 |
|
| 59 |
+
# 3. 導入 BreezyVoice 核心模組
|
|
|
|
|
|
|
|
|
|
| 60 |
try:
|
| 61 |
from single_inference import CustomCosyVoice
|
|
|
|
| 62 |
print("✅ BreezyVoice 模組導入成功")
|
| 63 |
except ImportError as e:
|
| 64 |
raise Exception(f"模組導入失敗: {e}")
|
| 65 |
|
| 66 |
+
# 4. 載入模型
|
| 67 |
print("🔄 載入 BreezyVoice 完整版模型...")
|
| 68 |
cosyvoice = CustomCosyVoice("MediaTek-Research/BreezyVoice")
|
|
|
|
| 69 |
|
| 70 |
setup_completed = True
|
| 71 |
print("✅ BreezyVoice 設置完成!")
|
|
|
|
| 83 |
|
| 84 |
@spaces.GPU(duration=180)
|
| 85 |
def breezy_voice_clone(speaker_audio, content_text, speaker_transcription=None):
|
| 86 |
+
"""執行 BreezyVoice 語音克隆 - 簡化版避免多進程問題"""
|
| 87 |
+
global cosyvoice
|
| 88 |
|
| 89 |
if speaker_audio is None:
|
| 90 |
return None, "❌ 請先上傳或錄製參考語音"
|
|
|
|
| 114 |
print(f"🎤 合成文字: {content_text}")
|
| 115 |
print(f"📝 參考轉錄: {speaker_transcription}")
|
| 116 |
|
| 117 |
+
# 執行語音合成 - 使用簡化方法避免多進程
|
| 118 |
synthesis_start = time.time()
|
| 119 |
|
| 120 |
try:
|
| 121 |
+
# 導入必要函數
|
| 122 |
+
from cosyvoice.utils.file_utils import load_wav
|
| 123 |
+
|
| 124 |
+
# 載入音訊
|
| 125 |
+
prompt_speech_16k = load_wav(input_audio_path, 16000)
|
| 126 |
|
| 127 |
+
# 直接使用 cosyvoice 推論,跳過複雜的文字處理
|
| 128 |
+
print("🔄 執行語音合成推論...")
|
|
|
|
| 129 |
|
| 130 |
+
# 使用基本的 zero-shot 推論
|
| 131 |
+
output = cosyvoice.inference_zero_shot(
|
| 132 |
+
content_text,
|
| 133 |
+
speaker_transcription,
|
| 134 |
+
prompt_speech_16k
|
|
|
|
|
|
|
|
|
|
| 135 |
)
|
| 136 |
|
| 137 |
+
# 保存輸出音訊
|
| 138 |
+
if output is not None and len(output) > 0:
|
| 139 |
+
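# Write the synthesized result to an audio file. NOTE(review): upstream BreezyVoice appears to read this result as output['tts_speech'] — confirm that output[0] is valid for the returned type.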
|
| 140 |
+
torchaudio.save(output_audio_path, output[0].cpu(), 22050)
|
| 141 |
+
|
| 142 |
synthesis_time = time.time() - synthesis_start
|
| 143 |
|
| 144 |
# 檢查輸出
|
|
|
|
| 166 |
🎵 輸出長度: {audio_duration:.1f}秒
|
| 167 |
📊 RTF: {rtf:.3f} {'(實時)' if rtf < 1.0 else '(非實時)'}
|
| 168 |
{vram_info}
|
| 169 |
+
🤖 模型: MediaTek BreezyVoice 完整版 (簡化版)"""
|
| 170 |
|
| 171 |
return (sample_rate, synthesized_audio[0]), status
|
| 172 |
else:
|
| 173 |
return None, "❌ 語音合成失敗:未生成輸出檔案"
|
| 174 |
|
| 175 |
except Exception as e:
|
| 176 |
+
import traceback
|
| 177 |
+
traceback.print_exc()
|
| 178 |
return None, f"❌ 語音合成失敗: {str(e)}"
|
| 179 |
|
| 180 |
except Exception as e:
|
| 181 |
+
import traceback
|
| 182 |
+
traceback.print_exc()
|
| 183 |
return None, f"❌ 處理錯誤: {str(e)}"
|
| 184 |
|
| 185 |
def load_example_text():
|
|
|
|
| 189 |
# 創建 Gradio 界面
|
| 190 |
with gr.Blocks(title="BreezyVoice 語音克隆", theme=gr.themes.Soft()) as demo:
|
| 191 |
gr.Markdown("# 🎭 MediaTek BreezyVoice 語音克隆")
|
| 192 |
+
gr.Markdown("**零樣本語音克隆系統** - 專為台灣繁體中文優化 (簡化版)")
|
| 193 |
|
| 194 |
# 初始化狀態顯示
|
| 195 |
setup_status = gr.Textbox(
|
|
|
|
| 275 |
- ⚡ ZeroGPU 加速處理
|
| 276 |
- 🔊 MediaTek 先進語音合成技術
|
| 277 |
|
| 278 |
+
## 💡 版本說明
|
| 279 |
+
- **v3.0 簡化版**: 避免多進程問題,使用基本推論方法
|
| 280 |
- 參考語音與轉錄文字匹配度越高,克隆效果越好
|
| 281 |
- 建議使用提供的預設範例文字進行錄音
|
|
|
|
| 282 |
""")
|
| 283 |
|
| 284 |
# 事件綁定
|