Spaces:
Runtime error
Runtime error
🔧 修復語音克隆功能 - 使用真正的 BreezyVoice 推論邏輯
Browse files
app.py
CHANGED
|
@@ -127,28 +127,29 @@ def breezy_voice_clone(speaker_audio, content_text, speaker_transcription=None):
|
|
| 127 |
# 直接使用 cosyvoice 推論,跳過複雜的文字處理
|
| 128 |
print("🔄 執行語音合成推論...")
|
| 129 |
|
| 130 |
-
# 使用基本的 zero-shot 推論
|
| 131 |
-
output = cosyvoice.
|
| 132 |
content_text,
|
| 133 |
speaker_transcription,
|
| 134 |
prompt_speech_16k
|
| 135 |
)
|
| 136 |
|
| 137 |
# 保存輸出音訊
|
| 138 |
-
if output is not None and
|
| 139 |
-
# output
|
| 140 |
-
|
|
|
|
| 141 |
|
| 142 |
synthesis_time = time.time() - synthesis_start
|
| 143 |
|
| 144 |
# 檢查輸出
|
| 145 |
if os.path.exists(output_audio_path):
|
| 146 |
# 讀取合成的音訊
|
| 147 |
-
synthesized_audio,
|
| 148 |
synthesized_audio = synthesized_audio.numpy()
|
| 149 |
|
| 150 |
-
# 計算音訊長度
|
| 151 |
-
audio_duration = synthesized_audio.shape[1] /
|
| 152 |
rtf = synthesis_time / audio_duration if audio_duration > 0 else float('inf')
|
| 153 |
|
| 154 |
# 檢查 VRAM 使用
|
|
@@ -168,7 +169,7 @@ def breezy_voice_clone(speaker_audio, content_text, speaker_transcription=None):
|
|
| 168 |
{vram_info}
|
| 169 |
🤖 模型: MediaTek BreezyVoice 完整版 (簡化版)"""
|
| 170 |
|
| 171 |
-
return (
|
| 172 |
else:
|
| 173 |
return None, "❌ 語音合成失敗:未生成輸出檔案"
|
| 174 |
|
|
|
|
| 127 |
# 直接使用 cosyvoice 推論,跳過複雜的文字處理
|
| 128 |
print("🔄 執行語音合成推論...")
|
| 129 |
|
| 130 |
+
# 使用基本的 zero-shot 推論 (no_normalize 版本)
|
| 131 |
+
output = cosyvoice.inference_zero_shot_no_normalize(
|
| 132 |
content_text,
|
| 133 |
speaker_transcription,
|
| 134 |
prompt_speech_16k
|
| 135 |
)
|
| 136 |
|
| 137 |
# 保存輸出音訊
|
| 138 |
+
if output is not None and 'tts_speech' in output:
|
| 139 |
+
# output 是字典 {'tts_speech': tensor}
|
| 140 |
+
tts_speech = output['tts_speech']
|
| 141 |
+
torchaudio.save(output_audio_path, tts_speech, 22050)
|
| 142 |
|
| 143 |
synthesis_time = time.time() - synthesis_start
|
| 144 |
|
| 145 |
# 檢查輸出
|
| 146 |
if os.path.exists(output_audio_path):
|
| 147 |
# 讀取合成的音訊
|
| 148 |
+
synthesized_audio, file_sample_rate = torchaudio.load(output_audio_path)
|
| 149 |
synthesized_audio = synthesized_audio.numpy()
|
| 150 |
|
| 151 |
+
# 計算音訊長度 (使用檔案的實際採樣率)
|
| 152 |
+
audio_duration = synthesized_audio.shape[1] / file_sample_rate
|
| 153 |
rtf = synthesis_time / audio_duration if audio_duration > 0 else float('inf')
|
| 154 |
|
| 155 |
# 檢查 VRAM 使用
|
|
|
|
| 169 |
{vram_info}
|
| 170 |
🤖 模型: MediaTek BreezyVoice 完整版 (簡化版)"""
|
| 171 |
|
| 172 |
+
return (file_sample_rate, synthesized_audio[0]), status
|
| 173 |
else:
|
| 174 |
return None, "❌ 語音合成失敗:未生成輸出檔案"
|
| 175 |
|