Spaces:

sheep52031
/

breezyvoice-tts

Runtime error

App Files Community

sheep52031 committed on Sep 4

Commit

f7e2011

verified ·

1 Parent(s): 94e4002

🔧 修復語音克隆功能 - 使用真正的 BreezyVoice 推論邏輯

Browse files

Files changed (1) hide show

app.py +10 -9

app.py CHANGED Viewed

@@ -127,28 +127,29 @@ def breezy_voice_clone(speaker_audio, content_text, speaker_transcription=None):
                 # 直接使用 cosyvoice 推論，跳過複雜的文字處理
                 print("🔄 執行語音合成推論...")
-                # 使用基本的 zero-shot 推論
-                output = cosyvoice.inference_zero_shot(
                     content_text,
                     speaker_transcription,
                     prompt_speech_16k
                 )
                 # 保存輸出音訊
-                if output is not None and len(output) > 0:
-                    # output 是 tensor，需要轉換為音訊檔案
-                    torchaudio.save(output_audio_path, output[0].cpu(), 22050)
                 synthesis_time = time.time() - synthesis_start
                 # 檢查輸出
                 if os.path.exists(output_audio_path):
                     # 讀取合成的音訊
-                    synthesized_audio, sample_rate = torchaudio.load(output_audio_path)
                     synthesized_audio = synthesized_audio.numpy()
-                    # 計算音訊長度
-                    audio_duration = synthesized_audio.shape[1] / sample_rate
                     rtf = synthesis_time / audio_duration if audio_duration > 0 else float('inf')
                     # 檢查 VRAM 使用
@@ -168,7 +169,7 @@ def breezy_voice_clone(speaker_audio, content_text, speaker_transcription=None):
 {vram_info}
 🤖 模型: MediaTek BreezyVoice 完整版 (簡化版)"""
-                    return (sample_rate, synthesized_audio[0]), status
                 else:
                     return None, "❌ 語音合成失敗：未生成輸出檔案"

                 # 直接使用 cosyvoice 推論，跳過複雜的文字處理
                 print("🔄 執行語音合成推論...")
+                # 使用基本的 zero-shot 推論 (no_normalize 版本)
+                output = cosyvoice.inference_zero_shot_no_normalize(
                     content_text,
                     speaker_transcription,
                     prompt_speech_16k
                 )
                 # 保存輸出音訊
+                if output is not None and 'tts_speech' in output:
+                    # output 是字典 {'tts_speech': tensor}
+                    tts_speech = output['tts_speech']
+                    torchaudio.save(output_audio_path, tts_speech, 22050)
                 synthesis_time = time.time() - synthesis_start
                 # 檢查輸出
                 if os.path.exists(output_audio_path):
                     # 讀取合成的音訊
+                    synthesized_audio, file_sample_rate = torchaudio.load(output_audio_path)
                     synthesized_audio = synthesized_audio.numpy()
+                    # 計算音訊長度 (使用檔案的實際採樣率)
+                    audio_duration = synthesized_audio.shape[1] / file_sample_rate
                     rtf = synthesis_time / audio_duration if audio_duration > 0 else float('inf')
                     # 檢查 VRAM 使用
 {vram_info}
 🤖 模型: MediaTek BreezyVoice 完整版 (簡化版)"""
+                    return (file_sample_rate, synthesized_audio[0]), status
                 else:
                     return None, "❌ 語音合成失敗：未生成輸出檔案"