sheep52031 committed on
Commit
f7e2011
·
verified ·
1 Parent(s): 94e4002

🔧 修復語音克隆功能 - 使用真正的 BreezyVoice 推論邏輯

Browse files
Files changed (1) hide show
  1. app.py +10 -9
app.py CHANGED
@@ -127,28 +127,29 @@ def breezy_voice_clone(speaker_audio, content_text, speaker_transcription=None):
127
  # 直接使用 cosyvoice 推論,跳過複雜的文字處理
128
  print("🔄 執行語音合成推論...")
129
 
130
- # 使用基本的 zero-shot 推論
131
- output = cosyvoice.inference_zero_shot(
132
  content_text,
133
  speaker_transcription,
134
  prompt_speech_16k
135
  )
136
 
137
  # 保存輸出音訊
138
- if output is not None and len(output) > 0:
139
- # output tensor,需要轉換為音訊檔案
140
- torchaudio.save(output_audio_path, output[0].cpu(), 22050)
 
141
 
142
  synthesis_time = time.time() - synthesis_start
143
 
144
  # 檢查輸出
145
  if os.path.exists(output_audio_path):
146
  # 讀取合成的音訊
147
- synthesized_audio, sample_rate = torchaudio.load(output_audio_path)
148
  synthesized_audio = synthesized_audio.numpy()
149
 
150
- # 計算音訊長度
151
- audio_duration = synthesized_audio.shape[1] / sample_rate
152
  rtf = synthesis_time / audio_duration if audio_duration > 0 else float('inf')
153
 
154
  # 檢查 VRAM 使用
@@ -168,7 +169,7 @@ def breezy_voice_clone(speaker_audio, content_text, speaker_transcription=None):
168
  {vram_info}
169
  🤖 模型: MediaTek BreezyVoice 完整版 (簡化版)"""
170
 
171
- return (sample_rate, synthesized_audio[0]), status
172
  else:
173
  return None, "❌ 語音合成失敗:未生成輸出檔案"
174
 
 
127
  # 直接使用 cosyvoice 推論,跳過複雜的文字處理
128
  print("🔄 執行語音合成推論...")
129
 
130
+ # 使用基本的 zero-shot 推論 (no_normalize 版本)
131
+ output = cosyvoice.inference_zero_shot_no_normalize(
132
  content_text,
133
  speaker_transcription,
134
  prompt_speech_16k
135
  )
136
 
137
  # 保存輸出音訊
138
+ if output is not None and 'tts_speech' in output:
139
+ # output 是字典 {'tts_speech': tensor}
140
+ tts_speech = output['tts_speech']
141
+ torchaudio.save(output_audio_path, tts_speech, 22050)
142
 
143
  synthesis_time = time.time() - synthesis_start
144
 
145
  # 檢查輸出
146
  if os.path.exists(output_audio_path):
147
  # 讀取合成的音訊
148
+ synthesized_audio, file_sample_rate = torchaudio.load(output_audio_path)
149
  synthesized_audio = synthesized_audio.numpy()
150
 
151
+ # 計算音訊長度 (使用檔案的實際採樣率)
152
+ audio_duration = synthesized_audio.shape[1] / file_sample_rate
153
  rtf = synthesis_time / audio_duration if audio_duration > 0 else float('inf')
154
 
155
  # 檢查 VRAM 使用
 
169
  {vram_info}
170
  🤖 模型: MediaTek BreezyVoice 完整版 (簡化版)"""
171
 
172
+ return (file_sample_rate, synthesized_audio[0]), status
173
  else:
174
  return None, "❌ 語音合成失敗:未生成輸出檔案"
175