Spaces:
Sleeping
Sleeping
Fix audio backend error with robust fallback mechanisms
Browse files- Replace torch-based audio generation with numpy for better compatibility
- Add multiple audio saving fallbacks: torchaudio → wave module → silence
- Use standard 44.1kHz sample rate for better compatibility
- Improve error handling with detailed logging
- Add final silence fallback if all audio generation fails
- Tested and verified audio generation works correctly
Resolves: "Couldn't find appropriate backend to handle uri" error
🤖 Generated with [Claude Code](https://claude.ai/code)
Co-Authored-By: Claude <[email protected]>
app.py
CHANGED
|
@@ -10,6 +10,8 @@ import json
|
|
| 10 |
import time
|
| 11 |
import base64
|
| 12 |
from io import BytesIO
|
|
|
|
|
|
|
| 13 |
|
| 14 |
def call_huggingface_inference_api(video_file_path: str, text_prompt: str = "") -> Tuple[Optional[str], str]:
|
| 15 |
"""直接调用 Hugging Face 推理 API"""
|
|
@@ -128,39 +130,85 @@ def call_gradio_client_api(video_file_path: str, text_prompt: str = "") -> Tuple
|
|
| 128 |
|
| 129 |
def create_fallback_audio(video_file_path: str, text_prompt: str) -> str:
|
| 130 |
"""创建备用演示音频(当 API 不可用时)"""
|
| 131 |
-
sample_rate =
|
| 132 |
duration = 5.0
|
| 133 |
duration_samples = int(duration * sample_rate)
|
| 134 |
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 164 |
|
| 165 |
def process_video_with_apis(video_file, text_prompt: str, guidance_scale: float, inference_steps: int, sample_nums: int) -> Tuple[List[str], str]:
|
| 166 |
"""使用多种 API 方法处理视频"""
|
|
|
|
| 10 |
import time
|
| 11 |
import base64
|
| 12 |
from io import BytesIO
|
| 13 |
+
import numpy as np
|
| 14 |
+
import wave
|
| 15 |
|
| 16 |
def call_huggingface_inference_api(video_file_path: str, text_prompt: str = "") -> Tuple[Optional[str], str]:
|
| 17 |
"""直接调用 Hugging Face 推理 API"""
|
|
|
|
| 130 |
|
| 131 |
def create_fallback_audio(video_file_path: str, text_prompt: str) -> str:
|
| 132 |
"""创建备用演示音频(当 API 不可用时)"""
|
| 133 |
+
sample_rate = 44100 # 使用更标准的采样率
|
| 134 |
duration = 5.0
|
| 135 |
duration_samples = int(duration * sample_rate)
|
| 136 |
|
| 137 |
+
try:
|
| 138 |
+
# 使用 numpy 生成音频(避免 torch 依赖问题)
|
| 139 |
+
t = np.linspace(0, duration, duration_samples, dtype=np.float32)
|
| 140 |
+
|
| 141 |
+
# 根据文本内容生成不同类型的音频
|
| 142 |
+
if "footsteps" in text_prompt.lower() or "步" in text_prompt:
|
| 143 |
+
# 脚步声:低频节拍
|
| 144 |
+
audio = 0.4 * np.sin(2 * np.pi * 2 * t) * np.exp(-3 * (t % 0.5))
|
| 145 |
+
elif "rain" in text_prompt.lower() or "雨" in text_prompt:
|
| 146 |
+
# 雨声:白噪声
|
| 147 |
+
audio = 0.3 * np.random.randn(duration_samples)
|
| 148 |
+
elif "wind" in text_prompt.lower() or "风" in text_prompt:
|
| 149 |
+
# 风声:低频噪声
|
| 150 |
+
audio = 0.3 * np.sin(2 * np.pi * 0.5 * t) + 0.2 * np.random.randn(duration_samples)
|
| 151 |
+
elif "car" in text_prompt.lower() or "车" in text_prompt:
|
| 152 |
+
# 车辆声:混合频率
|
| 153 |
+
audio = 0.3 * np.sin(2 * np.pi * 80 * t) + 0.2 * np.sin(2 * np.pi * 120 * t)
|
| 154 |
+
else:
|
| 155 |
+
# 默认:和谐音调
|
| 156 |
+
base_freq = 220 + len(text_prompt) * 5
|
| 157 |
+
audio = 0.3 * np.sin(2 * np.pi * base_freq * t)
|
| 158 |
+
# 添加泛音
|
| 159 |
+
audio += 0.1 * np.sin(2 * np.pi * base_freq * 2 * t)
|
| 160 |
+
audio += 0.05 * np.sin(2 * np.pi * base_freq * 3 * t)
|
| 161 |
+
|
| 162 |
+
# 应用包络以避免突然开始/结束
|
| 163 |
+
envelope = np.ones_like(audio)
|
| 164 |
+
fade_samples = int(0.1 * sample_rate) # 0.1秒淡入淡出
|
| 165 |
+
envelope[:fade_samples] = np.linspace(0, 1, fade_samples)
|
| 166 |
+
envelope[-fade_samples:] = np.linspace(1, 0, fade_samples)
|
| 167 |
+
audio *= envelope
|
| 168 |
+
|
| 169 |
+
# 保存音频文件
|
| 170 |
+
temp_dir = tempfile.mkdtemp()
|
| 171 |
+
audio_path = os.path.join(temp_dir, "fallback_audio.wav")
|
| 172 |
+
|
| 173 |
+
# 尝试 torchaudio 保存
|
| 174 |
+
try:
|
| 175 |
+
audio_tensor = torch.from_numpy(audio).unsqueeze(0)
|
| 176 |
+
torchaudio.save(audio_path, audio_tensor, sample_rate)
|
| 177 |
+
logger.info("✅ 使用 torchaudio 保存音频成功")
|
| 178 |
+
except Exception as e:
|
| 179 |
+
logger.warning(f"torchaudio 保存失败: {e}")
|
| 180 |
+
# 备用方法:使用 Python 内置的 wave 模块
|
| 181 |
+
logger.info("使用 wave 模块保存音频...")
|
| 182 |
+
|
| 183 |
+
# 规范化音频到 int16 范围
|
| 184 |
+
audio_normalized = np.clip(audio, -1.0, 1.0)
|
| 185 |
+
audio_int16 = (audio_normalized * 32767).astype(np.int16)
|
| 186 |
+
|
| 187 |
+
with wave.open(audio_path, 'w') as wav_file:
|
| 188 |
+
wav_file.setnchannels(1) # 单声道
|
| 189 |
+
wav_file.setsampwidth(2) # 16-bit
|
| 190 |
+
wav_file.setframerate(sample_rate)
|
| 191 |
+
wav_file.writeframes(audio_int16.tobytes())
|
| 192 |
+
|
| 193 |
+
logger.info("✅ 使用 wave 模块保存音频成功")
|
| 194 |
+
|
| 195 |
+
return audio_path
|
| 196 |
+
|
| 197 |
+
except Exception as e:
|
| 198 |
+
logger.error(f"音频生成失败: {str(e)}")
|
| 199 |
+
# 最终备用方案:创建一个简单的静音文件
|
| 200 |
+
temp_dir = tempfile.mkdtemp()
|
| 201 |
+
audio_path = os.path.join(temp_dir, "silence.wav")
|
| 202 |
+
|
| 203 |
+
silence = np.zeros(duration_samples, dtype=np.int16)
|
| 204 |
+
with wave.open(audio_path, 'w') as wav_file:
|
| 205 |
+
wav_file.setnchannels(1)
|
| 206 |
+
wav_file.setsampwidth(2)
|
| 207 |
+
wav_file.setframerate(sample_rate)
|
| 208 |
+
wav_file.writeframes(silence.tobytes())
|
| 209 |
+
|
| 210 |
+
logger.info("生成静音音频作为最终备用方案")
|
| 211 |
+
return audio_path
|
| 212 |
|
| 213 |
def process_video_with_apis(video_file, text_prompt: str, guidance_scale: float, inference_steps: int, sample_nums: int) -> Tuple[List[str], str]:
|
| 214 |
"""使用多种 API 方法处理视频"""
|