Spaces:

wzy013
/

hunyuanvideo-foley

Sleeping

wzy013 Claude commited on Sep 2

Commit

d72626f

1 Parent(s): d353b6f

Fix audio backend error with robust fallback mechanisms

- Replace torch-based audio generation with numpy for better compatibility
- Add multiple audio saving fallbacks: torchaudio → wave module → silence
- Use standard 44.1kHz sample rate for better compatibility
- Improve error handling with detailed logging
- Add final silence fallback if all audio generation fails
- Tested and verified audio generation works correctly

Resolves: "Couldn't find appropriate backend to handle uri" error

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <[email protected]>

Files changed (1) hide show

app.py +78 -30

app.py CHANGED Viewed

@@ -10,6 +10,8 @@ import json
 import time
 import base64
 from io import BytesIO
 def call_huggingface_inference_api(video_file_path: str, text_prompt: str = "") -> Tuple[Optional[str], str]:
     """直接调用 Hugging Face 推理 API"""
@@ -128,39 +130,85 @@ def call_gradio_client_api(video_file_path: str, text_prompt: str = "") -> Tuple
 def create_fallback_audio(video_file_path: str, text_prompt: str) -> str:
     """创建备用演示音频（当 API 不可用时）"""
-    sample_rate = 48000
     duration = 5.0
     duration_samples = int(duration * sample_rate)
-    t = torch.linspace(0, duration, duration_samples)
-    # 根据文本内容生成不同类型的音频
-    if "footsteps" in text_prompt.lower() or "步" in text_prompt:
-        audio = 0.4 * torch.sin(2 * 3.14159 * 2 * t) * torch.exp(-3 * (t % 0.5))
-    elif "rain" in text_prompt.lower() or "雨" in text_prompt:
-        audio = 0.3 * torch.randn(duration_samples)
-    elif "wind" in text_prompt.lower() or "风" in text_prompt:
-        audio = 0.3 * torch.sin(2 * 3.14159 * 0.5 * t) + 0.2 * torch.randn(duration_samples)
-    elif "car" in text_prompt.lower() or "车" in text_prompt:
-        audio = 0.3 * torch.sin(2 * 3.14159 * 80 * t) + 0.2 * torch.sin(2 * 3.14159 * 120 * t)
-    else:
-        base_freq = 220 + len(text_prompt) * 5
-        audio = 0.3 * torch.sin(2 * 3.14159 * base_freq * t)
-        audio += 0.1 * torch.sin(2 * 3.14159 * base_freq * 2 * t)
-    # 应用包络
-    envelope = torch.ones_like(audio)
-    fade_samples = int(0.1 * sample_rate)
-    envelope[:fade_samples] = torch.linspace(0, 1, fade_samples)
-    envelope[-fade_samples:] = torch.linspace(1, 0, fade_samples)
-    audio *= envelope
-    # 保存音频
-    temp_dir = tempfile.mkdtemp()
-    audio_path = os.path.join(temp_dir, "fallback_audio.wav")
-    torchaudio.save(audio_path, audio.unsqueeze(0), sample_rate)
-    return audio_path
 def process_video_with_apis(video_file, text_prompt: str, guidance_scale: float, inference_steps: int, sample_nums: int) -> Tuple[List[str], str]:
     """使用多种 API 方法处理视频"""

 import time
 import base64
 from io import BytesIO
+import numpy as np
+import wave
 def call_huggingface_inference_api(video_file_path: str, text_prompt: str = "") -> Tuple[Optional[str], str]:
     """直接调用 Hugging Face 推理 API"""
 def create_fallback_audio(video_file_path: str, text_prompt: str) -> str:
     """创建备用演示音频（当 API 不可用时）"""
+    sample_rate = 44100  # 使用更标准的采样率
     duration = 5.0
     duration_samples = int(duration * sample_rate)
+    try:
+        # 使用 numpy 生成音频（避免 torch 依赖问题）
+        t = np.linspace(0, duration, duration_samples, dtype=np.float32)
+        # 根据文本内容生成不同类型的音频
+        if "footsteps" in text_prompt.lower() or "步" in text_prompt:
+            # 脚步声：低频节拍
+            audio = 0.4 * np.sin(2 * np.pi * 2 * t) * np.exp(-3 * (t % 0.5))
+        elif "rain" in text_prompt.lower() or "雨" in text_prompt:
+            # 雨声：白噪声
+            audio = 0.3 * np.random.randn(duration_samples)
+        elif "wind" in text_prompt.lower() or "风" in text_prompt:
+            # 风声：低频噪声
+            audio = 0.3 * np.sin(2 * np.pi * 0.5 * t) + 0.2 * np.random.randn(duration_samples)
+        elif "car" in text_prompt.lower() or "车" in text_prompt:
+            # 车辆声：混合频率
+            audio = 0.3 * np.sin(2 * np.pi * 80 * t) + 0.2 * np.sin(2 * np.pi * 120 * t)
+        else:
+            # 默认：和谐音调
+            base_freq = 220 + len(text_prompt) * 5
+            audio = 0.3 * np.sin(2 * np.pi * base_freq * t)
+            # 添加泛音
+            audio += 0.1 * np.sin(2 * np.pi * base_freq * 2 * t)
+            audio += 0.05 * np.sin(2 * np.pi * base_freq * 3 * t)
+        # 应用包络以避免突然开始/结束
+        envelope = np.ones_like(audio)
+        fade_samples = int(0.1 * sample_rate)  # 0.1秒淡入淡出
+        envelope[:fade_samples] = np.linspace(0, 1, fade_samples)
+        envelope[-fade_samples:] = np.linspace(1, 0, fade_samples)
+        audio *= envelope
+        # 保存音频文件
+        temp_dir = tempfile.mkdtemp()
+        audio_path = os.path.join(temp_dir, "fallback_audio.wav")
+        # 尝试 torchaudio 保存
+        try:
+            audio_tensor = torch.from_numpy(audio).unsqueeze(0)
+            torchaudio.save(audio_path, audio_tensor, sample_rate)
+            logger.info("✅ 使用 torchaudio 保存音频成功")
+        except Exception as e:
+            logger.warning(f"torchaudio 保存失败: {e}")
+            # 备用方法：使用 Python 内置的 wave 模块
+            logger.info("使用 wave 模块保存音频...")
+            # 规范化音频到 int16 范围
+            audio_normalized = np.clip(audio, -1.0, 1.0)
+            audio_int16 = (audio_normalized * 32767).astype(np.int16)
+            with wave.open(audio_path, 'w') as wav_file:
+                wav_file.setnchannels(1)  # 单声道
+                wav_file.setsampwidth(2)  # 16-bit
+                wav_file.setframerate(sample_rate)
+                wav_file.writeframes(audio_int16.tobytes())
+            logger.info("✅ 使用 wave 模块保存音频成功")
+        return audio_path
+    except Exception as e:
+        logger.error(f"音频生成失败: {str(e)}")
+        # 最终备用方案：创建一个简单的静音文件
+        temp_dir = tempfile.mkdtemp()
+        audio_path = os.path.join(temp_dir, "silence.wav")
+        silence = np.zeros(duration_samples, dtype=np.int16)
+        with wave.open(audio_path, 'w') as wav_file:
+            wav_file.setnchannels(1)
+            wav_file.setsampwidth(2)
+            wav_file.setframerate(sample_rate)
+            wav_file.writeframes(silence.tobytes())
+        logger.info("生成静音音频作为最终备用方案")
+        return audio_path
 def process_video_with_apis(video_file, text_prompt: str, guidance_scale: float, inference_steps: int, sample_nums: int) -> Tuple[List[str], str]:
     """使用多种 API 方法处理视频"""