import os
import tempfile
from typing import Tuple

import gradio as gr
import torch
import torchaudio
from loguru import logger

# Simplified working version without loading large models

def create_demo_audio(video_file, text_prompt: str, duration: float = 5.0) -> str:
    """Create a simple demo audio file (a synthetic tone, not real Foley)."""
    sample_rate = 48000
    duration_samples = int(duration * sample_rate)

    # Generate a simple sine tone as the demo signal
    t = torch.linspace(0, duration, duration_samples)
    frequency = 440  # A4 note
    audio = 0.3 * torch.sin(2 * torch.pi * frequency * t)

    # Add some variation based on the text prompt length
    if text_prompt:
        freq_mod = len(text_prompt) * 10
        audio += 0.1 * torch.sin(2 * torch.pi * freq_mod * t)

    # Save to a temporary file (torchaudio expects a (channels, samples) tensor)
    temp_dir = tempfile.mkdtemp()
    audio_path = os.path.join(temp_dir, "demo_audio.wav")
    torchaudio.save(audio_path, audio.unsqueeze(0), sample_rate)
    return audio_path
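
# A minimal usage sketch for create_demo_audio (hypothetical file name and
# prompt; the video file is unused by the synthesis itself, only the prompt
# length affects the tone):
#
#   path = create_demo_audio("clip.mp4", "footsteps on gravel", duration=2.0)
#   waveform, sr = torchaudio.load(path)
#   assert sr == 48000 and waveform.shape == (1, 96000)
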
def process_video_demo(video_file, text_prompt: str, guidance_scale: float, inference_steps: int, sample_nums: int) -> Tuple[list, str]:
    """Working demo version that generates simple synthetic audio."""
    if video_file is None:
        return [], "❌ Please upload a video file!"
    if text_prompt is None:
        text_prompt = ""

    try:
        logger.info(f"Processing video in demo mode: {video_file}")
        logger.info(f"Text prompt: {text_prompt}")

        # Generate simple demo audio, one clip per requested sample
        video_outputs = []
        for i in range(min(sample_nums, 3)):  # limit to 3 samples
            demo_audio = create_demo_audio(video_file, f"{text_prompt}_sample_{i+1}")
            # For the demo, just return the audio file path;
            # a real implementation would merge this audio back into the video
            video_outputs.append(demo_audio)

        video_name = os.path.basename(video_file) if isinstance(video_file, str) else getattr(video_file, "name", "Video file")
        success_msg = f"""✅ Demo Generation Complete!

📹 **Processed**: {video_name}
📝 **Prompt**: "{text_prompt}"
⚙️ **Settings**: CFG={guidance_scale}, Steps={inference_steps}, Samples={sample_nums}
🎵 **Generated**: {len(video_outputs)} demo audio sample(s)

⚠️ **Note**: This is a working demo with synthetic audio.
For real AI-generated Foley audio, run locally with the full model:
https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley"""
        return video_outputs, success_msg

    except Exception as e:
        logger.error(f"Demo processing failed: {e}")
        return [], f"❌ Demo processing failed: {e}"
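
# Return-shape sketch for process_video_demo (hypothetical inputs): on success
# it yields up to three generated file paths plus a status string, e.g.
#
#   files, msg = process_video_demo("clip.mp4", "rain", 4.0, 50, 2)
#   # files -> ["/tmp/.../demo_audio.wav", "/tmp/.../demo_audio.wav"]
#   # msg   -> "✅ Demo Generation Complete! ..."
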
def create_working_interface():
    """Create a working Gradio interface."""
    css = """
    .gradio-container {
        font-family: 'Inter', sans-serif;
        background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
    }
    .main-header {
        text-align: center;
        padding: 2rem;
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        border-radius: 20px;
        margin-bottom: 2rem;
        color: white;
    }
    .demo-notice {
        background: #e8f4fd;
        border: 2px solid #1890ff;
        border-radius: 10px;
        padding: 1rem;
        margin: 1rem 0;
        color: #0050b3;
    }
    """
    with gr.Blocks(css=css, title="HunyuanVideo-Foley Demo") as app:
        # Header
        with gr.Column(elem_classes=["main-header"]):
            gr.HTML("""
                <h1>🎵 HunyuanVideo-Foley</h1>
                <p>Working Demo Version</p>
            """)

        # Demo notice
        gr.HTML("""
            <div class="demo-notice">
                <strong>🎯 Working Demo:</strong> This version generates synthetic audio to demonstrate the interface.
                Upload a video and try the controls to see how it works!<br>
                <strong>For real AI audio:</strong> Visit the <a href="https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley" target="_blank">original repository</a>
            </div>
        """)
        with gr.Row():
            # Input section
            with gr.Column(scale=1):
                gr.Markdown("### 📹 Video Input")
                video_input = gr.Video(
                    label="Upload Video (any video file works for testing)"
                )
                text_input = gr.Textbox(
                    label="🎯 Audio Description",
                    placeholder="Describe the audio you want (affects the demo tone)",
                    lines=3
                )
                with gr.Row():
                    guidance_scale = gr.Slider(
                        minimum=1.0,
                        maximum=10.0,
                        value=4.0,
                        step=0.1,
                        label="🎚️ CFG Scale"
                    )
                    inference_steps = gr.Slider(
                        minimum=10,
                        maximum=100,
                        value=50,
                        step=5,
                        label="⚡ Steps"
                    )
                    sample_nums = gr.Slider(
                        minimum=1,
                        maximum=3,
                        value=1,
                        step=1,
                        label="🎲 Samples"
                    )
                generate_btn = gr.Button("🎵 Generate Demo Audio", variant="primary")

            # Output section
            with gr.Column(scale=1):
                gr.Markdown("### 🎵 Generated Audio")
                audio_output_1 = gr.Audio(label="Sample 1", visible=True)
                audio_output_2 = gr.Audio(label="Sample 2", visible=False)
                audio_output_3 = gr.Audio(label="Sample 3", visible=False)
                status_output = gr.Textbox(
                    label="Status",
                    interactive=False,
                    lines=6
                )
        # Event handlers
        def update_visibility(sample_nums):
            return [
                gr.update(visible=True),              # Sample 1 is always visible
                gr.update(visible=sample_nums >= 2),
                gr.update(visible=sample_nums >= 3),
            ]
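
        # Visibility sketch: slider values arrive as plain numbers, so for
        # sample_nums == 2 this shows players 1-2 and hides player 3:
        #   update_visibility(2) -> [visible=True, visible=True, visible=False]
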
        def process_demo(video_file, text_prompt, guidance_scale, inference_steps, sample_nums):
            audio_files, status_msg = process_video_demo(
                video_file, text_prompt, guidance_scale, inference_steps, int(sample_nums)
            )
            # Prepare one value per audio player slot
            outputs = [None, None, None]
            for i, audio_file in enumerate(audio_files[:3]):
                outputs[i] = audio_file
            return outputs[0], outputs[1], outputs[2], status_msg
        # Connect events
        sample_nums.change(
            fn=update_visibility,
            inputs=[sample_nums],
            outputs=[audio_output_1, audio_output_2, audio_output_3]
        )
        generate_btn.click(
            fn=process_demo,
            inputs=[video_input, text_input, guidance_scale, inference_steps, sample_nums],
            outputs=[audio_output_1, audio_output_2, audio_output_3, status_output]
        )

        # Footer
        gr.HTML("""
            <div style="text-align: center; padding: 2rem; color: #666;">
                <p>📝 <strong>Demo Version:</strong> Generates synthetic audio for interface demonstration</p>
                <p>🔗 <strong>Full Version:</strong> <a href="https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley" target="_blank">GitHub Repository</a></p>
            </div>
        """)

    return app
if __name__ == "__main__":
    # Set up logging
    logger.remove()
    logger.add(lambda msg: print(msg, end=""), level="INFO")
    logger.info("Starting HunyuanVideo-Foley Working Demo...")

    # Create and launch the app
    app = create_working_interface()
    logger.info("Demo app ready - will generate synthetic audio for testing")
    app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        debug=False,
        show_error=True
    )
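
# Launch sketch: the 0.0.0.0:7860 binding above matches what Hugging Face
# Spaces expects. For a quick public link when running on a local machine, a
# hedged alternative is Gradio's built-in tunnel:
#   app.launch(share=True)  # serves a temporary *.gradio.live URL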