garyuzair committed on
Commit b4d330b · verified · 1 Parent(s): cf7a061

Upload 7 files

Files changed (5)
  1. app.py +27 -9
  2. image_generator.py +45 -27
  3. prompt_generator.py +19 -16
  4. requirements.txt +2 -0
  5. transcriber.py +2 -2
app.py CHANGED
@@ -212,6 +212,10 @@ def main():
         # Memory optimization settings
         memory_optimization = st.toggle("Enable memory optimization", value=True,
                                         help="Reduce memory usage (recommended for Hugging Face Spaces)")
+
+        # VRAM optimization settings
+        vram_optimization = st.toggle("Enable VRAM optimization", value=True,
+                                      help="Use techniques to reduce VRAM usage on GPU (highly recommended for Hugging Face)")

         # Content settings
         st.markdown("### 🎨 Content")
@@ -219,11 +223,11 @@ def main():
         # New setting for maximum segment duration
         max_segment_duration = st.slider(
             "Maximum image duration (seconds)",
-            min_value=1.0,
+            min_value=3.0,
             max_value=5.0,
-            value=5.0,
+            value=4.0,
             step=0.5,
-            help="Maximum time each image will stay on screen (5 seconds or less)"
+            help="Each image will stay on screen between 3-5 seconds for optimal results"
         )

         # Adjust number of segments based on max duration
@@ -317,7 +321,7 @@ def main():

         # Generate a cache key based on the audio file and settings
         audio_bytes = audio_file.getvalue()
-        settings_str = f"{num_segments}_{max_segment_duration}_{animation_type}_{frames_per_animation}_{base_image_size}_{inference_steps}_{video_quality}_{selected_aspect_ratio}_{memory_optimization}"
+        settings_str = f"{num_segments}_{max_segment_duration}_{animation_type}_{frames_per_animation}_{base_image_size}_{inference_steps}_{video_quality}_{selected_aspect_ratio}_{memory_optimization}_{vram_optimization}"
         cache_key = hashlib.md5((hashlib.md5(audio_bytes).hexdigest() + settings_str).encode()).hexdigest()

         # Process button with better styling
@@ -365,10 +369,23 @@ def main():

         try:
             # Force garbage collection before starting
-            if memory_optimization:
+            if memory_optimization or vram_optimization:
                 gc.collect()
                 torch.cuda.empty_cache() if torch.cuda.is_available() else None

+            # Apply VRAM optimization settings
+            if vram_optimization:
+                # Set image generator to use VRAM optimization
+                image_generator.set_vram_optimization(True)
+
+                # Set lower inference steps when VRAM optimization is enabled
+                if inference_steps > 25:
+                    inference_steps = 25
+
+                # Use smaller base image size when VRAM optimization is enabled
+                if base_image_size > 512:
+                    base_image_size = 512
+
             # Step 1: Initialize components
             status_text.text("Initializing components...")
             status_message.markdown("🔄 **Setting up AI models...**")
@@ -408,9 +425,10 @@ def main():
                 st.warning(f"Error segmenting audio: {str(e)}. Using simplified segmentation.")
                 # Fallback: Create empty segments
                 import numpy as np
-                audio_segments = [np.zeros(16000) for _ in range(num_segments)]  # 1-second silent segments
-                total_duration = 5 * num_segments  # Assume 5 seconds per segment
-                timestamps = [(i*5, min((i+1)*5, i*5+max_segment_duration)) for i in range(num_segments)]
+                segment_duration = 4.0  # Default to 4-second segments (within 3-5 second range)
+                audio_segments = [np.zeros(int(16000 * segment_duration)) for _ in range(num_segments)]  # 4-second silent segments
+                total_duration = segment_duration * num_segments
+                timestamps = [(i*segment_duration, (i+1)*segment_duration) for i in range(num_segments)]

             progress_bar.progress(15)

@@ -607,7 +625,7 @@ def main():
                 audio_file,
                 segments=transcriptions,
                 timestamps=timestamps,
-                parallel=parallel_processing and not memory_optimization,  # Disable parallel for memory optimization
+                parallel=parallel_processing and not (memory_optimization or vram_optimization),  # Disable parallel for memory/VRAM optimization
                 max_workers=max_workers
             )
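A quick check of the new fallback segmentation arithmetic (a minimal sketch; the 16 kHz rate and 4-second default come from the diff, the segment count here is a made-up example):

import numpy as np

num_segments = 3
segment_duration = 4.0  # default used by the fallback path
audio_segments = [np.zeros(int(16000 * segment_duration)) for _ in range(num_segments)]
total_duration = segment_duration * num_segments  # 12.0 seconds
timestamps = [(i * segment_duration, (i + 1) * segment_duration) for i in range(num_segments)]
# timestamps == [(0.0, 4.0), (4.0, 8.0), (8.0, 12.0)]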
 
image_generator.py CHANGED
@@ -12,10 +12,15 @@ class ImageGenerator:
         self.model = None
         self.processor = None
         self.target_size = (512, 512)
-        self.inference_steps = 20
-        self.guidance_scale = 7.5
+        self.inference_steps = 30  # Increased for better quality
+        self.guidance_scale = 8.5  # Increased for better adherence to prompt
         self.aspect_ratio = "1:1"  # Default aspect ratio
         self.image_cache = {}
+        self.vram_optimization = False  # Default to no VRAM optimization
+
+    def set_vram_optimization(self, enabled):
+        """Enable or disable VRAM optimization techniques"""
+        self.vram_optimization = enabled

     def set_aspect_ratio(self, aspect_ratio):
         """Set the aspect ratio for image generation"""
@@ -74,37 +79,43 @@ class ImageGenerator:

                 from diffusers import StableDiffusionPipeline

-                # Use the correct model ID as specified
-                model_id = "sd-legacy/stable-diffusion-v1-5"
+                # Use a more reliable model ID
+                model_id = "stabilityai/stable-diffusion-2-1"

-                # For CPU-only environments like Hugging Face Spaces free tier
+                # Optimize for Hugging Face Spaces with memory constraints
                 self.model = StableDiffusionPipeline.from_pretrained(
                     model_id,
-                    torch_dtype=torch.float32,  # Use float32 for CPU
+                    torch_dtype=torch.float16,  # Use float16 for memory efficiency
                     safety_checker=None,  # Disable safety checker for speed
-                    low_cpu_mem_usage=True,  # Optimize for low memory
-                    revision="fp16"  # Use fp16 weights but convert to fp32
+                    variant="fp16",  # Use fp16 variant
+                    use_safetensors=True  # Use safetensors for better memory usage
                 )

-                # Optimize for CPU
-                self.model = self.model.to("cpu")
+                # Use CUDA if available, otherwise CPU
+                device = "cuda" if torch.cuda.is_available() else "cpu"
+                self.model = self.model.to(device)

                 # Enable memory efficient attention
-                if hasattr(self.model, "enable_attention_slicing"):
-                    self.model.enable_attention_slicing(1)
+                self.model.enable_attention_slicing()

-                # Enable sequential CPU offload if available
-                if hasattr(self.model, "enable_sequential_cpu_offload"):
-                    self.model.enable_sequential_cpu_offload()
+                # Enable xformers attention if available for better memory efficiency
+                try:
+                    import xformers
+                    self.model.enable_xformers_memory_efficient_attention()
+                except (ImportError, AttributeError):
+                    pass

-                # Enable model CPU offloading if available
-                if hasattr(self.model, "enable_model_cpu_offload"):
+                # Enable model CPU offloading if on CPU
+                if device == "cpu" and hasattr(self.model, "enable_model_cpu_offload"):
                     self.model.enable_model_cpu_offload()

-                # Use smaller VAE scale factor for memory efficiency
-                if hasattr(self.model, "vae") and hasattr(self.model.vae, "config"):
-                    if hasattr(self.model.vae.config, "scaling_factor"):
-                        self.model.vae.config.scaling_factor = 0.18215  # Default value, explicitly set
+                # Enable sequential CPU offload if on CPU
+                if device == "cpu" and hasattr(self.model, "enable_sequential_cpu_offload"):
+                    self.model.enable_sequential_cpu_offload()
+
+                # Use tiled VAE for larger images with less memory
+                if hasattr(self.model, "vae") and hasattr(self.model.vae, "enable_tiling"):
+                    self.model.vae.enable_tiling()

             except Exception as e:
                 st.warning(f"Error loading image generation model: {str(e)}. Using fallback method.")
@@ -112,8 +123,13 @@ class ImageGenerator:

         return self.model

-    def generate_image(self, prompt, negative_prompt="blurry, bad quality, distorted, disfigured, low resolution"):
-        """Generate an image from a text prompt"""
+    def generate_image(self, prompt, negative_prompt="blurry, bad quality, distorted, disfigured, low resolution, worst quality, deformed"):
+        """Generate an image from a text prompt with optimized settings"""
+        # Apply VRAM optimization if enabled
+        inference_steps = self.inference_steps
+        if self.vram_optimization:
+            # Reduce inference steps for VRAM optimization
+            inference_steps = min(inference_steps, 25)
         # Generate a cache key based on the prompt and settings
         import hashlib
         cache_key = f"{hashlib.md5(prompt.encode()).hexdigest()}_{self.target_size}_{self.inference_steps}_{self.guidance_scale}_{self.aspect_ratio}"
@@ -137,14 +153,16 @@ class ImageGenerator:
         gc.collect()
         torch.cuda.empty_cache() if torch.cuda.is_available() else None

-        # Generate the image
+        # Generate the image with optimized settings
         with torch.no_grad():  # Disable gradient calculation for memory efficiency
-            # Use lower precision during inference
-            with torch.autocast("cpu"):
+            # Use autocast for the appropriate device
+            device = "cuda" if torch.cuda.is_available() else "cpu"
+            with torch.autocast(device):
+                # Generate image with better settings
                 image = model(
                     prompt=enhanced_prompt,
                     negative_prompt=negative_prompt,
-                    num_inference_steps=self.inference_steps,
+                    num_inference_steps=inference_steps,  # Use optimized inference steps
                     guidance_scale=self.guidance_scale,
                     width=self.target_size[0],
                     height=self.target_size[1]
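For reference, a minimal standalone sketch of the loading path this diff switches to (same model ID and pipeline options; it assumes a CUDA device, since the fp16 weights would otherwise have to be cast back to float32 for CPU inference, and the prompt is a placeholder):

import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1",
    torch_dtype=torch.float16,
    variant="fp16",
    use_safetensors=True,
    safety_checker=None,
)
pipe = pipe.to("cuda")
pipe.enable_attention_slicing()  # lower peak memory at a small speed cost
if hasattr(pipe.vae, "enable_tiling"):
    pipe.vae.enable_tiling()  # tiled VAE decode for larger images

image = pipe(
    "a cinematic photo of a lighthouse at dusk",
    negative_prompt="blurry, bad quality, distorted",
    num_inference_steps=25,
    guidance_scale=8.5,
    width=512,
    height=512,
).images[0]
image.save("preview.png")
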
prompt_generator.py CHANGED
@@ -1,6 +1,6 @@
 import streamlit as st
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

 class PromptGenerator:
     def __init__(self):
@@ -13,17 +13,17 @@ class PromptGenerator:
         if self.model is None:
             with st.spinner("Loading text-to-prompt model..."):
                 try:
-                    # Using a lightweight model for prompt generation
-                    model_name = "distilgpt2"
+                    # Using BART model for better prompt enhancement
+                    model_name = "facebook/bart-large-cnn"

                     # Load tokenizer and model separately to avoid device issues
                     self.tokenizer = AutoTokenizer.from_pretrained(model_name)

-                    # Load model with specific device placement to avoid meta tensor issues
-                    self.model = AutoModelForCausalLM.from_pretrained(
+                    # Load model with optimizations for memory efficiency
+                    self.model = AutoModelForSeq2SeqLM.from_pretrained(
                         model_name,
                         low_cpu_mem_usage=True,
-                        torch_dtype=torch.float32
+                        torch_dtype=torch.float16
                     )

                     # Explicitly move to CPU to avoid meta tensor issues
@@ -37,7 +37,7 @@ class PromptGenerator:
         return self.model, self.tokenizer

     def generate_hyper_realistic_prompt(self, transcription, aspect_ratio="16:9"):
-        """Generate a hyper-realistic prompt from a transcription with cinematic quality"""
+        """Generate a hyper-realistic prompt from a transcription with cinematic quality using BART model"""
         # Check cache first
         import hashlib
         cache_key = hashlib.md5((transcription + aspect_ratio).encode()).hexdigest()
@@ -133,26 +133,29 @@ class PromptGenerator:

         if model is not None and tokenizer is not None:
             # Create a prompt template focused on visual elements
-            template = f"Create a hyper-realistic visual scene for: '{base_prompt}'"
+            template = f"Transform this text into a detailed visual description for image generation: {base_prompt}"

-            # Tokenize
-            inputs = tokenizer(template, return_tensors="pt")
+            # Tokenize for seq2seq model
+            inputs = tokenizer(template, return_tensors="pt", max_length=512, truncation=True)

-            # Generate with minimal tokens to save resources
+            # Generate with improved parameters for better descriptions
             with torch.no_grad():
                 outputs = model.generate(
                     inputs["input_ids"],
-                    max_new_tokens=30,
+                    max_length=150,  # Allow longer outputs for better descriptions
+                    min_length=50,  # Ensure substantial descriptions
+                    num_beams=4,  # Beam search for better quality
+                    no_repeat_ngram_size=3,  # Avoid repetition
                     num_return_sequences=1,
-                    pad_token_id=tokenizer.eos_token_id
+                    early_stopping=True
                 )

             # Decode the generated text
             generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
-            generated_text = generated_text.replace(template, "").strip()
+            # No need to replace template as seq2seq models generate new text

-            # Create an optimized prompt with style keywords
-            scene_description = f"{base_prompt} {generated_text}"
+            # Use the BART-generated description directly as it's more comprehensive
+            scene_description = generated_text
         else:
             # Fallback method using the base prompt
             scene_description = base_prompt
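A rough sketch of the new seq2seq rewriting path (facebook/bart-large-cnn is a summarization checkpoint, so it tends to condense the template rather than embellish it; the model is loaded in its default float32 here so the snippet also runs on CPU, whereas the diff requests torch.float16, and the input sentence is a made-up example):

import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")

template = ("Transform this text into a detailed visual description for image generation: "
            "a man walks along a rainy street at night")
inputs = tokenizer(template, return_tensors="pt", max_length=512, truncation=True)

with torch.no_grad():
    outputs = model.generate(
        inputs["input_ids"],
        max_length=150,
        min_length=50,
        num_beams=4,
        no_repeat_ngram_size=3,
        num_return_sequences=1,
        early_stopping=True,
    )

print(tokenizer.decode(outputs[0], skip_special_tokens=True))
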
requirements.txt CHANGED
@@ -14,3 +14,5 @@ soundfile==0.12.1
 huggingface-hub==0.16.4
 ftfy==6.1.1
 regex==2023.6.3
+safetensors==0.3.1
+xformers==0.0.20
transcriber.py CHANGED
@@ -38,8 +38,8 @@ class AudioTranscriber:

         return self.model

-    def segment_audio(self, audio_file, num_segments=5, min_segment_duration=1.0):
-        """Segment the audio file into chunks for processing with maximum duration limit"""
+    def segment_audio(self, audio_file, num_segments=5, min_segment_duration=3.0):
+        """Segment the audio file into chunks for processing with minimum 3-second and maximum 5-second duration"""
         # Save the uploaded audio to a temporary file
         with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
             tmp_file.write(audio_file.getvalue())
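The diff only changes the default minimum duration and the docstring; as a purely hypothetical illustration of the 3-5 second window it describes (this helper is not part of the commit):

def clamp_segment_duration(duration, min_segment_duration=3.0, max_segment_duration=5.0):
    """Clamp a proposed segment duration into the allowed 3-5 second window."""
    return max(min_segment_duration, min(duration, max_segment_duration))

assert clamp_segment_duration(1.0) == 3.0  # too short -> stretched to the minimum
assert clamp_segment_duration(4.2) == 4.2  # already inside the window
assert clamp_segment_duration(7.5) == 5.0  # too long -> capped at the maximum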