garyuzair committed on
Commit b4d330b · verified · 1 Parent(s): cf7a061

Upload 7 files

Files changed (5)
  1. app.py +27 -9
  2. image_generator.py +45 -27
  3. prompt_generator.py +19 -16
  4. requirements.txt +2 -0
  5. transcriber.py +2 -2
app.py CHANGED
@@ -212,6 +212,10 @@ def main():
         # Memory optimization settings
         memory_optimization = st.toggle("Enable memory optimization", value=True,
                                         help="Reduce memory usage (recommended for Hugging Face Spaces)")
+
+        # VRAM optimization settings
+        vram_optimization = st.toggle("Enable VRAM optimization", value=True,
+                                      help="Use techniques to reduce VRAM usage on GPU (highly recommended for Hugging Face)")

         # Content settings
         st.markdown("### 🎨 Content")
@@ -219,11 +223,11 @@ def main():
         # New setting for maximum segment duration
         max_segment_duration = st.slider(
             "Maximum image duration (seconds)",
-            min_value=1.0,
+            min_value=3.0,
             max_value=5.0,
-            value=5.0,
+            value=4.0,
             step=0.5,
-            help="Maximum time each image will stay on screen (5 seconds or less)"
+            help="Each image will stay on screen between 3-5 seconds for optimal results"
         )

         # Adjust number of segments based on max duration
@@ -317,7 +321,7 @@ def main():

         # Generate a cache key based on the audio file and settings
         audio_bytes = audio_file.getvalue()
-        settings_str = f"{num_segments}_{max_segment_duration}_{animation_type}_{frames_per_animation}_{base_image_size}_{inference_steps}_{video_quality}_{selected_aspect_ratio}_{memory_optimization}"
+        settings_str = f"{num_segments}_{max_segment_duration}_{animation_type}_{frames_per_animation}_{base_image_size}_{inference_steps}_{video_quality}_{selected_aspect_ratio}_{memory_optimization}_{vram_optimization}"
         cache_key = hashlib.md5((hashlib.md5(audio_bytes).hexdigest() + settings_str).encode()).hexdigest()

         # Process button with better styling
@@ -365,10 +369,23 @@ def main():

         try:
             # Force garbage collection before starting
-            if memory_optimization:
+            if memory_optimization or vram_optimization:
                 gc.collect()
                 torch.cuda.empty_cache() if torch.cuda.is_available() else None

+            # Apply VRAM optimization settings
+            if vram_optimization:
+                # Set image generator to use VRAM optimization
+                image_generator.set_vram_optimization(True)
+
+                # Set lower inference steps when VRAM optimization is enabled
+                if inference_steps > 25:
+                    inference_steps = 25
+
+                # Use smaller base image size when VRAM optimization is enabled
+                if base_image_size > 512:
+                    base_image_size = 512
+
             # Step 1: Initialize components
             status_text.text("Initializing components...")
             status_message.markdown("🔄 **Setting up AI models...**")
@@ -408,9 +425,10 @@ def main():
                 st.warning(f"Error segmenting audio: {str(e)}. Using simplified segmentation.")
                 # Fallback: Create empty segments
                 import numpy as np
-                audio_segments = [np.zeros(16000) for _ in range(num_segments)]  # 1-second silent segments
-                total_duration = 5 * num_segments  # Assume 5 seconds per segment
-                timestamps = [(i*5, min((i+1)*5, i*5+max_segment_duration)) for i in range(num_segments)]
+                segment_duration = 4.0  # Default to 4-second segments (within 3-5 second range)
+                audio_segments = [np.zeros(int(16000 * segment_duration)) for _ in range(num_segments)]  # 4-second silent segments
+                total_duration = segment_duration * num_segments
+                timestamps = [(i*segment_duration, (i+1)*segment_duration) for i in range(num_segments)]

             progress_bar.progress(15)

@@ -607,7 +625,7 @@ def main():
                 audio_file,
                 segments=transcriptions,
                 timestamps=timestamps,
-                parallel=parallel_processing and not memory_optimization,  # Disable parallel for memory optimization
+                parallel=parallel_processing and not (memory_optimization or vram_optimization),  # Disable parallel for memory/VRAM optimization
                 max_workers=max_workers
             )
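A quick check of the new fallback segmentation arithmetic (a minimal sketch; the 16 kHz rate and 4-second default come from the diff, the segment count here is a made-up example):

import numpy as np

num_segments = 3
segment_duration = 4.0  # default used by the fallback path
audio_segments = [np.zeros(int(16000 * segment_duration)) for _ in range(num_segments)]
total_duration = segment_duration * num_segments  # 12.0 seconds
timestamps = [(i * segment_duration, (i + 1) * segment_duration) for i in range(num_segments)]
# timestamps == [(0.0, 4.0), (4.0, 8.0), (8.0, 12.0)]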
 
image_generator.py CHANGED
@@ -12,10 +12,15 @@ class ImageGenerator:
         self.model = None
         self.processor = None
         self.target_size = (512, 512)
-        self.inference_steps = 20
-        self.guidance_scale = 7.5
+        self.inference_steps = 30  # Increased for better quality
+        self.guidance_scale = 8.5  # Increased for better adherence to prompt
         self.aspect_ratio = "1:1"  # Default aspect ratio
         self.image_cache = {}
+        self.vram_optimization = False  # Default to no VRAM optimization
+
+    def set_vram_optimization(self, enabled):
+        """Enable or disable VRAM optimization techniques"""
+        self.vram_optimization = enabled

     def set_aspect_ratio(self, aspect_ratio):
         """Set the aspect ratio for image generation"""
@@ -74,37 +79,43 @@ class ImageGenerator:

                 from diffusers import StableDiffusionPipeline

-                # Use the correct model ID as specified
-                model_id = "sd-legacy/stable-diffusion-v1-5"
+                # Use a more reliable model ID
+                model_id = "stabilityai/stable-diffusion-2-1"

-                # For CPU-only environments like Hugging Face Spaces free tier
+                # Optimize for Hugging Face Spaces with memory constraints
                 self.model = StableDiffusionPipeline.from_pretrained(
                     model_id,
-                    torch_dtype=torch.float32,  # Use float32 for CPU
+                    torch_dtype=torch.float16,  # Use float16 for memory efficiency
                     safety_checker=None,  # Disable safety checker for speed
-                    low_cpu_mem_usage=True,  # Optimize for low memory
-                    revision="fp16"  # Use fp16 weights but convert to fp32
+                    variant="fp16",  # Use fp16 variant
+                    use_safetensors=True  # Use safetensors for better memory usage
                 )

-                # Optimize for CPU
-                self.model = self.model.to("cpu")
+                # Use CUDA if available, otherwise CPU
+                device = "cuda" if torch.cuda.is_available() else "cpu"
+                self.model = self.model.to(device)

                 # Enable memory efficient attention
-                if hasattr(self.model, "enable_attention_slicing"):
-                    self.model.enable_attention_slicing(1)
+                self.model.enable_attention_slicing()

-                # Enable sequential CPU offload if available
-                if hasattr(self.model, "enable_sequential_cpu_offload"):
-                    self.model.enable_sequential_cpu_offload()
+                # Enable xformers attention if available for better memory efficiency
+                try:
+                    import xformers
+                    self.model.enable_xformers_memory_efficient_attention()
+                except (ImportError, AttributeError):
+                    pass

-                # Enable model CPU offloading if available
-                if hasattr(self.model, "enable_model_cpu_offload"):
+                # Enable model CPU offloading if on CPU
+                if device == "cpu" and hasattr(self.model, "enable_model_cpu_offload"):
                     self.model.enable_model_cpu_offload()

-                # Use smaller VAE scale factor for memory efficiency
-                if hasattr(self.model, "vae") and hasattr(self.model.vae, "config"):
-                    if hasattr(self.model.vae.config, "scaling_factor"):
-                        self.model.vae.config.scaling_factor = 0.18215  # Default value, explicitly set
+                # Enable sequential CPU offload if on CPU
+                if device == "cpu" and hasattr(self.model, "enable_sequential_cpu_offload"):
+                    self.model.enable_sequential_cpu_offload()
+
+                # Use tiled VAE for larger images with less memory
+                if hasattr(self.model, "vae") and hasattr(self.model.vae, "enable_tiling"):
+                    self.model.vae.enable_tiling()

             except Exception as e:
                 st.warning(f"Error loading image generation model: {str(e)}. Using fallback method.")
@@ -112,8 +123,13 @@ class ImageGenerator:

         return self.model

-    def generate_image(self, prompt, negative_prompt="blurry, bad quality, distorted, disfigured, low resolution"):
-        """Generate an image from a text prompt"""
+    def generate_image(self, prompt, negative_prompt="blurry, bad quality, distorted, disfigured, low resolution, worst quality, deformed"):
+        """Generate an image from a text prompt with optimized settings"""
+        # Apply VRAM optimization if enabled
+        inference_steps = self.inference_steps
+        if self.vram_optimization:
+            # Reduce inference steps for VRAM optimization
+            inference_steps = min(inference_steps, 25)
         # Generate a cache key based on the prompt and settings
         import hashlib
         cache_key = f"{hashlib.md5(prompt.encode()).hexdigest()}_{self.target_size}_{self.inference_steps}_{self.guidance_scale}_{self.aspect_ratio}"
@@ -137,14 +153,16 @@ class ImageGenerator:
         gc.collect()
         torch.cuda.empty_cache() if torch.cuda.is_available() else None

-        # Generate the image
+        # Generate the image with optimized settings
         with torch.no_grad():  # Disable gradient calculation for memory efficiency
-            # Use lower precision during inference
-            with torch.autocast("cpu"):
+            # Use autocast for the appropriate device
+            device = "cuda" if torch.cuda.is_available() else "cpu"
+            with torch.autocast(device):
+                # Generate image with better settings
                 image = model(
                     prompt=enhanced_prompt,
                     negative_prompt=negative_prompt,
-                    num_inference_steps=self.inference_steps,
+                    num_inference_steps=inference_steps,  # Use optimized inference steps
                     guidance_scale=self.guidance_scale,
                     width=self.target_size[0],
                     height=self.target_size[1]
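For reference, a minimal standalone sketch of the loading path this diff switches to (same model ID and pipeline options; it assumes a CUDA device, since the fp16 weights would otherwise have to be cast back to float32 for CPU inference, and the prompt is a placeholder):

import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1",
    torch_dtype=torch.float16,
    variant="fp16",
    use_safetensors=True,
    safety_checker=None,
)
pipe = pipe.to("cuda")
pipe.enable_attention_slicing()  # lower peak memory at a small speed cost
if hasattr(pipe.vae, "enable_tiling"):
    pipe.vae.enable_tiling()  # tiled VAE decode for larger images

image = pipe(
    "a cinematic photo of a lighthouse at dusk",
    negative_prompt="blurry, bad quality, distorted",
    num_inference_steps=25,
    guidance_scale=8.5,
    width=512,
    height=512,
).images[0]
image.save("preview.png")
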
prompt_generator.py CHANGED
@@ -1,6 +1,6 @@
 import streamlit as st
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

 class PromptGenerator:
     def __init__(self):
@@ -13,17 +13,17 @@ class PromptGenerator:
         if self.model is None:
             with st.spinner("Loading text-to-prompt model..."):
                 try:
-                    # Using a lightweight model for prompt generation
-                    model_name = "distilgpt2"
+                    # Using BART model for better prompt enhancement
+                    model_name = "facebook/bart-large-cnn"

                     # Load tokenizer and model separately to avoid device issues
                     self.tokenizer = AutoTokenizer.from_pretrained(model_name)

-                    # Load model with specific device placement to avoid meta tensor issues
-                    self.model = AutoModelForCausalLM.from_pretrained(
+                    # Load model with optimizations for memory efficiency
+                    self.model = AutoModelForSeq2SeqLM.from_pretrained(
                         model_name,
                         low_cpu_mem_usage=True,
-                        torch_dtype=torch.float32
+                        torch_dtype=torch.float16
                     )

                     # Explicitly move to CPU to avoid meta tensor issues
@@ -37,7 +37,7 @@ class PromptGenerator:
         return self.model, self.tokenizer

     def generate_hyper_realistic_prompt(self, transcription, aspect_ratio="16:9"):
-        """Generate a hyper-realistic prompt from a transcription with cinematic quality"""
+        """Generate a hyper-realistic prompt from a transcription with cinematic quality using BART model"""
         # Check cache first
         import hashlib
         cache_key = hashlib.md5((transcription + aspect_ratio).encode()).hexdigest()
@@ -133,26 +133,29 @@ class PromptGenerator:

         if model is not None and tokenizer is not None:
             # Create a prompt template focused on visual elements
-            template = f"Create a hyper-realistic visual scene for: '{base_prompt}'"
+            template = f"Transform this text into a detailed visual description for image generation: {base_prompt}"

-            # Tokenize
-            inputs = tokenizer(template, return_tensors="pt")
+            # Tokenize for seq2seq model
+            inputs = tokenizer(template, return_tensors="pt", max_length=512, truncation=True)

-            # Generate with minimal tokens to save resources
+            # Generate with improved parameters for better descriptions
             with torch.no_grad():
                 outputs = model.generate(
                     inputs["input_ids"],
-                    max_new_tokens=30,
+                    max_length=150,  # Allow longer outputs for better descriptions
+                    min_length=50,  # Ensure substantial descriptions
+                    num_beams=4,  # Beam search for better quality
+                    no_repeat_ngram_size=3,  # Avoid repetition
                     num_return_sequences=1,
-                    pad_token_id=tokenizer.eos_token_id
+                    early_stopping=True
                 )

             # Decode the generated text
             generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
-            generated_text = generated_text.replace(template, "").strip()
+            # No need to replace template as seq2seq models generate new text

-            # Create an optimized prompt with style keywords
-            scene_description = f"{base_prompt} {generated_text}"
+            # Use the BART-generated description directly as it's more comprehensive
+            scene_description = generated_text
         else:
             # Fallback method using the base prompt
             scene_description = base_prompt
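A rough sketch of the new seq2seq rewriting path (facebook/bart-large-cnn is a summarization checkpoint, so it tends to condense the template rather than embellish it; the model is loaded in its default float32 here so the snippet also runs on CPU, whereas the diff requests torch.float16, and the input sentence is a made-up example):

import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")

template = ("Transform this text into a detailed visual description for image generation: "
            "a man walks along a rainy street at night")
inputs = tokenizer(template, return_tensors="pt", max_length=512, truncation=True)

with torch.no_grad():
    outputs = model.generate(
        inputs["input_ids"],
        max_length=150,
        min_length=50,
        num_beams=4,
        no_repeat_ngram_size=3,
        num_return_sequences=1,
        early_stopping=True,
    )

print(tokenizer.decode(outputs[0], skip_special_tokens=True))
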
requirements.txt CHANGED
@@ -14,3 +14,5 @@ soundfile==0.12.1
 huggingface-hub==0.16.4
 ftfy==6.1.1
 regex==2023.6.3
+safetensors==0.3.1
+xformers==0.0.20
transcriber.py CHANGED
@@ -38,8 +38,8 @@ class AudioTranscriber:

         return self.model

-    def segment_audio(self, audio_file, num_segments=5, min_segment_duration=1.0):
-        """Segment the audio file into chunks for processing with maximum duration limit"""
+    def segment_audio(self, audio_file, num_segments=5, min_segment_duration=3.0):
+        """Segment the audio file into chunks for processing with minimum 3-second and maximum 5-second duration"""
         # Save the uploaded audio to a temporary file
         with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
             tmp_file.write(audio_file.getvalue())
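The diff only changes the default minimum duration and the docstring; as a purely hypothetical illustration of the 3-5 second window it describes (this helper is not part of the commit):

def clamp_segment_duration(duration, min_segment_duration=3.0, max_segment_duration=5.0):
    """Clamp a proposed segment duration into the allowed 3-5 second window."""
    return max(min_segment_duration, min(duration, max_segment_duration))

assert clamp_segment_duration(1.0) == 3.0  # too short -> stretched to the minimum
assert clamp_segment_duration(4.2) == 4.2  # already inside the window
assert clamp_segment_duration(7.5) == 5.0  # too long -> capped at the maximum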