John Ho committed
Commit b3db9ce · 1 Parent(s): f18bd0f

testing more efficient model loading

Files changed (1)
  1. app.py +3 -1
app.py CHANGED
@@ -24,7 +24,7 @@ subprocess.run(
 # For maximum memory efficiency, use bfloat16 if your GPU supports it, otherwise float16.
 DTYPE = (
     torch.bfloat16
-    if torch.cuda.is_available() and torch.cuda.is_bfloat16_supported()
+    if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
     else torch.float16
 )
 # DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
@@ -114,6 +114,8 @@ def inference(
         messages, return_video_kwargs=True
     )
 
+    # This prevents PyTorch from building the computation graph for gradients,
+    # saving a significant amount of memory for intermediate activations.
     with torch.no_grad():
         inputs = processor(
             text=[text],
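
For context, a minimal sketch of the two patterns this commit lands on, shown together outside the diff. It assumes a generic Hugging Face-style model and processor; run_inference, max_new_tokens, and the variable names other than DTYPE are illustrative placeholders, not the actual code in app.py.

import torch

# bfloat16 only when CUDA is present and the GPU supports it. Note that
# torch.cuda.is_bf16_supported() is the real PyTorch API (the removed
# is_bfloat16_supported() does not exist), and the is_available() check
# short-circuits so the call is never made on CPU-only machines.
DTYPE = (
    torch.bfloat16
    if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
    else torch.float16
)

def run_inference(model, processor, text):
    # torch.no_grad() stops autograd from recording the computation graph,
    # so intermediate activations are freed right away instead of being
    # kept for a backward pass that inference never runs.
    with torch.no_grad():
        inputs = processor(text=[text], return_tensors="pt").to(model.device)
        return model.generate(**inputs, max_new_tokens=128)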