arjunanand13 committed on
Commit
3f6e172
·
verified ·
1 Parent(s): bb1a482

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +113 -0
app.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cv2
2
+ import torch
3
+ from PIL import Image
4
+ import numpy as np
5
+ import os
6
+ import shutil
7
+ import gradio as gr
8
+ from transformers import LlavaNextVideoProcessor, LlavaNextVideoForConditionalGeneration, BitsAndBytesConfig
9
+
10
# Name of the torch device to prefer: the GPU when CUDA is available,
# otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Hugging Face Hub identifier of the checkpoint used for both the model
# weights and the matching processor.
model_id = "llava-hf/LLaVA-NeXT-Video-7B-hf"

# 4-bit NF4 weight quantization with double quantization enabled; compute
# is carried out in float16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Load the quantized model once at import time. Weight placement is left to
# ``device_map="auto"`` rather than being driven by ``device`` above.
_model_load_options = {
    "quantization_config": quantization_config,
    "low_cpu_mem_usage": True,
    "device_map": "auto",
}
model = LlavaNextVideoForConditionalGeneration.from_pretrained(
    model_id, **_model_load_options
)

# Processor paired with the checkpoint; it builds the chat prompt and turns
# text plus video frames into model inputs.
processor = LlavaNextVideoProcessor.from_pretrained(model_id)
28
+
29
def sample_frames(video_path, num_frames):
    """Extract up to ``num_frames`` frames spread evenly across a video.

    Every selected frame is written as a JPEG into ``/tmp/processed_frames``
    and is also returned as an RGB PIL image.

    Args:
        video_path: Path of the video file to open with OpenCV.
        num_frames: Maximum number of frames to sample; must be at least 1.

    Returns:
        A ``(frames, frame_paths)`` tuple: the sampled frames as PIL images
        and the paths of the JPEG files written for them, both in video
        order. Both lists are empty when the container reports no frames or
        none of the selected frames could be decoded.

    Raises:
        ValueError: If ``num_frames`` is less than 1 or the video cannot be
            opened.
        OSError: If a sampled frame cannot be written to disk.
    """
    if num_frames < 1:
        raise ValueError(f"num_frames must be at least 1, got {num_frames}")

    output_dir = "/tmp/processed_frames"

    # NOTE: the directory is fixed and wiped on every call, so the paths
    # returned by an earlier call stop being valid once this runs again.
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)

    frames = []
    frame_paths = []

    video = cv2.VideoCapture(video_path)
    try:
        if not video.isOpened():
            raise ValueError(f"Could not open video file: {video_path}")

        total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))

        # Pick target indices that span the whole clip. A fixed stride of
        # ``total_frames // num_frames`` capped at ``num_frames`` would only
        # cover the beginning of the video whenever the division truncates
        # (e.g. 39 frames / 20 requested -> stride 1 -> frames 0..19 only).
        wanted = min(num_frames, total_frames)
        targets = {k * total_frames // wanted for k in range(wanted)}
        last_target = max(targets, default=-1)

        # Frames are decoded sequentially; stop right after the last target
        # instead of decoding the remainder of the video for nothing.
        for index in range(last_target + 1):
            ret, frame = video.read()
            if not ret:
                # Best effort: skip frames that fail to decode.
                continue
            if index not in targets:
                continue

            path = f"{output_dir}/frame_{len(frames):03d}.jpg"
            if not cv2.imwrite(path, frame):
                raise OSError(f"Could not write frame to {path}")

            # OpenCV decodes to BGR; PIL expects RGB channel order.
            frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
            frame_paths.append(path)
    finally:
        # Release the capture even when opening, decoding or writing fails.
        video.release()

    return frames, frame_paths
61
+
62
def analyze_video(video_path):
    """Ask the video-language model two leak-test questions about a clip.

    The clip is reduced to at most 20 sampled frames, which are sent to the
    model together with a fixed prompt asking whether the pipe was dipped in
    water and whether bubbles appeared after immersion.

    Args:
        video_path: Path of the uploaded video file, as supplied by the
            Gradio video input.

    Returns:
        A ``(frame_paths, result)`` tuple: the paths of the sampled frame
        images (shown in the gallery) and the text generated by the model.

    Raises:
        gr.Error: If no video was provided or no frame could be read from it.
    """
    if not video_path:
        raise gr.Error("Please provide a video to analyze.")

    question = (
        "Analyze this gas pipe quality control video. "
        "Answer these two questions with True/False: "
        "1) DIPPED IN WATER: Was the pipe dipped in water for testing? "
        "Look for pipe being submerged in water container. "
        "2) BUBBLES AFTER IMMERSION: After the pipe was fully immersed "
        "(ignore initial displacement bubbles), were there any bubbles "
        "indicating a leak? "
        "Format: DIPPED IN WATER: True/False, "
        "BUBBLES AFTER IMMERSION: True/False"
    )
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                {"type": "video"},
            ],
        },
    ]

    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

    video_frames, frame_paths = sample_frames(video_path, 20)
    if not video_frames:
        raise gr.Error("No frames could be read from the video.")

    # return_tensors="pt" is required: without it the processor returns
    # plain Python/NumPy containers, which have no ``.to()`` method.
    inputs = processor(
        text=prompt,
        videos=video_frames,
        padding=True,
        return_tensors="pt",
    )
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    output = model.generate(
        **inputs,
        max_new_tokens=100,
        do_sample=True,
        temperature=0.3,
        top_p=0.9,
        top_k=50,
        repetition_penalty=1.1,
        pad_token_id=processor.tokenizer.eos_token_id,
    )

    # The generated sequence starts with the prompt tokens; drop them so the
    # textbox shows only the model's answer rather than the echoed question.
    prompt_length = inputs["input_ids"].shape[1]
    result = processor.decode(output[0][prompt_length:], skip_special_tokens=True)

    return frame_paths, result
94
+
95
# Example clips offered in the UI; each row holds the value for the single
# video input.
examples = [[clip] for clip in ("07.mp4", "09.mp4", "29.mp4")]

# Components are created in the same order as they are wired up: the video
# input first, then the two outputs returned by ``analyze_video``.
_video_input = gr.Video()
_result_outputs = [
    gr.Gallery(label="Processed Frames"),
    gr.Textbox(label="LLM Analysis", lines=10),
]

iface = gr.Interface(
    fn=analyze_video,
    inputs=_video_input,
    outputs=_result_outputs,
    title="Gas Pipe Quality Control Analyzer",
    examples=examples,
    cache_examples=False,
)

# Start the web app as soon as the module is executed.
iface.launch(share=True)