Explainable-Vision-Language-Model

Running on Zero

App Files Community

khang119966 committed on Apr 13

Commit

888d672

verified ·

1 Parent(s): 6968b05

Update app.py

Browse files

Files changed (1) hide show

app.py +142 -240

app.py CHANGED Viewed

@@ -35,13 +35,140 @@ from concurrent.futures import ProcessPoolExecutor
 subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
-env = {'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}
-subprocess.run('apt-get install -y fonts-noto-cjk', env=env, shell=True)
-subprocess.run('apt-get update -y', env=env, shell=True)
-subprocess.run('apt-get install -y wkhtmltopdf', env=env, shell=True)
-subprocess.run('apt-get install -y xvfb', env=env, shell=True)
 torch.set_default_device('cuda')
@@ -181,113 +308,6 @@ def visualize_attention_hiddenstate(attention_tensor, head=None, start_img_token
     return heat_maps, top_5_tokens
-def generate_next_token_table_image(model, tokenizer, response, index_focus):
-    next_token_table = []
-    for layer_index in range(len(response.hidden_states[index_focus])):
-        h_out = model.language_model.lm_head(
-            model.language_model.model.norm(response.hidden_states[index_focus][layer_index][0])
-        )
-        h_out = torch.softmax(h_out, -1)
-        top_tokens = []
-        for token_index in h_out.argsort(descending=True)[0, :3]:  # Top 3
-            token_str = tokenizer.decode(token_index)
-            prob = float(h_out[0, int(token_index)])
-            top_tokens.append((token_str, prob))
-        next_token_table.append((layer_index, top_tokens))
-    next_token_table = next_token_table[::-1]
-    html_rows = ""
-    last_layer_index = len(next_token_table) - 1
-    for i, (layer_index, tokens) in enumerate(next_token_table):
-        row = f"<tr><td style='font-weight: bold'>Layer {layer_index}</td>"
-        # For the first column (Top 1)
-        token_str, prob = tokens[0]
-        # If this is the last layer in the table, make the text blue
-        if layer_index == last_layer_index:
-            row += f"<td><span style='color: red; font-weight: bold'>{token_str}</span> ({prob:.2%})</td>"
-        else:
-            row += f"<td><span style='color: blue; font-weight: bold'>{token_str}</span> ({prob:.2%})</td>"
-        # For the other columns, keep normal formatting
-        for token_str, prob in tokens[1:]:
-            row += f"<td>{token_str} ({prob:.2%})</td>"
-        row += "</tr>"
-        html_rows += row
-    html_code = f'''
-    <html>
-      <head>
-        <meta charset="utf-8">
-        <style>
-          table {{
-            font-family: 'Noto Sans';
-            font-size: 12px;
-            border-collapse: collapse;
-            table-layout: fixed;
-            width: 100%;
-          }}
-          th, td {{
-            border: 1px solid black;
-            padding: 8px;
-            width: 150px;
-            height: 30px;
-            overflow: hidden;
-            text-overflow: ellipsis;
-            white-space: nowrap;
-            text-align: center;
-          }}
-          th.layer {{
-            width: 100px;
-          }}
-          th.title {{
-            font-size: 14px;
-            padding: 10px;
-            height: auto;
-            white-space: normal;
-            overflow: visible;
-          }}
-        </style>
-      </head>
-      <body style="background-color: white;">
-        <table>
-          <tr>
-            <th colspan="4" class="title">
-              Top hidden tokens per layer for the Prediction
-            </th>
-          </tr>
-          <tr>
-            <th class="layer">Layer ⬆️</th>
-            <th>Top 1</th>
-            <th>Top 2</th>
-            <th>Top 3</th>
-          </tr>
-          {html_rows}
-        </table>
-      </body>
-    </html>
-    '''
-    with tempfile.TemporaryDirectory() as tmpdir:
-        hti = Html2Image(output_path=tmpdir)
-        hti.browser_flags = [
-    "--headless=new",      # ← Dùng chế độ headless mới
-    "--disable-gpu",       # ← Tắt GPU
-    "--disable-software-rasterizer",  # ← Tránh dùng fallback GPU software
-    "--no-sandbox",        # ← Tránh lỗi sandbox đa luồng
-]
-        filename = str(uuid.uuid4())+".png"
-        # filename = 'next_token_table.png'
-        hti.screenshot(html_str=html_code, save_as=filename, size=(500, 1000))
-        img_path = os.path.join(tmpdir, filename)
-        img_cv2 = cv2.imread(img_path)[:,:,::-1]
-        os.remove(img_path)
-    return img_cv2
 def adjust_overlay(overlay, text_img):
     h_o, w_o = overlay.shape[:2]
     h_t, w_t = text_img.shape[:2]
@@ -313,36 +333,6 @@ def adjust_overlay(overlay, text_img):
     return overlay_resized
-def generate_text_image_with_html2image(old_text, input_token, new_token, image_width=400, min_height=1000, font_size=16):
-    full_text = old_text + f"<span style='color:blue; font-weight:bold'>[{input_token}]</span>"+ "→" + f"<span style='color:red; font-weight:bold'>[{new_token}]</span>"
-    # Thay \n bằng thẻ HTML <br> để xuống dòng
-    full_text = full_text.replace('\n', '<br>')
-    html_code = f'''
-    <html>
-    <head>
-        <meta charset="utf-8">
-    </head>
-    <body style="font-family: 'DejaVu Sans', sans-serif; font-size: {font_size}px; width: {image_width}px; min-height: {min_height}px; padding: 10px; background-color: white; line-height: 1.4;">
-        {full_text}
-    </body>
-    </html>
-    '''
-    save_path = str(uuid.uuid4())+".png"
-    hti = Html2Image(output_path='.')
-    hti.browser_flags = [
-    "--headless=new",      # ← Dùng chế độ headless mới
-    "--disable-gpu",       # ← Tắt GPU
-    "--disable-software-rasterizer",  # ← Tránh dùng fallback GPU software
-    "--no-sandbox",        # ← Tránh lỗi sandbox đa luồng
-]
-    hti.screenshot(html_str=html_code, save_as=save_path, size=(image_width, min_height))
-    text_img = cv2.imread(save_path)
-    text_img = cv2.cvtColor(text_img, cv2.COLOR_BGR2RGB)
-    os.remove(save_path)
-    return text_img
 def extract_next_token_table_data(model, tokenizer, response, index_focus):
     next_token_table = []
     for layer_index in range(len(response.hidden_states[index_focus])):
@@ -359,98 +349,6 @@ def extract_next_token_table_data(model, tokenizer, response, index_focus):
     next_token_table = next_token_table[::-1]
     return next_token_table
-def render_next_token_table_image(table_data, predict_token):
-    import tempfile, uuid, os
-    from html2image import Html2Image
-    import cv2
-    html_rows = ""
-    last_layer_index = len(table_data)
-    for layer_index, tokens in table_data:
-        row = f"<tr><td style='font-weight: bold'>Layer {layer_index+1}</td>"
-        token_str, prob = tokens[0]
-        if token_str == predict_token:
-            style = "color: red; font-weight: bold"
-        else:
-            style = "color: blue; font-weight: bold"
-        row += f"<td><span style='{style}'>{token_str}</span> ({prob:.2%})</td>"
-        for token_str, prob in tokens[1:]:
-            row += f"<td>{token_str} ({prob:.2%})</td>"
-        row += "</tr>"
-        html_rows += row
-    html_code = f'''
-    <html>
-      <head>
-        <meta charset="utf-8">
-        <style>
-          table {{
-            font-family: 'Noto Sans';
-            font-size: 12px;
-            border-collapse: collapse;
-            table-layout: fixed;
-            width: 100%;
-          }}
-          th, td {{
-            border: 1px solid black;
-            padding: 8px;
-            width: 150px;
-            height: 30px;
-            overflow: hidden;
-            text-overflow: ellipsis;
-            white-space: nowrap;
-            text-align: center;
-          }}
-          th.layer {{
-            width: 100px;
-          }}
-          th.title {{
-            font-size: 14px;
-            padding: 10px;
-            height: auto;
-            white-space: normal;
-            overflow: visible;
-          }}
-        </style>
-      </head>
-      <body style="background-color: white;">
-        <table>
-          <tr>
-            <th colspan="4" class="title">
-              Hidden states per Transformer layer (LLM) for Prediction
-            </th>
-          </tr>
-          <tr>
-            <th class="layer">Layer ⬆️</th>
-            <th>Top 1</th>
-            <th>Top 2</th>
-            <th>Top 3</th>
-          </tr>
-          {html_rows}
-        </table>
-      </body>
-    </html>
-    '''
-    with tempfile.TemporaryDirectory() as tmpdir:
-        hti = Html2Image(output_path=tmpdir)
-        hti.browser_flags = [
-            "--headless=new",
-            "--disable-gpu",
-            "--disable-software-rasterizer",
-            "--no-sandbox",
-        ]
-        filename = str(uuid.uuid4()) + ".png"
-        hti.screenshot(html_str=html_code, save_as=filename, size=(500, 1000))
-        img_path = os.path.join(tmpdir, filename)
-        img_cv2 = cv2.imread(img_path)[:, :, ::-1]
-        os.remove(img_path)
-    return img_cv2
 model = AutoModel.from_pretrained(
     "khang119966/Vintern-1B-v3_5-explainableAI",
     torch_dtype=torch.bfloat16,
@@ -460,9 +358,8 @@ model = AutoModel.from_pretrained(
 ).eval().cuda()
 tokenizer = AutoTokenizer.from_pretrained("khang119966/Vintern-1B-v3_5-explainableAI", trust_remote_code=True, use_fast=False)
-# Hàm bao để truyền vào multiprocessing
 def generate_text_img_wrapper(args):
-    return generate_text_image_with_html2image(*args, image_width=500, min_height=1000)
 def generate_hidden_img_wrapper(args):
     return render_next_token_table_image(*args)
@@ -568,16 +465,21 @@ def generate_video(image, prompt, max_tokens):
     for frame in visualization_frames:
         frame = cv2.resize(frame,(visualization_frames[0].shape[1],visualization_frames[0].shape[0]))
         resized_visualization_frames.append(frame)
     # Lưu thành video MP4 bằng imageio
     imageio.mimsave(
-        'heatmap_animation.mp4',
         resized_visualization_frames,  # dạng RGB
         fps=5
     )
-    return "heatmap_animation.mp4"
 with gr.Blocks() as demo:
     gr.Markdown("""# 🎥 Visualizing How Multimodal Models Think

 subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
+from PIL import Image, ImageDraw, ImageFont
+import textwrap
+import uuid
+import os
+def generate_text_image_with_pil(old_text, input_token, new_token, image_width=400, min_height=1000, font_size=16):
+    import textwrap
+    import numpy as np
+    from PIL import Image, ImageDraw, ImageFont
+    # Split text by newlines first to preserve manual line breaks
+    paragraphs = old_text.split('\n')
+    # Add the token information to the last paragraph
+    if paragraphs:
+        paragraphs[-1] += f"[{input_token}]→[{new_token}]"
+    else:
+        paragraphs = [f"[{input_token}]→[{new_token}]"]
+    # Create a list to store all wrapped lines
+    all_lines = []
+    # Process each paragraph separately
+    for paragraph in paragraphs:
+        # Only wrap if paragraph is not empty
+        if paragraph.strip():
+            wrapped_lines = textwrap.wrap(paragraph, width=60)
+            all_lines.extend(wrapped_lines)
+        else:
+            # Add an empty line for empty paragraphs (newlines)
+            all_lines.append("")
+    # Create image
+    img = Image.new('RGB', (image_width, min_height), color='white')
+    draw = ImageDraw.Draw(img)
+    # Load font
+    font_path = "/usr/share/fonts/truetype/noto/NotoSansCJK-Regular.ttc"
+    font = ImageFont.truetype(font_path, font_size)
+    # Draw text
+    y = 10
+    token_marker = f"[{input_token}]→[{new_token}]"
+    for line in all_lines:
+        if token_marker in line:
+            parts = line.split(token_marker)
+            # Draw text before token
+            draw.text((10, y), parts[0], fill="black", font=font)
+            x = 10 + draw.textlength(parts[0], font=font)
+            # Draw input token in blue
+            draw.text((x, y), f"[{input_token}]", fill="blue", font=font)
+            x += draw.textlength(f"[{input_token}]", font=font)
+            # Draw arrow
+            draw.text((x, y), "→", fill="black", font=font)
+            x += draw.textlength("→", font=font)
+            # Draw new token in red
+            draw.text((x, y), f"[{new_token}]", fill="red", font=font)
+            # Draw remainder text if any
+            if len(parts) > 1 and parts[1]:
+                x += draw.textlength(f"[{new_token}]", font=font)
+                draw.text((x, y), parts[1], fill="black", font=font)
+        else:
+            draw.text((10, y), line, fill="black", font=font)
+        # Move to next line, adding extra space between paragraphs
+        y += font_size + 8
+    return np.array(img)
+from PIL import Image, ImageDraw, ImageFont
+def render_next_token_table_image(table_data, predict_token, image_width=500, row_height=40, font_size=14):
+    # Cài đặt font hỗ trợ đa ngôn ngữ (sửa đường dẫn nếu cần)
+    # font_path = "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"
+    font_path = "/usr/share/fonts/truetype/noto/NotoSansCJK-Regular.ttc"
+    font = ImageFont.truetype(font_path, font_size)
+    num_rows = len(table_data) + 2  # +2 cho phần tiêu đề
+    num_cols = 4  # Layer | Top1 | Top2 | Top3
+    table_width = image_width
+    col_width = table_width // num_cols
+    table_height = num_rows * row_height
+    # Tạo ảnh trắng
+    img = Image.new("RGB", (table_width, table_height), "white")
+    draw = ImageDraw.Draw(img)
+    def draw_cell(x, y, text, color="black", bold=False):
+        if bold:
+            draw.text((x + 5, y + 5), text, font=font, fill=color)
+        else:
+            draw.text((x + 5, y + 5), text, font=font, fill=color)
+    # Vẽ hàng tiêu đề chính
+    draw.rectangle([0, 0, table_width, row_height], outline="black")
+    draw_cell(5, 5, "Hidden states per Transformer layer (LLM) for Prediction", bold=True)
+    # Vẽ tiêu đề cột
+    headers = ["Layer ⬆️", "Top 1", "Top 2", "Top 3"]
+    for col, header in enumerate(headers):
+        x0 = col * col_width
+        y0 = row_height
+        draw.rectangle([x0, y0, x0 + col_width, y0 + row_height], outline="black")
+        draw_cell(x0, y0, header, bold=True)
+    # Vẽ từng hàng layer
+    for i, (layer_index, tokens) in enumerate(table_data):
+        y = (i + 2) * row_height
+        for col in range(num_cols):
+            x = col * col_width
+            draw.rectangle([x, y, x + col_width, y + row_height], outline="black")
+            if col == 0:
+                draw_cell(x, y, f"Layer {layer_index+1}", bold=True)
+            else:
+                if col - 1 < len(tokens):
+                    token_str, prob = tokens[col - 1]
+                    # Thay \n bằng chuỗi "\\n"
+                    token_str = token_str
+                    color = "red" if token_str == predict_token and col == 1 else "blue" if col == 1 else "black"
+                    bold = token_str == predict_token and col == 1
+                    token_str_ = token_str.replace("\n", "\\n").replace(" ", "\\s").replace("\t", "\\t")
+                    draw_cell(x, y, f"{token_str_} ({prob:.1%})", color=color, bold=bold)
+    return np.array(img)
 torch.set_default_device('cuda')
     return heat_maps, top_5_tokens
 def adjust_overlay(overlay, text_img):
     h_o, w_o = overlay.shape[:2]
     h_t, w_t = text_img.shape[:2]
     return overlay_resized
 def extract_next_token_table_data(model, tokenizer, response, index_focus):
     next_token_table = []
     for layer_index in range(len(response.hidden_states[index_focus])):
     next_token_table = next_token_table[::-1]
     return next_token_table
 model = AutoModel.from_pretrained(
     "khang119966/Vintern-1B-v3_5-explainableAI",
     torch_dtype=torch.bfloat16,
 ).eval().cuda()
 tokenizer = AutoTokenizer.from_pretrained("khang119966/Vintern-1B-v3_5-explainableAI", trust_remote_code=True, use_fast=False)
 def generate_text_img_wrapper(args):
+    return generate_text_image_with_pil(*args, image_width=500, min_height=1000)
 def generate_hidden_img_wrapper(args):
     return render_next_token_table_image(*args)
     for frame in visualization_frames:
         frame = cv2.resize(frame,(visualization_frames[0].shape[1],visualization_frames[0].shape[0]))
         resized_visualization_frames.append(frame)
     # Lưu thành video MP4 bằng imageio
     imageio.mimsave(
+        'heatmap_with_music.mp4',
         resized_visualization_frames,  # dạng RGB
         fps=5
     )
+    # Nối video và nhạc
+    video = VideoFileClip("heatmap_animation.mp4")
+    audio = AudioFileClip("legacy-of-the-century-background-cinematic-music-for-video-46-second-319542.mp3").set_duration(video.duration)
+    final = video.set_audio(audio)
+    final.write_videofile("heatmap_with_music.mp4", codec="libx264", audio_codec="aac")
+    return "heatmap_with_music.mp4"
 with gr.Blocks() as demo:
     gr.Markdown("""# 🎥 Visualizing How Multimodal Models Think