AustingDong
committed on
Commit
·
63b5fc2
1
Parent(s):
9cae7ef
align
Browse files
- app.py +3 -3
- demo/visualization.py +27 -57
app.py
CHANGED
|
@@ -258,7 +258,7 @@ with gr.Blocks() as demo:
|
|
| 258 |
activation_map_output = gr.Gallery(label="Visualization", height=500, columns=1, preview=True)
|
| 259 |
|
| 260 |
with gr.Row():
|
| 261 |
-
|
| 262 |
understanding_output = gr.Textbox(label="Answer")
|
| 263 |
|
| 264 |
with gr.Row():
|
|
@@ -266,7 +266,7 @@ with gr.Blocks() as demo:
|
|
| 266 |
with gr.Column():
|
| 267 |
model_selector = gr.Dropdown(choices=["ChartGemma-3B", "Janus-Pro-1B", "Janus-Pro-7B", "LLaVA-1.5-7B"], value="ChartGemma-3B", label="model")
|
| 268 |
test_selector = gr.Dropdown(choices=["mini-VLAT", "VLAT", "VLAT-old"], value="mini-VLAT", label="test")
|
| 269 |
-
|
| 270 |
und_seed_input = gr.Number(label="Seed", precision=0, value=42)
|
| 271 |
top_p = gr.Slider(minimum=0, maximum=1, value=0.95, step=0.05, label="top_p")
|
| 272 |
temperature = gr.Slider(minimum=0, maximum=1, value=0.1, step=0.05, label="temperature")
|
|
@@ -275,7 +275,7 @@ with gr.Blocks() as demo:
|
|
| 275 |
|
| 276 |
with gr.Column():
|
| 277 |
response_type = gr.Dropdown(choices=["Visualization only", "answer + visualization"], value="answer + visualization", label="response_type")
|
| 278 |
-
focus = gr.Dropdown(choices=["question", "question + answer"], value="question
|
| 279 |
activation_map_method = gr.Dropdown(choices=["AG-CAM"], value="AG-CAM", label="visualization type")
|
| 280 |
accumulate_method = gr.Dropdown(choices=["sum", "mult"], value="sum", label="layers accumulate method")
|
| 281 |
visual_method = gr.Dropdown(choices=["softmax", "sigmoid"], value="softmax", label="activation function")
|
|
|
|
| 258 |
activation_map_output = gr.Gallery(label="Visualization", height=500, columns=1, preview=True)
|
| 259 |
|
| 260 |
with gr.Row():
|
| 261 |
+
question_input = gr.Textbox(label="Question")
|
| 262 |
understanding_output = gr.Textbox(label="Answer")
|
| 263 |
|
| 264 |
with gr.Row():
|
|
|
|
| 266 |
with gr.Column():
|
| 267 |
model_selector = gr.Dropdown(choices=["ChartGemma-3B", "Janus-Pro-1B", "Janus-Pro-7B", "LLaVA-1.5-7B"], value="ChartGemma-3B", label="model")
|
| 268 |
test_selector = gr.Dropdown(choices=["mini-VLAT", "VLAT", "VLAT-old"], value="mini-VLAT", label="test")
|
| 269 |
+
chart_type = gr.Textbox(label="Chart Type", value="Any")
|
| 270 |
und_seed_input = gr.Number(label="Seed", precision=0, value=42)
|
| 271 |
top_p = gr.Slider(minimum=0, maximum=1, value=0.95, step=0.05, label="top_p")
|
| 272 |
temperature = gr.Slider(minimum=0, maximum=1, value=0.1, step=0.05, label="temperature")
|
|
|
|
| 275 |
|
| 276 |
with gr.Column():
|
| 277 |
response_type = gr.Dropdown(choices=["Visualization only", "answer + visualization"], value="answer + visualization", label="response_type")
|
| 278 |
+
focus = gr.Dropdown(choices=["question", "question + answer"], value="question", label="focus")
|
| 279 |
activation_map_method = gr.Dropdown(choices=["AG-CAM"], value="AG-CAM", label="visualization type")
|
| 280 |
accumulate_method = gr.Dropdown(choices=["sum", "mult"], value="sum", label="layers accumulate method")
|
| 281 |
visual_method = gr.Dropdown(choices=["softmax", "sigmoid"], value="softmax", label="activation function")
|
demo/visualization.py
CHANGED
|
@@ -296,7 +296,7 @@ class VisualizationJanus(Visualization):
|
|
| 296 |
self._modify_layers()
|
| 297 |
self._register_hooks_activations()
|
| 298 |
|
| 299 |
-
def forward_backward(self, input_tensor, tokenizer, temperature, top_p, target_token_idx=None, visual_method="softmax", focus="
|
| 300 |
# Forward
|
| 301 |
image_embeddings, inputs_embeddings, outputs = self.model(input_tensor, tokenizer, temperature, top_p)
|
| 302 |
print(input_tensor.keys())
|
|
@@ -304,24 +304,18 @@ class VisualizationJanus(Visualization):
|
|
| 304 |
start_idx = 620
|
| 305 |
self.model.zero_grad()
|
| 306 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 307 |
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
loss = outputs.logits.max(dim=-1).values[0, start_idx + target_token_idx]
|
| 311 |
-
loss.backward()
|
| 312 |
-
|
| 313 |
-
elif focus == "Language Model":
|
| 314 |
-
if target_token_idx == -1:
|
| 315 |
-
loss = outputs.logits.max(dim=-1).values.sum()
|
| 316 |
-
else:
|
| 317 |
-
loss = outputs.logits.max(dim=-1).values[0, start_idx + target_token_idx]
|
| 318 |
-
loss.backward()
|
| 319 |
-
|
| 320 |
-
self.activations = self.activations = [layer.attn_sigmoid_weights for layer in self.target_layers] if visual_method == "sigmoid" else [layer.get_attn_map() for layer in self.target_layers]
|
| 321 |
-
self.gradients = [layer.get_attn_gradients() for layer in self.target_layers]
|
| 322 |
|
| 323 |
@spaces.GPU(duration=120)
|
| 324 |
-
def generate_cam(self, input_tensor, tokenizer, temperature, top_p, target_token_idx=None, visual_method="softmax", focus="
|
| 325 |
|
| 326 |
self.setup_grads()
|
| 327 |
|
|
@@ -329,25 +323,14 @@ class VisualizationJanus(Visualization):
|
|
| 329 |
self.forward_backward(input_tensor, tokenizer, temperature, top_p, target_token_idx, visual_method, focus)
|
| 330 |
|
| 331 |
start_idx = 620
|
| 332 |
-
if focus == "Visual Encoder":
|
| 333 |
-
|
| 334 |
-
cam_sum = self.grad_cam_vis()
|
| 335 |
-
cam_sum, grid_size = self.process(cam_sum)
|
| 336 |
-
return cam_sum, grid_size, start_idx
|
| 337 |
-
|
| 338 |
-
elif focus == "Language Model":
|
| 339 |
-
|
| 340 |
-
# cam_sum = self.grad_cam_llm(mean_inside=True)
|
| 341 |
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
# cam_sum_lst, grid_size = self.process_multiple(cam_sum, start_idx, images_seq_mask)
|
| 345 |
|
| 346 |
-
|
| 347 |
-
|
| 348 |
|
| 349 |
|
| 350 |
-
|
| 351 |
|
| 352 |
|
| 353 |
|
|
@@ -371,13 +354,14 @@ class VisualizationLLaVA(Visualization):
|
|
| 371 |
self.model.zero_grad()
|
| 372 |
print("outputs_raw", outputs_raw)
|
| 373 |
|
| 374 |
-
|
|
|
|
| 375 |
loss.backward()
|
| 376 |
self.activations = [layer.get_attn_map() for layer in self.target_layers]
|
| 377 |
self.gradients = [layer.get_attn_gradients() for layer in self.target_layers]
|
| 378 |
|
| 379 |
@spaces.GPU(duration=120)
|
| 380 |
-
def generate_cam(self, inputs, tokenizer, temperature, top_p, target_token_idx=None, visual_method="softmax", focus="
|
| 381 |
|
| 382 |
self.setup_grads()
|
| 383 |
self.forward_backward(inputs)
|
|
@@ -416,29 +400,23 @@ class VisualizationChartGemma(Visualization):
|
|
| 416 |
|
| 417 |
def forward_backward(self, inputs, focus, start_idx, target_token_idx, visual_method="softmax"):
|
| 418 |
outputs_raw = self.model(**inputs, output_hidden_states=True)
|
| 419 |
-
if focus == "
|
| 420 |
-
|
| 421 |
-
self.model.zero_grad()
|
| 422 |
-
|
| 423 |
-
loss = outputs_raw.logits.max(dim=-1).values[0, start_idx + target_token_idx]
|
| 424 |
-
loss.backward()
|
| 425 |
-
|
| 426 |
-
elif focus == "Language Model":
|
| 427 |
self.model.zero_grad()
|
| 428 |
print("logits shape:", outputs_raw.logits.shape)
|
| 429 |
print("start_idx:", start_idx)
|
| 430 |
-
if target_token_idx == -1:
|
| 431 |
-
logits_prob = F.softmax(outputs_raw.logits, dim=-1)
|
| 432 |
-
loss = logits_prob.max(dim=-1).values.sum()
|
| 433 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 434 |
else:
|
| 435 |
-
loss =
|
| 436 |
loss.backward()
|
| 437 |
self.activations = [layer.attn_sigmoid_weights for layer in self.target_layers] if visual_method == "sigmoid" else [layer.get_attn_map() for layer in self.target_layers]
|
| 438 |
self.gradients = [layer.get_attn_gradients() for layer in self.target_layers]
|
| 439 |
|
| 440 |
@spaces.GPU(duration=120)
|
| 441 |
-
def generate_cam(self, inputs, tokenizer, temperature, top_p, target_token_idx=None, visual_method="softmax", focus="
|
| 442 |
|
| 443 |
# Forward pass
|
| 444 |
self.setup_grads()
|
|
@@ -457,19 +435,11 @@ class VisualizationChartGemma(Visualization):
|
|
| 457 |
|
| 458 |
|
| 459 |
self.forward_backward(inputs, focus, start_idx, target_token_idx, visual_method)
|
| 460 |
-
if focus == "Visual Encoder":
|
| 461 |
-
|
| 462 |
-
cam_sum = self.grad_cam_vis()
|
| 463 |
-
cam_sum, grid_size = self.process(cam_sum, remove_cls=False)
|
| 464 |
-
|
| 465 |
-
return cam_sum, grid_size, start_idx
|
| 466 |
|
| 467 |
-
|
| 468 |
-
|
| 469 |
-
cams = self.attn_guided_cam()
|
| 470 |
-
cam_sum_lst, grid_size = self.process_multiple_acc(cams, start_idx, images_seq_mask, accumulate_method=accumulate_method)
|
| 471 |
|
| 472 |
-
|
| 473 |
|
| 474 |
|
| 475 |
|
|
|
|
| 296 |
self._modify_layers()
|
| 297 |
self._register_hooks_activations()
|
| 298 |
|
| 299 |
+
def forward_backward(self, input_tensor, tokenizer, temperature, top_p, target_token_idx=None, visual_method="softmax", focus="Language Model"):
|
| 300 |
# Forward
|
| 301 |
image_embeddings, inputs_embeddings, outputs = self.model(input_tensor, tokenizer, temperature, top_p)
|
| 302 |
print(input_tensor.keys())
|
|
|
|
| 304 |
start_idx = 620
|
| 305 |
self.model.zero_grad()
|
| 306 |
|
| 307 |
+
logits = outputs.logits
|
| 308 |
+
if target_token_idx == -1:
|
| 309 |
+
loss = logits.max(dim=-1).values.sum()
|
| 310 |
+
else:
|
| 311 |
+
loss = logits.max(dim=-1).values[0, start_idx + target_token_idx]
|
| 312 |
+
loss.backward()
|
| 313 |
|
| 314 |
+
self.activations = self.activations = [layer.attn_sigmoid_weights for layer in self.target_layers] if visual_method == "sigmoid" else [layer.get_attn_map() for layer in self.target_layers]
|
| 315 |
+
self.gradients = [layer.get_attn_gradients() for layer in self.target_layers]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 316 |
|
| 317 |
@spaces.GPU(duration=120)
|
| 318 |
+
def generate_cam(self, input_tensor, tokenizer, temperature, top_p, target_token_idx=None, visual_method="softmax", focus="Language Model", accumulate_method="sum"):
|
| 319 |
|
| 320 |
self.setup_grads()
|
| 321 |
|
|
|
|
| 323 |
self.forward_backward(input_tensor, tokenizer, temperature, top_p, target_token_idx, visual_method, focus)
|
| 324 |
|
| 325 |
start_idx = 620
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 326 |
|
| 327 |
+
images_seq_mask = input_tensor.images_seq_mask[0].detach().cpu().tolist()
|
|
|
|
|
|
|
| 328 |
|
| 329 |
+
cams = self.attn_guided_cam()
|
| 330 |
+
cam_sum_lst, grid_size = self.process_multiple_acc(cams, start_idx, images_seq_mask, accumulate_method=accumulate_method)
|
| 331 |
|
| 332 |
|
| 333 |
+
return cam_sum_lst, grid_size, start_idx
|
| 334 |
|
| 335 |
|
| 336 |
|
|
|
|
| 354 |
self.model.zero_grad()
|
| 355 |
print("outputs_raw", outputs_raw)
|
| 356 |
|
| 357 |
+
logits = outputs_raw.logits
|
| 358 |
+
loss = logits.max(dim=-1).values.sum()
|
| 359 |
loss.backward()
|
| 360 |
self.activations = [layer.get_attn_map() for layer in self.target_layers]
|
| 361 |
self.gradients = [layer.get_attn_gradients() for layer in self.target_layers]
|
| 362 |
|
| 363 |
@spaces.GPU(duration=120)
|
| 364 |
+
def generate_cam(self, inputs, tokenizer, temperature, top_p, target_token_idx=None, visual_method="softmax", focus="Language Model", accumulate_method="sum"):
|
| 365 |
|
| 366 |
self.setup_grads()
|
| 367 |
self.forward_backward(inputs)
|
|
|
|
| 400 |
|
| 401 |
def forward_backward(self, inputs, focus, start_idx, target_token_idx, visual_method="softmax"):
|
| 402 |
outputs_raw = self.model(**inputs, output_hidden_states=True)
|
| 403 |
+
if focus == "Language Model":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 404 |
self.model.zero_grad()
|
| 405 |
print("logits shape:", outputs_raw.logits.shape)
|
| 406 |
print("start_idx:", start_idx)
|
|
|
|
|
|
|
|
|
|
| 407 |
|
| 408 |
+
logits = outputs_raw.logits
|
| 409 |
+
|
| 410 |
+
if target_token_idx == -1:
|
| 411 |
+
loss = logits.max(dim=-1).values.sum()
|
| 412 |
else:
|
| 413 |
+
loss = logits.max(dim=-1).values[0, start_idx + target_token_idx]
|
| 414 |
loss.backward()
|
| 415 |
self.activations = [layer.attn_sigmoid_weights for layer in self.target_layers] if visual_method == "sigmoid" else [layer.get_attn_map() for layer in self.target_layers]
|
| 416 |
self.gradients = [layer.get_attn_gradients() for layer in self.target_layers]
|
| 417 |
|
| 418 |
@spaces.GPU(duration=120)
|
| 419 |
+
def generate_cam(self, inputs, tokenizer, temperature, top_p, target_token_idx=None, visual_method="softmax", focus="Language Model", accumulate_method="sum"):
|
| 420 |
|
| 421 |
# Forward pass
|
| 422 |
self.setup_grads()
|
|
|
|
| 435 |
|
| 436 |
|
| 437 |
self.forward_backward(inputs, focus, start_idx, target_token_idx, visual_method)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 438 |
|
| 439 |
+
cams = self.attn_guided_cam()
|
| 440 |
+
cam_sum_lst, grid_size = self.process_multiple_acc(cams, start_idx, images_seq_mask, accumulate_method=accumulate_method)
|
|
|
|
|
|
|
| 441 |
|
| 442 |
+
# cams shape: [layers, 1, seq_len, seq_len]
|
| 443 |
|
| 444 |
|
| 445 |
|