Spaces: dung-vpt-uney · Running on Zero

Commit f3839cb (parent: 21b5285), committed by dung-vpt-uney

Update Visual-CoT demo - 2025-10-12 23:59:41
Fixes:
- Fix LLaVA config registration error (compatibility with newer transformers)
- Update Gradio to latest version (security fixes)
- Auto-deployed via update script
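The LLaVA config registration fix named in the first bullet is not part of the hunks shown below. As a rough illustration only, a guard of the following shape is a common way to avoid duplicate-registration errors when a bundled LLaVA config collides with one that newer transformers releases already register; the import path and the "llava_llama" key are assumptions, not taken from this commit.

# Hypothetical sketch of a duplicate-registration guard (not the actual commit change).
from transformers import AutoConfig

from llava.model.language_model.llava_llama import LlavaConfig  # assumed import path

try:
    AutoConfig.register("llava_llama", LlavaConfig)
except ValueError:
    # Newer transformers may already know this model type; re-registering raises ValueError.
    pass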
app.py CHANGED
@@ -71,59 +71,53 @@ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 # Benchmark datasets from Visual Chain-of-Thought Reasoning Benchmarks Collection
 # https://huggingface.co/collections/tuandunghcmut/visual-chain-of-thought-reasoning-benchmarks
 BENCHMARK_DATASETS = {
-    "Visual-CoT": {
-        "path": "deepcs233/Visual-CoT",
-        "config": None,
-        "split": "train",
-        "description": "Main Visual-CoT dataset with 438K question-answer pairs",
-    },
     "GQA": {
         "path": "lmms-lab/GQA",
         "config": "train_balanced_images",
         "split": "train",
-        "description": "Scene graph…
+        "description": "Scene graph QA (72K balanced images)",
     },
     "RefCOCO": {
         "path": "lmms-lab/RefCOCO",
-        "config": …
-        "split": "…
-        "description": "Referring expression comprehension (…
+        "config": "default",
+        "split": "val",
+        "description": "Referring expression comprehension (8.8K validation)",
     },
     "RefCOCO+": {
         "path": "lmms-lab/RefCOCOplus",
-        "config": …
-        "split": "…
-        "description": "RefCOCO with no location words (…
+        "config": "default",
+        "split": "val",
+        "description": "RefCOCO with no location words (3.8K validation)",
     },
     "RefCOCOg": {
         "path": "lmms-lab/RefCOCOg",
-        "config": …
-        "split": "…
-        "description": "RefCOCO with longer expressions (…
+        "config": "default",
+        "split": "val",
+        "description": "RefCOCO with longer expressions (7.5K validation)",
     },
     "POPE": {
         "path": "lmms-lab/POPE",
-        "config": …
+        "config": "default",
         "split": "test",
-        "description": "…
+        "description": "Object probing evaluation (9K test)",
     },
     "ScienceQA": {
         "path": "lmms-lab/ScienceQA",
-        "config": …
-        "split": "…
-        "description": "Science question answering (…
+        "config": "ScienceQA-FULL",
+        "split": "validation",
+        "description": "Science question answering (4.2K validation)",
     },
     "MM-GCoT": {
         "path": "AQUA6/MM-GCoT",
-        "config": …
+        "config": "train",
         "split": "train",
-        "description": "Multi-Modal Graph…
+        "description": "Multi-Modal Graph CoT (63.9K training)",
     },
     "VGR": {
         "path": "BytedanceDouyinContent/VGR",
-        "config": …
+        "config": "default",
         "split": "train",
-        "description": "Visual Grounding & Reasoning (90K…
+        "description": "Visual Grounding & Reasoning (90K training)",
     },
 }
 
@@ -224,7 +218,7 @@ def load_benchmark_example(dataset_name, index=0):
 
     # Load dataset with config and split
     print(f"Loading {dataset_name} from {dataset_path} (config={dataset_config}, split={dataset_split})...")
-    if dataset_config:
+    if dataset_config and dataset_config != "None":
         dataset = load_dataset(dataset_path, dataset_config, split=dataset_split, streaming=True)
     else:
         dataset = load_dataset(dataset_path, split=dataset_split, streaming=True)

@@ -262,6 +256,13 @@ def load_benchmark_example(dataset_name, index=0):
         traceback.print_exc()
         return None, error_msg, "", "", error_msg
 
+def load_random_benchmark_example(dataset_name):
+    """Load a random example from benchmark for inference"""
+    import random
+    # Use random index between 0-99 for faster loading
+    random_index = random.randint(0, 99)
+    return load_benchmark_example(dataset_name, random_index)
+
 # =============================================================================
 # Utility Functions
 # =============================================================================

@@ -650,6 +651,16 @@ def create_demo():
 
                     submit_btn = gr.Button("Run Analysis", variant="primary", size="lg")
                     clear_btn = gr.Button("Clear", size="sm")
+
+                    gr.Markdown("---")
+                    gr.Markdown("**Load Random Benchmark Example:**")
+                    benchmark_select = gr.Dropdown(
+                        choices=list(BENCHMARK_DATASETS.keys()),
+                        value="GQA",
+                        label="Select Benchmark",
+                        scale=1,
+                    )
+                    load_random_btn = gr.Button("🎲 Load Random Example", variant="secondary")
 
                 with gr.Column(scale=1):
                     # Output

@@ -732,6 +743,12 @@ def create_demo():
                 fn=lambda: (None, "", "", "", None, ""),
                 outputs=[image_input, question_input, bbox_output, answer_output, image_output, info_output],
             )
+
+            load_random_btn.click(
+                fn=load_random_benchmark_example,
+                inputs=[benchmark_select],
+                outputs=[image_input, question_input, bbox_output, answer_output, info_output],
+            )
 
             # ============================================================
             # Tab 2: Benchmark Explorer