Spaces:

vidore
/

vidore-leaderboard

Running

App Files Files Community

new_benchmark_2

by QuentinJG - opened Mar 17

base: refs/heads/main

←

from: refs/pr/7

Discussion Files changed

+282

-81

Files changed (4) hide show

app.py +162 -43
app/utils.py +25 -13
data/dataset_handler.py +35 -0
data/model_handler.py +60 -25

app.py CHANGED Viewed

@@ -3,18 +3,36 @@ import gradio as gr
 from app.utils import add_rank_and_format, filter_models, get_refresh_function
 from data.model_handler import ModelHandler
-METRICS = ["ndcg_at_5", "recall_at_1"]
 def main():
     model_handler = ModelHandler()
     initial_metric = "ndcg_at_5"
-    data = model_handler.get_vidore_data(initial_metric)
-    data = add_rank_and_format(data)
-    NUM_DATASETS = len(data.columns) - 3
-    NUM_SCORES = len(data) * NUM_DATASETS
-    NUM_MODELS = len(data)
     css = """
     table > thead {
@@ -41,65 +59,167 @@ def main():
     with gr.Blocks(css=css) as block:
         with gr.Tabs():
-            with gr.TabItem("🏆 Leaderboard"):
-                gr.Markdown("# ViDoRe: The Visual Document Retrieval Benchmark 📚🔍")
-                gr.Markdown("### From the paper - ColPali: Efficient Document Retrieval with Vision Language Models 👀")
                 gr.Markdown(
                     """
-                Visual Document Retrieval Benchmark leaderboard. To submit results, refer to the corresponding tab.
-                Refer to the [ColPali paper](https://arxiv.org/abs/2407.01449) for details on metrics, tasks and models.
                 """
                 )
-                datasets_columns = list(data.columns[3:])
-                anchor_columns = list(data.columns[:3])
-                default_columns = anchor_columns + datasets_columns
                 with gr.Row():
-                    metric_dropdown = gr.Dropdown(choices=METRICS, value=initial_metric, label="Select Metric")
-                    research_textbox = gr.Textbox(placeholder="🔍 Search Models... [press enter]", label="Filter Models by Name", )
-                    column_checkboxes = gr.CheckboxGroup(choices=datasets_columns, value=default_columns, label="Select Columns to Display")
                 with gr.Row():
-                    datatype = ["number", "markdown"] + ["number"] * (NUM_DATASETS + 1)
-                    dataframe = gr.Dataframe(data, datatype=datatype, type="pandas")
-                def update_data(metric, search_term, selected_columns):
-                    data = model_handler.get_vidore_data(metric)
-                    data = add_rank_and_format(data)
                     data = filter_models(data, search_term)
                     if selected_columns:
-                        selected_columns = selected_columns
-                        data = data[selected_columns]
                     return data
                 with gr.Row():
-                    refresh_button = gr.Button("Refresh")
-                    refresh_button.click(get_refresh_function(), inputs=[metric_dropdown], outputs=dataframe, concurrency_limit=20)
                 # Automatically refresh the dataframe when the dropdown value changes
-                metric_dropdown.change(get_refresh_function(), inputs=[metric_dropdown], outputs=dataframe)
-                research_textbox.submit(
-                    lambda metric, search_term, selected_columns: update_data(metric, search_term, selected_columns),
-                    inputs=[metric_dropdown, research_textbox, column_checkboxes],
-                    outputs=dataframe
                 )
-                column_checkboxes.change(
-                    lambda metric, search_term, selected_columns: update_data(metric, search_term, selected_columns),
-                    inputs=[metric_dropdown, research_textbox, column_checkboxes],
-                    outputs=dataframe
                 )
-                #column_checkboxes.change(get_refresh_function(), inputs=[metric_dropdown, column_checkboxes], outputs=dataframe)
                 gr.Markdown(
                     f"""
-                - **Total Datasets**: {NUM_DATASETS}
-                - **Total Scores**: {NUM_SCORES}
-                - **Total Models**: {NUM_MODELS}
                 """
                     + r"""
                 Please consider citing:
@@ -143,8 +263,8 @@ def main():
                             },
                         }
                         ```
-                        - The dataset names should be the same as the ViDoRe dataset names listed in the following
-                        collection: [ViDoRe Benchmark](https://huggingface.co/collections/vidore/vidore-benchmark-667173f98e70a1c0fa4db00d).
                     3. **Submit your model**:
                         - Create a public HuggingFace model repository with your model.
@@ -162,6 +282,5 @@ def main():
     block.queue(max_size=10).launch(debug=True)
-if __name__ == "__main__":
     main()

 from app.utils import add_rank_and_format, filter_models, get_refresh_function
 from data.model_handler import ModelHandler
+METRICS = [
+    "ndcg_at_1",
+    "ndcg_at_5",
+    "ndcg_at_10",
+    "ndcg_at_100",
+    "recall_at_1",
+    "recall_at_5",
+    "recall_at_10",
+    "recall_at_100",
+]
 def main():
     model_handler = ModelHandler()
     initial_metric = "ndcg_at_5"
+    model_handler.get_vidore_data(initial_metric)
+    data_benchmark_1 = model_handler.compute_averages(initial_metric, benchmark_version=1)
+    data_benchmark_1 = add_rank_and_format(data_benchmark_1, benchmark_version=1)
+    data_benchmark_2 = model_handler.compute_averages(initial_metric, benchmark_version=2)
+    data_benchmark_2 = add_rank_and_format(data_benchmark_2, benchmark_version=2)
+    NUM_DATASETS_1 = len(data_benchmark_1.columns) - 3
+    NUM_SCORES_1 = len(data_benchmark_1) * NUM_DATASETS_1
+    NUM_MODELS_1 = len(data_benchmark_1)
+    NUM_DATASETS_2 = len(data_benchmark_2.columns) - 3
+    NUM_SCORES_2 = len(data_benchmark_2) * NUM_DATASETS_2
+    NUM_MODELS_2 = len(data_benchmark_2)
     css = """
     table > thead {
     with gr.Blocks(css=css) as block:
         with gr.Tabs():
+            with gr.TabItem("🏆 Leaderboard Benchmark 2"):
+                gr.Markdown("# ViDoRe 2: A new visual Document Retrieval Benchmark 📚🔍")
+                gr.Markdown("### A harder dataset benchmark for visual document retrieval 👀")
                 gr.Markdown(
                     """
+                Visual Document Retrieval Benchmark 2 leaderboard. To submit results, refer to the corresponding tab.
+                Refer to the [ColPali paper](https://arxiv.org/abs/2407.01449) for details on metrics and models.
                 """
                 )
+                datasets_columns_2 = list(data_benchmark_2.columns[3:])
                 with gr.Row():
+                    metric_dropdown_2 = gr.Dropdown(choices=METRICS, value=initial_metric, label="Select Metric")
+                    research_textbox_2 = gr.Textbox(
+                        placeholder="🔍 Search Models... [press enter]",
+                        label="Filter Models by Name",
+                    )
+                    column_checkboxes_2 = gr.CheckboxGroup(
+                        choices=datasets_columns_2, value=datasets_columns_2, label="Select Columns to Display"
+                    )
                 with gr.Row():
+                    datatype_2 = ["number", "markdown"] + ["number"] * (NUM_DATASETS_2 + 1)
+                    dataframe_2 = gr.Dataframe(data_benchmark_2, datatype=datatype_2, type="pandas")
+                def update_data_2(metric, search_term, selected_columns):
+                    model_handler.get_vidore_data(metric)
+                    data = model_handler.compute_averages(metric, benchmark_version=2)
+                    data = add_rank_and_format(data, benchmark_version=2)
                     data = filter_models(data, search_term)
+                    # data = remove_duplicates(data)  # Add this line
                     if selected_columns:
+                        data = data[["Rank", "Model", "Average"] + selected_columns]
                     return data
                 with gr.Row():
+                    refresh_button_2 = gr.Button("Refresh")
+                    refresh_button_2.click(
+                        get_refresh_function(model_handler, benchmark_version=2),
+                        inputs=[metric_dropdown_2],
+                        outputs=dataframe_2,
+                        concurrency_limit=20,
+                    )
+                with gr.Row():
+                    gr.Markdown(
+                        """
+                    **Note**: For now, all models were evaluated using the vidore-benchmark package and custom retrievers on our side.
+                    Those numbers are not numbers obtained from the organisations that released those models.
+                    """
+                    )
                 # Automatically refresh the dataframe when the dropdown value changes
+                metric_dropdown_2.change(
+                    get_refresh_function(model_handler, benchmark_version=2),
+                    inputs=[metric_dropdown_2],
+                    outputs=dataframe_2,
+                )
+                research_textbox_2.submit(
+                    lambda metric, search_term, selected_columns: update_data_2(metric, search_term, selected_columns),
+                    inputs=[metric_dropdown_2, research_textbox_2, column_checkboxes_2],
+                    outputs=dataframe_2,
+                )
+                column_checkboxes_2.change(
+                    lambda metric, search_term, selected_columns: update_data_2(metric, search_term, selected_columns),
+                    inputs=[metric_dropdown_2, research_textbox_2, column_checkboxes_2],
+                    outputs=dataframe_2,
+                )
+                gr.Markdown(
+                    f"""
+                - **Total Datasets**: {NUM_DATASETS_2}
+                - **Total Scores**: {NUM_SCORES_2}
+                - **Total Models**: {NUM_MODELS_2}
+                """
+                    + r"""
+                Please consider citing:
+                ```bibtex
+                @misc{faysse2024colpaliefficientdocumentretrieval,
+                  title={ColPali: Efficient Document Retrieval with Vision Language Models},
+                  author={Manuel Faysse and Hugues Sibille and Tony Wu and Bilel Omrani and Gautier Viaud and Céline Hudelot and Pierre Colombo},
+                  year={2024},
+                  eprint={2407.01449},
+                  archivePrefix={arXiv},
+                  primaryClass={cs.IR},
+                  url={https://arxiv.org/abs/2407.01449},
+                }
+                ```
+                """
                 )
+            with gr.TabItem("🏆 Leaderboard Benchmark 1"):
+                gr.Markdown("# ViDoRe: The Visual Document Retrieval Benchmark 1 📚🔍")
+                gr.Markdown("### From the paper - ColPali: Efficient Document Retrieval with Vision Language Models 👀")
+                gr.Markdown(
+                    """
+                Visual Document Retrieval Benchmark 1 leaderboard. To submit results, refer to the corresponding tab.
+                Refer to the [ColPali paper](https://arxiv.org/abs/2407.01449) for details on metrics, tasks and models.
+                """
                 )
+                datasets_columns_1 = list(data_benchmark_1.columns[3:])
+                with gr.Row():
+                    metric_dropdown_1 = gr.Dropdown(choices=METRICS, value=initial_metric, label="Select Metric")
+                    research_textbox_1 = gr.Textbox(
+                        placeholder="🔍 Search Models... [press enter]",
+                        label="Filter Models by Name",
+                    )
+                    column_checkboxes_1 = gr.CheckboxGroup(
+                        choices=datasets_columns_1, value=datasets_columns_1, label="Select Columns to Display"
+                    )
+                with gr.Row():
+                    datatype_1 = ["number", "markdown"] + ["number"] * (NUM_DATASETS_1 + 1)
+                    dataframe_1 = gr.Dataframe(data_benchmark_1, datatype=datatype_1, type="pandas")
+                def update_data_1(metric, search_term, selected_columns):
+                    model_handler.get_vidore_data(metric)
+                    data = model_handler.compute_averages(metric, benchmark_version=1)
+                    data = add_rank_and_format(data, benchmark_version=1)
+                    data = filter_models(data, search_term)
+                    # data = remove_duplicates(data)  # Add this line
+                    if selected_columns:
+                        data = data[["Rank", "Model", "Average"] + selected_columns]
+                    return data
+                with gr.Row():
+                    refresh_button_1 = gr.Button("Refresh")
+                    refresh_button_1.click(
+                        get_refresh_function(model_handler, benchmark_version=1),
+                        inputs=[metric_dropdown_1],
+                        outputs=dataframe_1,
+                        concurrency_limit=20,
+                    )
+                # Automatically refresh the dataframe when the dropdown value changes
+                metric_dropdown_1.change(
+                    get_refresh_function(model_handler, benchmark_version=1),
+                    inputs=[metric_dropdown_1],
+                    outputs=dataframe_1,
+                )
+                research_textbox_1.submit(
+                    lambda metric, search_term, selected_columns: update_data_1(metric, search_term, selected_columns),
+                    inputs=[metric_dropdown_1, research_textbox_1, column_checkboxes_1],
+                    outputs=dataframe_1,
+                )
+                column_checkboxes_1.change(
+                    lambda metric, search_term, selected_columns: update_data_1(metric, search_term, selected_columns),
+                    inputs=[metric_dropdown_1, research_textbox_1, column_checkboxes_1],
+                    outputs=dataframe_1,
+                )
                 gr.Markdown(
                     f"""
+                - **Total Datasets**: {NUM_DATASETS_1}
+                - **Total Scores**: {NUM_SCORES_1}
+                - **Total Models**: {NUM_MODELS_1}
                 """
                     + r"""
                 Please consider citing:
                             },
                         }
                         ```
+                        - The dataset names should be the same as the ViDoRe and ViDoRe 2 dataset names listed in the following
+                        collections: [ViDoRe Benchmark](https://huggingface.co/collections/vidore/vidore-benchmark-667173f98e70a1c0fa4db00d) and [ViDoRe Benchmark 2](https://huggingface.co/collections/vidore/vidore-benchmark-v2-dev-67ae03e3924e85b36e7f53b0).
                     3. **Submit your model**:
                         - Create a public HuggingFace model repository with your model.
     block.queue(max_size=10).launch(debug=True)
+if __name__ == "__main__":
     main()

app/utils.py CHANGED Viewed

@@ -1,31 +1,43 @@
 from data.model_handler import ModelHandler
 def make_clickable_model(model_name, link=None):
     if link is None:
         desanitized_model_name = model_name.replace("_", "/")
-        if '/captioning' in desanitized_model_name:
-            desanitized_model_name = desanitized_model_name.replace('/captioning', '')
-        if '/ocr' in desanitized_model_name:
-            desanitized_model_name = desanitized_model_name.replace('/ocr', '')
         link = "https://huggingface.co/" + desanitized_model_name
-    return f'<a target="_blank" style="text-decoration: underline" href="{link}">{model_name}</a>'
-def add_rank_and_format(df):
     df = df.reset_index()
     df = df.rename(columns={"index": "Model"})
-    df = ModelHandler.add_rank(df)
     df["Model"] = df["Model"].apply(make_clickable_model)
     return df
-def get_refresh_function():
     def _refresh(metric):
-        model_handler = ModelHandler()
-        data_task_category = model_handler.get_vidore_data(metric)
-        df = add_rank_and_format(data_task_category)
         return df
     return _refresh
@@ -33,5 +45,5 @@ def get_refresh_function():
 def filter_models(data, search_term):
     if search_term:
-        data = data[data['Model'].str.contains(search_term, case=False, na=False)]
-    return data

 from data.model_handler import ModelHandler
 def make_clickable_model(model_name, link=None):
     """Return an HTML anchor that displays a model's readable name.

     The readable name is rebuilt from the sanitized ``model_name``: every
     ``_`` becomes ``/``, the ``-thisisapoint-`` placeholder becomes ``.``,
     and the ``/captioning`` and ``/ocr`` markers are removed.

     Args:
         model_name: Sanitized model identifier.
         link: Target URL. When ``None`` the link points to
             ``https://huggingface.co/<readable name>``.

     Returns:
         An ``<a>`` element (opening in a new tab) as a string.
     """
     # Build the label unconditionally: it is used in the returned markup
     # even when the caller supplies its own ``link``.
     display_name = model_name.replace("_", "/").replace("-thisisapoint-", ".")
     for variant_marker in ("/captioning", "/ocr"):
         display_name = display_name.replace(variant_marker, "")
     if link is None:
         link = "https://huggingface.co/" + display_name
     return f'<a target="_blank" style="text-decoration: underline" href="{link}">{display_name}</a>'
+def add_rank_and_format(df, benchmark_version=1):
     df = df.reset_index()
     df = df.rename(columns={"index": "Model"})
+    df = ModelHandler.add_rank(df, benchmark_version)
     df["Model"] = df["Model"].apply(make_clickable_model)
+    # df = remove_duplicates(df)
     return df
+def remove_duplicates(df):
+    """Remove duplicate models based on their name (after the last '/' if present)."""
+    df["model_name"] = df["Model"].str.replace("_", "/")
+    df = df.sort_values("Rank").drop_duplicates(subset=["model_name"], keep="first")
+    df = df.drop("model_name", axis=1)
+    return df
+def get_refresh_function(model_handler, benchmark_version):
     def _refresh(metric):
+        model_handler.get_vidore_data(metric)
+        data_task_category = model_handler.compute_averages(metric, benchmark_version)
+        df = add_rank_and_format(data_task_category, benchmark_version)
         return df
     return _refresh
 def filter_models(data, search_term):
     """Keep the rows whose ``Model`` value matches ``search_term``.

     Matching is case-insensitive and missing values never match. The term is
     used as a regular expression when it compiles; otherwise (for example an
     unbalanced ``(`` typed in the search box) it is matched as literal text
     instead of raising.

     NOTE(review): the update callbacks in this file call this after
     ``add_rank_and_format``, so ``Model`` holds anchor markup and the term is
     also matched against the URL and tag text - confirm that is intended.

     Args:
         data: DataFrame with a ``Model`` string column.
         search_term: Text to look for. Empty or ``None`` disables filtering.

     Returns:
         ``data`` itself when there is no search term, otherwise the matching
         subset of rows.
     """
     import re  # local import keeps this block self-contained

     if not search_term:
         return data
     try:
         re.compile(search_term)
     except re.error:
         # Not a valid pattern: fall back to plain substring matching.
         treat_as_regex = False
     else:
         treat_as_regex = True
     matches = data["Model"].str.contains(search_term, case=False, na=False, regex=treat_as_regex)
     return data[matches]

data/dataset_handler.py CHANGED Viewed

@@ -11,6 +11,14 @@ VIDORE_DATASETS_KEYWORDS = [
     "healthcare_industry",
 ]
 def get_datasets_nickname(dataset_name) -> str:
     if "arxivqa" in dataset_name:
@@ -41,5 +49,32 @@ def get_datasets_nickname(dataset_name) -> str:
     elif "healthcare_industry" in dataset_name:
         return "Healthcare Industry"
     else:
         raise ValueError(f"Dataset {dataset_name} not found in ViDoRe")

     "healthcare_industry",
 ]
+VIDORE_2_DATASETS_KEYWORDS = [
+    "restaurant_esg",
+    "rse_restaurant",
+    "axa",
+    "mit_biomedical",
+    "economics_macro",
+]
 def get_datasets_nickname(dataset_name) -> str:
     if "arxivqa" in dataset_name:
     elif "healthcare_industry" in dataset_name:
         return "Healthcare Industry"
+    elif "restaurant_esg" in dataset_name:
+        return "ESG Restaurant Human"
+    elif "rse_restaurant" in dataset_name and "multilingual" in dataset_name:
+        return "ESG Restaurant Synthetic Multilingual"
+    elif "rse_restaurant" in dataset_name:
+        return "ESG Restaurant Synthetic"
+    elif "axa" in dataset_name and "multilingual" in dataset_name:
+        return "AXA Multilingual"
+    elif "axa" in dataset_name:
+        return "AXA"
+    elif "mit_biomedical" in dataset_name and "multilingual" in dataset_name:
+        return "MIT Biomedical Multilingual"
+    elif "mit_biomedical" in dataset_name:
+        return "MIT Biomedical"
+    elif "economics_macro" in dataset_name and "multilingual" in dataset_name:
+        return "Economics Macro Multilingual"
+    elif "economics_macro" in dataset_name:
+        return "Economics Macro"
     else:
         raise ValueError(f"Dataset {dataset_name} not found in ViDoRe")

data/model_handler.py CHANGED Viewed

@@ -5,7 +5,7 @@ from typing import Any, Dict
 import pandas as pd
 from huggingface_hub import HfApi, hf_hub_download, metadata_load
-from .dataset_handler import VIDORE_DATASETS_KEYWORDS, get_datasets_nickname
 BLOCKLIST = ["impactframes"]
@@ -29,15 +29,30 @@ class ModelHandler:
     def _are_results_in_new_vidore_format(self, results: Dict[str, Any]) -> bool:
         return "metadata" in results and "metrics" in results
     def get_vidore_data(self, metric="ndcg_at_5"):
         models = self.api.list_models(filter="vidore")
         repositories = [model.modelId for model in models]  # type: ignore
         for repo_id in repositories:
             org_name = repo_id.split("/")[0]
             if org_name in BLOCKLIST:
                 continue
             files = [f for f in self.api.list_repo_files(repo_id) if f.endswith("_metrics.json") or f == "results.json"]
             if len(files) == 0:
@@ -45,39 +60,58 @@ class ModelHandler:
             else:
                 for file in files:
                     if file.endswith("results.json"):
-                        model_name = repo_id.replace("/", "_")
                     else:
                         model_name = file.split("_metrics.json")[0]
-                    if model_name not in self.model_infos:
-                        readme_path = hf_hub_download(repo_id, filename="README.md")
-                        meta = metadata_load(readme_path)
-                        try:
-                            result_path = hf_hub_download(repo_id, filename=file)
-                            with open(result_path) as f:
-                                results = json.load(f)
-                            if self._are_results_in_new_vidore_format(results):
-                                metadata = results["metadata"]
-                                results = results["metrics"]
-                            self.model_infos[model_name] = {"meta": meta, "results": results}
-                        except Exception as e:
-                            print(f"Error loading {model_name} - {e}")
-                            continue
-        # self._save_model_infos()
         model_res = {}
-        if len(self.model_infos) > 0:
-            for model in self.model_infos.keys():
-                res = self.model_infos[model]["results"]
                 dataset_res = {}
                 for dataset in res.keys():
-                    # for each keyword check if it is in the dataset name if not continue
-                    if not any(keyword in dataset for keyword in VIDORE_DATASETS_KEYWORDS):
-                        print(f"{dataset} not found in ViDoRe datasets. Skipping ...")
                         continue
                     dataset_nickname = get_datasets_nickname(dataset)
@@ -90,7 +124,7 @@ class ModelHandler:
         return pd.DataFrame()
     @staticmethod
-    def add_rank(df):
         df.fillna(0.0, inplace=True)
         cols_to_rank = [
             col
@@ -104,6 +138,7 @@ class ModelHandler:
                 "Max Tokens",
             ]
         ]
         if len(cols_to_rank) == 1:
             df.sort_values(cols_to_rank[0], ascending=False, inplace=True)
         else:

 import pandas as pd
 from huggingface_hub import HfApi, hf_hub_download, metadata_load
+from .dataset_handler import VIDORE_2_DATASETS_KEYWORDS, VIDORE_DATASETS_KEYWORDS, get_datasets_nickname
 BLOCKLIST = ["impactframes"]
     def _are_results_in_new_vidore_format(self, results: Dict[str, Any]) -> bool:
         return "metadata" in results and "metrics" in results
+    def _is_baseline_repo(self, repo_id: str) -> bool:
+        return repo_id == "vidore/baseline-results"
+    def sanitize_model_name(self, model_name):
+        return model_name.replace("/", "_").replace(".", "-thisisapoint-")
+    def fuze_model_infos(self, model_name, results):
+        for dataset, metrics in results.items():
+            if dataset not in self.model_infos[model_name]["results"].keys():
+                self.model_infos[model_name]["results"][dataset] = metrics
+            else:
+                continue
     def get_vidore_data(self, metric="ndcg_at_5"):
         models = self.api.list_models(filter="vidore")
         repositories = [model.modelId for model in models]  # type: ignore
+        # Sort repositories to process non-baseline repos first (to prioritize their results)
+        repositories.sort(key=lambda x: self._is_baseline_repo(x))
         for repo_id in repositories:
             org_name = repo_id.split("/")[0]
             if org_name in BLOCKLIST:
                 continue
             files = [f for f in self.api.list_repo_files(repo_id) if f.endswith("_metrics.json") or f == "results.json"]
             if len(files) == 0:
             else:
                 for file in files:
                     if file.endswith("results.json"):
+                        model_name = repo_id.replace("/", "_").replace(".", "-thisisapoint-")
                     else:
                         model_name = file.split("_metrics.json")[0]
+                        model_name = model_name.replace("/", "_").replace(".", "-thisisapoint-")
+                    # Skip if the model is from baseline and we already have results
+                    readme_path = hf_hub_download(repo_id, filename="README.md")
+                    meta = metadata_load(readme_path)
+                    try:
+                        result_path = hf_hub_download(repo_id, filename=file)
+                        with open(result_path) as f:
+                            results = json.load(f)
+                        if self._are_results_in_new_vidore_format(results):
+                            metadata = results["metadata"]
+                            results = results["metrics"]
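+                        # Intended behaviour: when a model appears both in the baseline repo and in its own
+                        # repo, keep the non-baseline results and only add the datasets that are missing.
+                        # NOTE(review): the assignment after this `if` runs unconditionally, so the entry
+                        # merged by fuze_model_infos is replaced by the baseline results; it most likely
+                        # needs to sit in an `else:` branch - confirm and fix.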
+                        if self._is_baseline_repo(repo_id) and self.sanitize_model_name(model_name) in self.model_infos:
+                            self.fuze_model_infos(model_name, results)
+                        self.model_infos[model_name] = {"meta": meta, "results": results}
+                    except Exception as e:
+                        print(f"Error loading {model_name} - {e}")
+                        continue
+    # In order to keep only models relevant to a benchmark
+    def filter_models_by_benchmark(self, benchmark_version=1):
+        filtered_model_infos = {}
+        keywords = VIDORE_DATASETS_KEYWORDS if benchmark_version == 1 else VIDORE_2_DATASETS_KEYWORDS
+        for model, info in self.model_infos.items():
+            results = info["results"]
+            if any(any(keyword in dataset for keyword in keywords) for dataset in results.keys()):
+                filtered_model_infos[model] = info
+        return filtered_model_infos
+    # Compute the average of a metric for each model,
+    def compute_averages(self, metric="ndcg_at_5", benchmark_version=1):
         model_res = {}
+        filtered_model_infos = self.filter_models_by_benchmark(benchmark_version)
+        if len(filtered_model_infos) > 0:
+            for model in filtered_model_infos.keys():
+                res = filtered_model_infos[model]["results"]
                 dataset_res = {}
+                keywords = VIDORE_DATASETS_KEYWORDS if benchmark_version == 1 else VIDORE_2_DATASETS_KEYWORDS
                 for dataset in res.keys():
+                    if not any(keyword in dataset for keyword in keywords):
                         continue
                     dataset_nickname = get_datasets_nickname(dataset)
         return pd.DataFrame()
     @staticmethod
+    def add_rank(df, benchmark_version=1):
         df.fillna(0.0, inplace=True)
         cols_to_rank = [
             col
                 "Max Tokens",
             ]
         ]
         if len(cols_to_rank) == 1:
             df.sort_values(cols_to_rank[0], ascending=False, inplace=True)
         else: