Spaces:

SaylorTwift
/

OpenEvalsDetails

Running

App Files Files Community

Linker1907 committed on Apr 7

Commit

22f9e0d

1 Parent(s): 6639773

init

Browse files

Files changed (1) hide show

app.py +222 -0

app.py ADDED Viewed

	@@ -0,0 +1,222 @@

+from datasets import load_dataset
+import json
+import gradio as gr
+# Benchmark identifiers offered in the "Benchmark" dropdown.
+# Before an identifier is used as a dataset config name, its "|" and ":"
+# characters are replaced with "_" (see get_available_splits and
+# load_details_and_results).
+# NOTE(review): hardcoded list, said to mirror the subsets in experiments.json,
+# which is not visible from this file -- confirm the two are still in sync.
+BENCHMARKS = [
+    "custom|gpqa:diamond|0",
+    "custom|aime24|0",
+    "custom|aime25|0",
+    "extended|ifeval|0"
+]
+from datasets import get_dataset_split_names
+# Details repositories offered in the "Repository Name" dropdown. The first
+# entry is used as the default selection; any other repository can still be
+# typed in through the "Custom" option of the UI.
+REPO_OPTIONS = [
+    "OpenEvals/details_meta-llama__Llama-4-Maverick-17B-128E-Instruct-FP8_private",
+    "OpenEvals/details_meta-llama__Llama-4-Scout-17B-16E-Instruct_private",
+    # Add more common repositories as needed
+]
+def get_available_splits(repo, benchmark):
+    """Return the split names available for one benchmark of a details repo.
+
+    Args:
+        repo: Name of the dataset repository to inspect.
+        benchmark: Benchmark identifier such as ``"custom|aime24|0"``.
+
+    Returns:
+        Whatever ``get_dataset_split_names`` returns for the derived config.
+    """
+    # The config name is the benchmark identifier with "|" and ":" turned
+    # into underscores.
+    to_underscores = str.maketrans("|:", "__")
+    config = benchmark.translate(to_underscores)
+    return get_dataset_split_names(repo, config_name=config)
+def load_details_and_results(repo, subset, split):
+    """Load the per-example details and the aggregated results of one run.
+
+    Args:
+        repo: Name of the dataset repository holding the evaluation details.
+        subset: Benchmark identifier such as ``"custom|aime24|0"``; "|" and
+            ":" are replaced with "_" to obtain the dataset config name.
+        split: Split to load, used for both the details and the "results"
+            config.
+
+    Returns:
+        A ``(details, results)`` tuple. ``details`` is the loaded dataset
+        reduced to the ``full_prompt``, ``gold``, ``metrics`` and
+        ``predictions`` columns, with ``gold`` replaced by its first element.
+        ``results`` is the evaluated ``"results"`` field of the first row of
+        the "results" config.
+    """
+    def worker(example):
+        # Keep only the first gold entry; the other columns pass through
+        # untouched.
+        example["gold"] = example["gold"][0]
+        return example
+    config_name = subset.replace("|", "_").replace(":", "_")
+    details = load_dataset(repo, config_name, split=split)
+    results = load_dataset(repo, "results", split=split)
+    # NOTE(security): eval() executes text stored in the dataset. The UI lets
+    # users type any repository name, so this can run code from an untrusted
+    # source. Left unchanged because the stored format is not visible here;
+    # TODO: switch to json.loads / ast.literal_eval once the format is confirmed.
+    results = eval(results[0]["results"])
+    columns_to_keep = ['full_prompt', 'gold', 'metrics', 'predictions']
+    details = details.select_columns(columns_to_keep)
+    details = details.map(worker)
+    return details, results
+def update_splits(repo, benchmark):
+    """Build the "Split" dropdown for the given repository and benchmark.
+
+    Args:
+        repo: Name of the dataset repository to inspect.
+        benchmark: Benchmark identifier whose splits should be listed.
+
+    Returns:
+        A ``gr.Dropdown`` whose choices are the available splits, preselecting
+        the first one, or nothing when there are no splits.
+    """
+    available = get_available_splits(repo, benchmark)
+    if available:
+        preselected = available[0]
+    else:
+        preselected = None
+    return gr.Dropdown(choices=available, value=preselected)
+def display_model_details(repo_name, benchmark, split, example_index):
+    """Render one evaluation example as an HTML fragment.
+
+    The fragment contains the ground truth, the prompt (collapsed), the
+    per-example metrics table and the first prediction with its word count.
+
+    Args:
+        repo_name: Name of the dataset repository holding the details.
+        benchmark: Benchmark identifier such as ``"custom|aime24|0"``.
+        split: Split to load.
+        example_index: Position of the example to show. Accepts an int or a
+            whole-number float, since numeric UI fields may deliver floats.
+
+    Returns:
+        An HTML string. If loading or indexing fails, a one-line error message
+        is returned instead of raising.
+    """
+    from html import escape
+    def esc(value):
+        # Every interpolated value comes from a dataset or a text box, so
+        # escape "&", "<" and ">" to display it rather than parse it as markup.
+        return escape(str(value), quote=False)
+    try:
+        # Load details for the specific model, benchmark and split
+        details, _ = load_details_and_results(repo_name, benchmark, split)
+        # Dataset indexing needs a real int; a float such as 3.0 is rejected.
+        example = details[int(example_index)]
+    except Exception as e:
+        return f"Error loading model details: {esc(e)}"
+    # Collect the pieces in a list and join once at the end.
+    parts = ["<div style='max-width: 800px; margin: 0 auto;'>\n\n"]
+    # Ground Truth section
+    parts.append("<div style='background: #e6f3e6; padding: 20px; border-radius: 10px; margin-bottom: 20px;'>\n")
+    parts.append("<h3 style='margin-top: 0;'>Ground Truth</h3>\n")
+    parts.append("<div style='overflow-x: auto; max-width: 100%;'>\n")
+    parts.append(f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 0;'><code>{esc(example['gold'])}</code></pre>\n")
+    parts.append("</div>\n")
+    parts.append("</div>\n")
+    # Model output section
+    parts.append("<div style='background: #f5f5f5; padding: 20px; margin-bottom: 20px; border-radius: 10px;'>\n")
+    parts.append(f"<h2 style='margin-top: 0;'>{esc(repo_name)}</h2>\n")
+    parts.append(f"<p style='color: #666;'>Split: {esc(split)}</p>\n")
+    # Prompt section
+    parts.append("<details style='margin-bottom: 15px;'>\n")
+    parts.append("<summary><h3 style='display: inline; margin: 0;'>Prompt</h3></summary>\n")
+    parts.append("<div style='background: #ffffff; padding: 15px; border-radius: 5px; margin-top: 10px;'>\n")
+    parts.append("<div style='overflow-x: auto;'>\n")
+    prompt = example['full_prompt']
+    if isinstance(prompt, list):
+        # A list prompt is rendered message by message; entries that are not
+        # role/content dicts fall back to their string form.
+        for msg in prompt:
+            if isinstance(msg, dict) and 'role' in msg and 'content' in msg:
+                role = esc(str(msg['role']).title())
+                content = esc(msg['content'])
+                parts.append("<div style='margin-bottom: 10px;'>\n")
+                parts.append(f"<strong>{role}:</strong>\n")
+                parts.append(f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 5px 0; background: #f8f8f8; padding: 10px; border-radius: 5px;'><code>{content}</code></pre>\n")
+                parts.append("</div>\n")
+            else:
+                content = esc(msg)
+                parts.append(f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 5px 0; background: #f8f8f8; padding: 10px; border-radius: 5px;'><code>{content}</code></pre>\n")
+    else:
+        prompt_text = esc(prompt)
+        parts.append(f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 0; background: #f8f8f8; padding: 10px; border-radius: 5px;'><code>{prompt_text}</code></pre>\n")
+    parts.append("</div>\n")
+    parts.append("</div>\n")
+    parts.append("</details>\n\n")
+    # Metrics section
+    parts.append("<details open style='margin-bottom: 15px;'>\n")
+    parts.append("<summary><h3 style='display: inline; margin: 0;'>Metrics</h3></summary>\n")
+    metrics = example['metrics']
+    if isinstance(metrics, str):
+        # NOTE(security): eval() executes text stored in the dataset, and the
+        # repository name can be typed freely in the UI. Left unchanged because
+        # the stored format is not visible here; TODO: replace with
+        # json.loads / ast.literal_eval once the format is confirmed.
+        metrics = eval(metrics)
+    parts.append("<div style='overflow-x: auto;'>\n")
+    parts.append("<table style='width: 100%; margin: 10px 0; border-collapse: collapse;'>\n")
+    for key, value in metrics.items():
+        if isinstance(value, float):
+            value = f"{value:.3f}"
+        parts.append(f"<tr><td style='padding: 5px; border-bottom: 1px solid #ddd;'><strong>{esc(key)}</strong></td><td style='padding: 5px; border-bottom: 1px solid #ddd;'>{esc(value)}</td></tr>\n")
+    parts.append("</table>\n")
+    parts.append("</div>\n")
+    parts.append("</details>\n\n")
+    # Prediction section: only the first prediction is shown.
+    prediction = example['predictions'][0] if example['predictions'] else ''
+    parts.append("<details open style='margin-bottom: 15px;'>\n")
+    parts.append("<summary><h3 style='display: inline; margin: 0;'>Prediction</h3>")
+    # Count words on the raw text, before escaping changes it.
+    word_count = len(prediction.split())
+    parts.append(f"<span style='color: #666; font-size: 0.8em; margin-left: 10px;'>({word_count} words)</span>")
+    parts.append("</summary>\n")
+    parts.append("<div style='background: #ffffff; padding: 15px; border-radius: 5px; margin-top: 10px;'>\n")
+    parts.append("<div style='overflow-x: auto;'>\n")
+    parts.append(f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 0;'><code>{esc(prediction)}</code></pre>\n")
+    parts.append("</div>\n")
+    parts.append("</div>\n")
+    parts.append("</details>\n")
+    parts.append("</div>\n</div>")
+    return "".join(parts)
+# Build the Gradio UI: repository / benchmark / split pickers, an example
+# index field, and an HTML panel showing the selected example.
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# Model Generation Details")
+    gr.Markdown("View detailed outputs for a specific model")
+    with gr.Row():
+        repo_select = gr.Radio(
+            choices=["Choose from list", "Custom"],
+            label="Repository Selection Method",
+            value="Choose from list",
+            info="Select how you want to specify the repository"
+        )
+    with gr.Row():
+        repo_dropdown = gr.Dropdown(
+            choices=REPO_OPTIONS,
+            label="Repository Name",
+            value=REPO_OPTIONS[0] if REPO_OPTIONS else None,
+            visible=True,
+            info="Select from predefined repositories"
+        )
+        repo_custom = gr.Textbox(
+            label="Custom Repository Name",
+            placeholder="e.g. OpenEvals/details_custom_model_private",
+            visible=False,
+            info="Enter custom repository name"
+        )
+    with gr.Row():
+        benchmark = gr.Dropdown(
+            label="Benchmark",
+            choices=BENCHMARKS,
+            value=BENCHMARKS[0],
+            info="Select the benchmark subset"
+        )
+        split = gr.Dropdown(
+            label="Split",
+            choices=[],
+            info="Select the evaluation split"
+        )
+    with gr.Row():
+        example_index = gr.Number(
+            label="Example Index",
+            value=0,
+            step=1,
+            # precision=0 makes the component hand an int to the handler.
+            precision=0,
+            info="Navigate through different examples"
+        )
+        submit_btn = gr.Button("Show Results", variant="primary")
+    def toggle_repo_input(choice):
+        """Show the dropdown or the free-text box, matching the chosen method."""
+        return {
+            repo_dropdown: gr.update(visible=(choice == "Choose from list")),
+            repo_custom: gr.update(visible=(choice == "Custom"))
+        }
+    def get_active_repo(selection_method, dropdown_value, custom_value):
+        """Return the repository name from whichever input is currently active."""
+        if selection_method == "Custom":
+            # Tolerate an empty box and stray whitespace from copy/paste.
+            return (custom_value or "").strip()
+        return dropdown_value
+    def refresh_splits(selection_method, dropdown_value, custom_value, bench):
+        """Recompute the "Split" choices for the active repository and benchmark."""
+        repo = get_active_repo(selection_method, dropdown_value, custom_value)
+        if not repo:
+            # Nothing to query yet (e.g. "Custom" selected with an empty box).
+            return gr.Dropdown(choices=[], value=None)
+        return update_splits(repo, bench)
+    def show_details(selection_method, dropdown_value, custom_value, bench, split_val, idx):
+        """Render the selected example for the active repository."""
+        return display_model_details(
+            get_active_repo(selection_method, dropdown_value, custom_value),
+            bench,
+            split_val,
+            idx
+        )
+    # Swap which repository input is visible when the method changes.
+    repo_select.change(
+        fn=toggle_repo_input,
+        inputs=[repo_select],
+        outputs=[repo_dropdown, repo_custom]
+    )
+    # The available splits depend on both repository and benchmark, so refresh
+    # them when either changes, and once on page load so the default selection
+    # is usable without touching the benchmark dropdown first.
+    split_inputs = [repo_select, repo_dropdown, repo_custom, benchmark]
+    benchmark.change(fn=refresh_splits, inputs=split_inputs, outputs=split)
+    repo_dropdown.change(fn=refresh_splits, inputs=split_inputs, outputs=split)
+    repo_select.change(fn=refresh_splits, inputs=split_inputs, outputs=split)
+    # Refresh on Enter, not on every keystroke, to avoid querying partial names.
+    repo_custom.submit(fn=refresh_splits, inputs=split_inputs, outputs=split)
+    demo.load(fn=refresh_splits, inputs=split_inputs, outputs=split)
+    # Display results
+    output = gr.HTML()
+    submit_btn.click(
+        fn=show_details,
+        inputs=[repo_select, repo_dropdown, repo_custom, benchmark, split, example_index],
+        outputs=output
+    )
+# Launch the Gradio app only when this file is run as a script, not on import.
+if __name__ == "__main__":
+    demo.launch()