import json

import gradio as gr
from datasets import load_dataset, get_dataset_split_names

# Load experiments.json to get model configurations
with open('experiments.json', 'r') as f:
    EXPERIMENTS = json.load(f)

# Get all unique benchmark subsets from experiments.json
BENCHMARKS = []
for model_config in EXPERIMENTS.values():
    for benchmark in model_config['benchmarks'].values():
        subset = benchmark['subset']
        if subset not in BENCHMARKS:
            BENCHMARKS.append(subset)
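
# Assumed shape of experiments.json, inferred from the lookups above and in
# get_available_benchmarks below (illustrative sketch only; key names and any
# extra fields in the real file may differ):
# {
#     "meta-llama/Llama-3.3-70B-Instruct": {
#         "benchmarks": {
#             "gpqa_diamond": {"subset": "lighteval|gpqa:diamond|0"},
#             ...
#         }
#     },
#     ...
# }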

# Predefined evaluation-details repositories
REPO_OPTIONS = [
    "OpenEvals/details_gpt-4o_private",
    "OpenEvals/details_claude-3-7-sonnet-20250219_private",
    "OpenEvals/details_o3-mini-2025-01-31_private",
    "OpenEvals/details_moonshotai__Moonlight-16B-A3B-Instruct_private",
    "OpenEvals/details_meta-llama__Llama-3.3-70B-Instruct_private",
    "OpenEvals/details_deepseek-ai__DeepSeek-R1-Distill-Llama-70B_private",
    "OpenEvals/details_qihoo360__TinyR1-32B-Preview_private",
    "OpenEvals/details_openai__gpt-4.5-preview-2025-02-27_private",
    "OpenEvals/details_deepseek-ai__DeepSeek-R1-Distill-Qwen-32B_private",
    "OpenEvals/details_openai__deepseek-ai__DeepSeek-R1_private",
    "OpenEvals/details_Qwen__QwQ-32B_private",
    "OpenEvals/details_google__gemma-3-1b-it_private",
    "OpenEvals/details_google__gemma-3-12b-it_private",
    "OpenEvals/details_google__gemma-3-27b-it_private",
    "OpenEvals/details_openai__deepseek-ai__DeepSeek-V3-0324_private",
    "OpenEvals/details_openai__deepseek-ai__DeepSeek-V3_private",
    "OpenEvals/details_meta-llama__Llama-4-Scout-17B-16E-Instruct_private",
    "OpenEvals/details_meta-llama__Llama-4-Maverick-17B-128E-Instruct-FP8_private"
]

def get_model_name_from_repo(repo):
    # Extract the model name from a repository path, e.g.
    # "OpenEvals/details_meta-llama__Llama-4-Maverick-17B-128E-Instruct-FP8_private"
    # -> "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
    parts = repo.split('/')
    model_name = parts[1].replace('details_', '').replace('_private', '')
    # Convert double underscores back to forward slashes
    model_name = model_name.replace('__', '/')
    return model_name
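
# Illustrative usage (the mapping follows the replacements above):
#   get_model_name_from_repo("OpenEvals/details_Qwen__QwQ-32B_private")
#   -> "Qwen/QwQ-32B"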

def get_available_benchmarks(repo):
    model_name = get_model_name_from_repo(repo)
    print(model_name)
    if not model_name or model_name not in EXPERIMENTS:
        return []
    model_config = EXPERIMENTS[model_name]
    print(model_config)
    return [benchmark['subset'] for benchmark in model_config['benchmarks'].values()]

def get_available_splits(repo, benchmark):
    if not benchmark:
        return []
    # Dataset config names replace "|" and ":" with "_",
    # e.g. "lighteval|gpqa:diamond|0" -> "lighteval_gpqa_diamond_0"
    return get_dataset_split_names(repo, config_name=benchmark.replace("|", "_").replace(":", "_"))

def load_details_and_results(repo, subset, split):
    def worker(example):
        # Currently a pass-through: the kept fields are copied unchanged
        example["predictions"] = example["predictions"]
        example["gold"] = example["gold"]
        example["metrics"] = example["metrics"]
        return example

    details = load_dataset(repo, subset.replace("|", "_").replace(":", "_"), split=split)
    results = load_dataset(repo, "results", split=split)
    # The aggregate results are stored as a string; eval() reconstructs the object
    results = eval(results[0]["results"])

    columns_to_keep = ['full_prompt', 'gold', 'metrics', 'predictions']
    details = details.select_columns(columns_to_keep)
    details = details.map(worker)

    return details, results

def update_splits(repo, benchmark):
    splits = get_available_splits(repo, benchmark)
    return gr.Dropdown(choices=splits, value=splits[0] if splits else None)

def display_model_details(repo_name, benchmark, split, example_index):
    try:
        # Load details for the specific model, benchmark and split
        details, _ = load_details_and_results(repo_name, benchmark, split)
        # gr.Number may pass a float; dataset indexing needs an int
        example = details[int(example_index)]
    except Exception as e:
        return f"Error loading model details: {str(e)}"

    # Create HTML output
    html_output = "<div style='max-width: 800px; margin: 0 auto;'>\n\n"

    # Ground Truth section
    html_output += "<div style='background: #e6f3e6; padding: 20px; border-radius: 10px; margin-bottom: 20px;'>\n"
    html_output += "<h3 style='margin-top: 0;'>Ground Truth</h3>\n"
    html_output += "<div style='overflow-x: auto; max-width: 100%;'>\n"
    html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 0;'><code>{example['gold']}</code></pre>\n"
    html_output += "</div>\n"
    html_output += "</div>\n"

    # Model output section
    html_output += "<div style='background: #f5f5f5; padding: 20px; margin-bottom: 20px; border-radius: 10px;'>\n"
    html_output += f"<h2 style='margin-top: 0;'>{repo_name}</h2>\n"
    html_output += f"<p style='color: #666;'>Split: {split}</p>\n"

    # Prompt section
    html_output += "<details style='margin-bottom: 15px;'>\n"
    html_output += "<summary><h3 style='display: inline; margin: 0;'>Prompt</h3></summary>\n"
    html_output += "<div style='background: #ffffff; padding: 15px; border-radius: 5px; margin-top: 10px;'>\n"
    html_output += "<div style='overflow-x: auto;'>\n"
    prompt = example['full_prompt']
    if isinstance(prompt, list):
        for msg in prompt:
            if isinstance(msg, dict) and 'role' in msg and 'content' in msg:
                role = msg['role'].title()
                # Escape HTML so raw tags in the content render as text
                content = msg['content'].replace('<', '&lt;').replace('>', '&gt;')
                html_output += f"<div style='margin-bottom: 10px;'>\n"
                html_output += f"<strong>{role}:</strong>\n"
                html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 5px 0; background: #f8f8f8; padding: 10px; border-radius: 5px;'><code>{content}</code></pre>\n"
                html_output += "</div>\n"
            else:
                content = str(msg).replace('<', '&lt;').replace('>', '&gt;')
                html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 5px 0; background: #f8f8f8; padding: 10px; border-radius: 5px;'><code>{content}</code></pre>\n"
    else:
        prompt_text = str(prompt).replace('<', '&lt;').replace('>', '&gt;')
        html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 0; background: #f8f8f8; padding: 10px; border-radius: 5px;'><code>{prompt_text}</code></pre>\n"
    html_output += "</div>\n"
    html_output += "</div>\n"
    html_output += "</details>\n\n"

    # Metrics section
    html_output += "<details open style='margin-bottom: 15px;'>\n"
    html_output += "<summary><h3 style='display: inline; margin: 0;'>Metrics</h3></summary>\n"
    metrics = example['metrics']
    if isinstance(metrics, str):
        metrics = eval(metrics)
    html_output += "<div style='overflow-x: auto;'>\n"
    html_output += "<table style='width: 100%; margin: 10px 0; border-collapse: collapse;'>\n"
    for key, value in metrics.items():
        if isinstance(value, float):
            value = f"{value:.3f}"
        html_output += f"<tr><td style='padding: 5px; border-bottom: 1px solid #ddd;'><strong>{key}</strong></td><td style='padding: 5px; border-bottom: 1px solid #ddd;'>{value}</td></tr>\n"
    html_output += "</table>\n"
    html_output += "</div>\n"
    html_output += "</details>\n\n"
    # Prediction section
    prediction = example['predictions'][0] if example['predictions'] else ''
    html_output += "<details open style='margin-bottom: 15px;'>\n"
    html_output += "<summary><h3 style='display: inline; margin: 0;'>Prediction</h3>"
    word_count = len(prediction.split())
    html_output += f"<span style='color: #666; font-size: 0.8em; margin-left: 10px;'>({word_count} words)</span>"
    html_output += "</summary>\n"
    html_output += "<div style='background: #ffffff; padding: 15px; border-radius: 5px; margin-top: 10px;'>\n"
    html_output += "<div style='overflow-x: auto;'>\n"
    # Escape HTML in the model output as well
    prediction = prediction.replace('<', '&lt;').replace('>', '&gt;')
    html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 0;'><code>{prediction}</code></pre>\n"
    html_output += "</div>\n"
    html_output += "</div>\n"
    html_output += "</details>\n"

    html_output += "</div>\n</div>"
    return html_output

# Create the Gradio interface
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Model Generation Details")
    gr.Markdown("View detailed outputs for a specific model")

    with gr.Row():
        repo_select = gr.Radio(
            choices=["Choose from list", "Custom"],
            label="Repository Selection Method",
            value="Choose from list",
            info="Select how you want to specify the repository"
        )

    with gr.Row():
        repo_dropdown = gr.Dropdown(
            choices=REPO_OPTIONS,
            label="Repository Name",
            value=REPO_OPTIONS[0] if REPO_OPTIONS else None,
            visible=True,
            info="Select from predefined repositories"
        )
        repo_custom = gr.Textbox(
            label="Custom Repository Name",
            placeholder="e.g. OpenEvals/details_custom_model_private",
            visible=False,
            info="Enter custom repository name"
        )

    with gr.Row():
        benchmark_select = gr.Radio(
            choices=["Choose from list", "Custom"],
            label="Benchmark Selection Method",
            value="Choose from list",
            info="Select how you want to specify the benchmark"
        )

    with gr.Row():
        benchmark_dropdown = gr.Dropdown(
            label="Benchmark",
            choices=[],
            info="Select the benchmark subset",
            visible=True
        )
        benchmark_custom = gr.Textbox(
            label="Custom Benchmark",
            placeholder="e.g. lighteval|gpqa:diamond|0",
            visible=False,
            info="Enter custom benchmark name"
        )

    split = gr.Dropdown(
        label="Split",
        choices=[],
        info="Select the evaluation split"
    )
    load_splits_btn = gr.Button("Load Splits", variant="secondary")

    with gr.Row():
        example_index = gr.Number(
            label="Example Index",
            value=0,
            step=1,
            info="Navigate through different examples"
        )
        submit_btn = gr.Button("Show Results", variant="primary")

    # Toggle visibility between the predefined repository dropdown and the custom textbox
    def toggle_repo_input(choice):
        return {
            repo_dropdown: gr.update(visible=(choice == "Choose from list")),
            repo_custom: gr.update(visible=(choice == "Custom"))
        }

    # Resolve the active repository name from the selection method
    def get_active_repo(selection_method, dropdown_value, custom_value):
        return custom_value if selection_method == "Custom" else dropdown_value

    # Toggle visibility between the benchmark dropdown and the custom textbox
    def toggle_benchmark_input(choice):
        return {
            benchmark_dropdown: gr.update(visible=(choice == "Choose from list")),
            benchmark_custom: gr.update(visible=(choice == "Custom"))
        }

    # Resolve the active benchmark name from the selection method
    def get_active_benchmark(selection_method, dropdown_value, custom_value):
        return custom_value if selection_method == "Custom" else dropdown_value

    # Wire up the selection-method toggles
    repo_select.change(
        fn=toggle_repo_input,
        inputs=[repo_select],
        outputs=[repo_dropdown, repo_custom]
    )
    benchmark_select.change(
        fn=toggle_benchmark_input,
        inputs=[benchmark_select],
        outputs=[benchmark_dropdown, benchmark_custom]
    )

    # Refresh the available benchmarks when the repository changes
    def update_benchmarks(selection_method, dropdown_value, custom_value):
        repo = get_active_repo(selection_method, dropdown_value, custom_value)
        available_benchmarks = get_available_benchmarks(repo)
        print(available_benchmarks)
        return gr.Dropdown(choices=available_benchmarks, value=available_benchmarks[0] if available_benchmarks else None)

    repo_dropdown.change(
        fn=update_benchmarks,
        inputs=[repo_select, repo_dropdown, repo_custom],
        outputs=benchmark_dropdown
    )
    repo_custom.change(
        fn=update_benchmarks,
        inputs=[repo_select, repo_dropdown, repo_custom],
        outputs=benchmark_dropdown
    )

    # Clear the split dropdown whenever the benchmark changes
    benchmark_dropdown.change(
        fn=lambda selection_method, dropdown, custom, bench: gr.Dropdown(choices=[], value=None),
        inputs=[repo_select, repo_dropdown, repo_custom, benchmark_dropdown],
        outputs=split
    )
    benchmark_custom.change(
        fn=lambda selection_method, dropdown, custom, bench: gr.Dropdown(choices=[], value=None),
        inputs=[repo_select, repo_dropdown, repo_custom, benchmark_custom],
        outputs=split
    )

    # Handler for the Load Splits button
    load_splits_btn.click(
        fn=lambda selection_method, dropdown, custom, bench_selection_method, bench_dropdown, bench_custom: update_splits(
            get_active_repo(selection_method, dropdown, custom),
            get_active_benchmark(bench_selection_method, bench_dropdown, bench_custom)
        ),
        inputs=[repo_select, repo_dropdown, repo_custom, benchmark_select, benchmark_dropdown, benchmark_custom],
        outputs=split
    )

    # Display results
    output = gr.HTML()
    submit_btn.click(
        fn=lambda repo_selection_method, repo_dropdown, repo_custom, bench_selection_method, bench_dropdown, bench_custom, split_val, idx: display_model_details(
            get_active_repo(repo_selection_method, repo_dropdown, repo_custom),
            get_active_benchmark(bench_selection_method, bench_dropdown, bench_custom),
            split_val,
            idx
        ),
        inputs=[repo_select, repo_dropdown, repo_custom, benchmark_select, benchmark_dropdown, benchmark_custom, split, example_index],
        outputs=output
    )

if __name__ == "__main__":
    demo.launch()