| import csv | |
| import gc | |
| import io | |
| import logging | |
| import os | |
| import re | |
| import traceback | |
| from collections import Counter | |
| from time import time | |
| from typing import Any, Dict, List | |
| import gradio as gr | |
| import pandas as pd | |
| import plotly.express as px | |
| import plotly.graph_objects as go | |
| import psutil | |
| import torch | |
| from transformers import AutoModelForCausalLM, AutoTokenizer | |
| from supertoken import MistralTokenizer, TikTokenTokenizer, TokenMonsterTokenizer | |
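| # Redirect the Hugging Face hub cache to the user's scratch space (assumes a cluster layout with /scratch/$USER available). | |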
| os.environ["HF_HUB_CACHE"] = f"/scratch/{os.environ.get('USER')}/.cache/huggingface/hub" | |
| def get_memory_usage(): | |
| """Return (gpu_mem_used_MB, gpu_mem_total_MB, ram_used_MB, ram_total_MB)""" | |
| # System RAM | |
| vm = psutil.virtual_memory() | |
| ram_used_mb = vm.used / (1024**2) | |
| ram_total_mb = vm.total / (1024**2) | |
| # GPU memory | |
| if torch.cuda.is_available(): | |
| gpu_idx = torch.cuda.current_device() | |
| torch.cuda.synchronize() | |
| gpu_mem_alloc = torch.cuda.memory_allocated(gpu_idx) / (1024**2) | |
| gpu_mem_reserved = torch.cuda.memory_reserved(gpu_idx) / (1024**2) | |
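| # memory_allocated counts live tensors; memory_reserved also includes the caching allocator's pool, | |
| # so taking the max of the two below gives a conservative usage estimate. | |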
| gpu_mem_total = torch.cuda.get_device_properties(gpu_idx).total_memory / ( | |
| 1024**2 | |
| ) | |
| gpu_mem_used = max(gpu_mem_alloc, gpu_mem_reserved) # safe estimate | |
| else: | |
| gpu_mem_used = 0 | |
| gpu_mem_total = 0 | |
| return gpu_mem_used, gpu_mem_total, ram_used_mb, ram_total_mb | |
| # Set up logging | |
| logging.basicConfig(level=logging.INFO) | |
| logger = logging.getLogger(__name__) | |
| # Model configurations - maps display names to HF model paths | |
| PREDEFINED_MODELS = [ | |
| "meta-llama/Llama-3.2-1B", | |
| "meta-llama/Llama-3.2-1B-Instruct", | |
| "meta-llama/Meta-Llama-3-8B-Instruct", | |
| "meta-llama/Meta-Llama-3-8B", | |
| "google/gemma-2-2b", | |
| "google/gemma-2-2b-it", | |
| "google/gemma-3-1b-pt", | |
| "Qwen/Qwen3-0.6B", | |
| "Qwen/Qwen2.5-0.5B", | |
| "Qwen/Qwen2.5-1.5B", | |
| # "Qwen/Qwen2.5-1.5B-Instruct", | |
| "bigscience/bloom-560m", | |
| "CohereForAI/aya-expanse-8b", | |
| "common-pile/comma-v0.1-2t", | |
| "google/byt5-small", | |
| # "gsaltintas/supertoken_models-llama_gpt2", | |
| # "gsaltintas/supertoken_models-llama_google-gemma-2-2b", | |
| # "gsaltintas/supertoken_models-llama_google-gemma-2-2b-100b", | |
| "r-three/supertoken_models-llama_google-gemma-2-2b", | |
| "r-three/supertoken_models-llama_common-pile-comma-v0.1", | |
| "r-three/supertoken_models-llama_meta-llama-Llama-3.2-1B", | |
| "r-three/supertoken_models-llama_microsoft-Phi-3-mini-4k-instruct", | |
| "r-three/supertoken_models-llama_gpt2", | |
| "r-three/supertoken_models-llama_bigscience-bloom", | |
| "r-three/supertoken_models-llama_facebook-xglm-564M", | |
| ] | |
| INDUSTRY_MODELS = [ | |
| "meta-llama/Llama-3.2-1B", | |
| "meta-llama/Llama-3.2-1B-Instruct", | |
| "meta-llama/Meta-Llama-3-8B-Instruct", | |
| "meta-llama/Meta-Llama-3-8B", | |
| "google/gemma-2-2b", | |
| "google/gemma-2-2b-it", | |
| "google/gemma-3-1b-pt", | |
| "Qwen/Qwen3-0.6B", | |
| "Qwen/Qwen2.5-0.5B", | |
| "Qwen/Qwen2.5-1.5B", | |
| # "Qwen/Qwen2.5-1.5B-Instruct", | |
| "bigscience/bloom-560m", | |
| "CohereForAI/aya-expanse-8b", | |
| "common-pile/comma-v0.1-2t", | |
| "google/byt5-small", | |
| ] | |
| TOKSUITE_MODELS = [ | |
| # "gsaltintas/supertoken_models-llama_gpt2", | |
| # "gsaltintas/supertoken_models-llama_google-gemma-2-2b", | |
| # "gsaltintas/supertoken_models-llama_google-gemma-2-2b-100b", | |
| "google-gemma-2-2b", | |
| "common-pile-comma-v0.1", | |
| "meta-llama-Llama-3.2-1B", | |
| "microsoft-Phi-3-mini-4k-instruct", | |
| "gpt2", | |
| "bigscience-bloom", | |
| "facebook-xglm-564M", | |
| "mistralai-tekken", | |
| "tokenmonster-englishcode-32000-consistent-v1", | |
| "google-byt5-small", | |
| "google-bert-bert-base-multilingual-cased", | |
| "Qwen-Qwen3-8B", | |
| "tiktoken-gpt-4o", | |
| ] | |
| # Global cache for loaded models | |
| model_cache = dict() | |
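| # Maps a model path to {"tokenizer": ..., "model": ..., "device": ...} so repeated runs skip reloading weights. | |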
| print(os.environ.get("HF_HUB_CACHE")) | |
| def normalize_delimiter(delim: str) -> str: | |
| delim = delim.strip() | |
| if delim == "\\t": # user typed literal \t | |
| return "\t" | |
| if len(delim) != 1: | |
| raise ValueError(f"Delimiter must be a single character, got {repr(delim)}") | |
| return delim | |
| def parse_dataset(text, delimiter: str = "\t"): | |
| """Parse the input dataset text into structured questions""" | |
| delimiter = normalize_delimiter(delimiter) | |
| def clean_cell(s: str) -> str: | |
| return s.strip().replace("\r", "").replace("\n", " ").strip('"').strip() | |
| if not text.strip(): | |
| return [], "Please enter your dataset" | |
| # Normalize line endings | |
| text = text.replace("\r\n", "\n").replace("\r", "\n") | |
| # Use csv.reader to handle quoted multi-line cells | |
| reader = csv.reader(io.StringIO(text), delimiter=delimiter, quotechar='"') | |
| questions = [] | |
| errors = [] | |
| for i, row in enumerate(reader, 1): | |
| # skip empty rows | |
| if not any(cell.strip() for cell in row): | |
| continue | |
| parts = [clean_cell(p) for p in row] | |
| if len(parts) < 5: | |
| errors.append(f"Line {i}: Not enough columns (need 5, got {len(parts)})") | |
| continue | |
| question = { | |
| "question": parts[0], | |
| "correct_answer": parts[1], | |
| "choices": [parts[2], parts[3], parts[4]], | |
| } | |
| if question["correct_answer"] not in question["choices"]: | |
| question["choices"].append(question["correct_answer"]) | |
| questions.append(question) | |
| error_msg = "\n".join(errors) if errors else "" | |
| return questions, error_msg | |
| def setup_tokenizer(model_path): | |
| tokenizer_name = model_path | |
| if "supertoken" in model_path: | |
| import json | |
| from huggingface_hub import hf_hub_download, list_repo_files | |
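| # supertoken checkpoints ship a tokenizer_config.json whose "name" field selects the wrapped | |
| # tokenizer implementation (tiktoken, tokenmonster, tekken, or a plain HF tokenizer) and whose | |
| # "path" field points at the underlying vocabulary/model to load. | |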
| files = list_repo_files(model_path) | |
| tokenizer = None | |
| if "tokenizer_config.json" in files: | |
| tokenizer_path = hf_hub_download( | |
| repo_id=model_path, filename="tokenizer_config.json" | |
| ) | |
| with open(tokenizer_path) as f: | |
| tok_config = json.load(f)["data"]["tokenizer"] | |
| tokenizer_name = tok_config["path"] | |
| typ = tok_config["name"] | |
| logger.info(f"Loading tokenizer config: {tok_config}") | |
| # tokenizer = build_tokenizer(typ, tokenizer_name).tokenizer | |
| if tok_config["name"] == "tiktoken": | |
| tokenizer = TikTokenTokenizer.load(tokenizer_name) | |
| logger.info(f"Using TikToken tokenizer for {tokenizer_name}") | |
| elif tok_config["name"] == "tokenmonster": | |
| logger.info(f"Using TokenMonster tokenizer for {tokenizer_name}") | |
| tokenizer = TokenMonsterTokenizer.load(tokenizer_name) | |
| elif tok_config["name"] == "tekken": | |
| logger.info(f"Using Mistral tokenizer for {tokenizer_name}") | |
| tokenizer = MistralTokenizer.load(tokenizer_name) | |
| else: | |
| tokenizer = AutoTokenizer.from_pretrained( | |
| tokenizer_name, trust_remote_code=True, legacy=True | |
| ) | |
| else: | |
| tokenizer = AutoTokenizer.from_pretrained( | |
| tokenizer_name, trust_remote_code=True, legacy=True | |
| ) | |
| return tokenizer | |
| def load_model_and_tokenizer(model_path, progress_callback=None): | |
| """Load model and tokenizer with caching""" | |
| global model_cache | |
| # Decide caching strategy based on memory usage | |
| gpu_used, gpu_total, ram_used, ram_total = get_memory_usage() | |
| logger.info(f"Current GPU memory: {gpu_used:.1f}/{gpu_total:.1f} MB") | |
| logger.info(f"Current RAM: {ram_used:.1f}/{ram_total:.1f} MB") | |
| use_cache = ( | |
| not ( | |
| (gpu_total > 0 and gpu_used / gpu_total > 0.8) | |
| or (ram_used / ram_total > 0.8) | |
| ) | |
| or model_path in model_cache | |
| ) | |
| if not use_cache: | |
| logger.warning("High memory usage detected; disabling model cache.") | |
| if use_cache and model_path in model_cache: | |
| logger.info(f"Using cached model: {model_path}") | |
| if progress_callback: | |
| progress_callback(1.0, f"✅ Using cached model: {model_path}") | |
| return model_cache[model_path] | |
| try: | |
| if progress_callback: | |
| progress_callback(0.1, f"Starting to load model: {model_path}") | |
| # Check if CUDA is available | |
| device = "cuda" if torch.cuda.is_available() else "cpu" | |
| logger.info(f"Loading model: {model_path} using device: {device}") | |
| if progress_callback: | |
| progress_callback(0.2, f"Loading tokenizer for {model_path}...") | |
| # Load tokenizer | |
| tokenizer = setup_tokenizer(model_path) | |
| # # Add pad token if missing | |
| # if tokenizer.pad_token is None: | |
| # tokenizer.pad_token = tokenizer.eos_token | |
| if progress_callback: | |
| progress_callback( | |
| 0.5, | |
| f"π§ Loading model weights for {model_path}... (this may take a while)", | |
| ) | |
| # Load model with appropriate settings | |
| model = AutoModelForCausalLM.from_pretrained( | |
| model_path, | |
| torch_dtype=torch.float16 if device == "cuda" else torch.float32, | |
| device_map="auto" if device == "cuda" else None, | |
| trust_remote_code=True, | |
| low_cpu_mem_usage=True, | |
| ) | |
| model_info = {"tokenizer": tokenizer, "model": model, "device": device} | |
| if use_cache: | |
| model_cache[model_path] = model_info | |
| if progress_callback: | |
| progress_callback(1.0, f"✅ Successfully loaded model: {model_path}") | |
| return model_info | |
| except Exception as e: | |
| import code | |
| error_msg = f"β Error loading model {model_path}: {str(e)}" | |
| logger.error(error_msg) | |
| traceback.print_exc() | |
| # code.interact(local=dict(globals(), **locals())) | |
| if progress_callback: | |
| progress_callback(0.0, error_msg) | |
| return None | |
| def calculate_choice_likelihood( | |
| model, tokenizer, question, choice, normalization_method: str = "token-length" | |
| ): | |
| """Calculate the log-likelihood of the choice given the question prompt""" | |
| try: | |
| prompt = f"Question: {question}\nAnswer: " | |
| prompt = question | |
| full_text = f"{prompt} {choice}" | |
| # Tokenize full input (prompt + answer) | |
| input_ids = tokenizer.encode( | |
| full_text, return_tensors="pt", add_special_tokens=False | |
| ).to(model.device) | |
| prompt_ids = tokenizer.encode( | |
| prompt, return_tensors="pt", add_special_tokens=False | |
| ).to(model.device) | |
| if input_ids.size(1) <= prompt_ids.size(1): | |
| logger.warning("Answer tokens are empty after tokenization.") | |
| return float("-inf") | |
| with torch.no_grad(): | |
| outputs = model(input_ids) | |
| logits = outputs.logits | |
| # Get logits for the answer tokens only | |
| answer_len = input_ids.size(1) - prompt_ids.size(1) | |
| target_ids = input_ids[:, -answer_len:] | |
| logits = logits[ | |
| :, prompt_ids.size(1) - 1 : -1, : | |
| ] # shifted for next-token prediction | |
| log_probs = torch.nn.functional.log_softmax(logits, dim=-1) | |
| token_log_probs = log_probs.gather(2, target_ids.unsqueeze(-1)).squeeze(-1) | |
| total_log_prob = token_log_probs.sum().item() | |
| # char_len = answer_len | |
| normalization_term = 1.0 | |
| if normalization_method == "token-length": | |
| normalization_term = answer_len | |
| elif normalization_method == "byte-length": | |
| # decode the answer tokens (batch dimension stripped) and normalize by UTF-8 byte length | |
| decoded_text = tokenizer.decode(target_ids[0]).strip() | |
| byte_len = len(decoded_text.encode("utf-8")) | |
| normalization_term = byte_len | |
| total_log_prob /= normalization_term | |
| return total_log_prob | |
| except Exception as e: | |
| logger.error(f"Error calculating likelihood for choice '{choice}': {str(e)}") | |
| return float("-inf") | |
| def evaluate_model_on_questions( | |
| model_path, | |
| questions, | |
| progress_callback=None, | |
| normalization_method: str = "token-length", | |
| ): | |
| """Evaluate a single model on all questions using likelihood-based scoring""" | |
| model_info = load_model_and_tokenizer( | |
| model_path, progress_callback=progress_callback | |
| ) | |
| if model_info is None: | |
| return [{"error": f"Failed to load model {model_path}"}] * len(questions) | |
| results = [] | |
| model = model_info["model"] | |
| tokenizer = model_info["tokenizer"] | |
| for i, question in enumerate(questions): | |
| try: | |
| # Calculate likelihood for each choice | |
| choice_likelihoods = {} | |
| choice_probs = {} | |
| for choice in question["choices"]: | |
| likelihood = calculate_choice_likelihood( | |
| model, tokenizer, question["question"], choice, normalization_method | |
| ) | |
| choice_likelihoods[choice] = likelihood | |
| # Convert log probabilities to probabilities for confidence scoring | |
| max_log_prob = max(choice_likelihoods.values()) | |
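| # Subtract the max log-prob before exponentiating (softmax trick) to avoid numerical underflow. | |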
| choice_probs = { | |
| choice: torch.exp(torch.tensor(log_prob - max_log_prob)).item() | |
| for choice, log_prob in choice_likelihoods.items() | |
| } | |
| # Normalize probabilities | |
| total_prob = sum(choice_probs.values()) | |
| if total_prob > 0: | |
| choice_probs = { | |
| choice: prob / total_prob for choice, prob in choice_probs.items() | |
| } | |
| # Select the choice with highest likelihood | |
| predicted_choice = max( | |
| choice_likelihoods.keys(), key=lambda x: choice_likelihoods[x] | |
| ) | |
| is_correct = predicted_choice == question["correct_answer"] | |
| # Confidence is the probability of the selected choice | |
| confidence = choice_probs.get(predicted_choice, 0.0) | |
| results.append( | |
| { | |
| "question_idx": i, | |
| "predicted": predicted_choice, | |
| "correct": is_correct, | |
| "confidence": confidence, | |
| "choice_likelihoods": choice_likelihoods, | |
| "choice_probabilities": choice_probs, | |
| "raw_response": f"Likelihoods: {choice_likelihoods}", | |
| } | |
| ) | |
| if progress_callback: | |
| # Use remaining 80% for evaluation progress | |
| evaluation_progress = 0.2 + (i + 1) / len(questions) * 0.8 | |
| progress_callback( | |
| evaluation_progress, | |
| f"π Evaluating {model_path}: {i + 1}/{len(questions)} questions (likelihood-based)", | |
| ) | |
| except Exception as e: | |
| logger.error(f"Error evaluating question {i} with {model_path}: {str(e)}") | |
| results.append( | |
| { | |
| "question_idx": i, | |
| "predicted": question["choices"][0] if question["choices"] else "", | |
| "correct": False, | |
| "confidence": 0.0, | |
| "choice_likelihoods": {}, | |
| "choice_probabilities": {}, | |
| "raw_response": f"Error: {str(e)}", | |
| } | |
| ) | |
| return results | |
| def run_evaluation( | |
| dataset_text, | |
| selected_predefined, | |
| toksuite_selector, | |
| custom_models_text="", | |
| delimiter: str = "\t", | |
| save_summary=False, | |
| normalization_method: str = "token-length", | |
| prefix: str = "", | |
| progress=gr.Progress(), | |
| ): | |
| """Main evaluation function""" | |
| gc.collect()  # gc is imported at module level; reclaim memory from any previous run | |
| if not dataset_text.strip(): | |
| return ( | |
| "Please enter your dataset", | |
| "<p>No data provided</p>", | |
| None, | |
| None, | |
| gr.update(visible=True), | |
| "", # markdown_summary | |
| "", # csv_summary | |
| ) | |
| # Parse custom models | |
| custom_models = [] | |
| if custom_models_text is None: | |
| custom_models_text = "" | |
| if custom_models_text.strip(): | |
| custom_models = [ | |
| model.strip() | |
| for model in custom_models_text.strip().split("\n") | |
| if model.strip() | |
| ] | |
| # Combine selected models | |
| all_models = [] | |
| # Add predefined models | |
| all_models.extend(selected_predefined) | |
| all_models.extend( | |
| [f"toksuite/{model}" for model in toksuite_selector] | |
| ) | |
| all_models.extend(custom_models) | |
| if not all_models: | |
| return ( | |
| "Please select at least one model or add custom models", | |
| "<p>No models selected</p>", | |
| None, | |
| None, | |
| gr.update(visible=False), | |
| "", | |
| "", | |
| ) | |
| # Parse dataset | |
| questions, parse_error = parse_dataset(dataset_text, delimiter=delimiter) | |
| if parse_error: | |
| return ( | |
| f"Dataset parsing error:\n{parse_error}", | |
| "<p>Failed to parse dataset</p>", | |
| None, | |
| None, | |
| gr.update(visible=True), | |
| "", | |
| "", | |
| ) | |
| if not questions: | |
| return ( | |
| "No valid questions found in dataset", | |
| "<p>No questions to evaluate</p>", | |
| None, | |
| None, | |
| gr.update(visible=True), | |
| "", | |
| "", | |
| ) | |
| # Run evaluation | |
| progress(0, "Starting evaluation...") | |
| results = {} | |
| total_steps = len(all_models) * len(questions) | |
| current_step = 0 | |
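| # One progress "step" per (model, question) pair; used to scale the overall progress bar. | |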
| summary_md = create_summary_markdown({}) | |
| for model_idx, model_path in enumerate(all_models): | |
| display_name = model_path.split("/")[-1] if "/" in model_path else model_path | |
| try: | |
| def model_progress(p, msg): | |
| nonlocal current_step | |
| # count questions finished across all models so the overall bar does not reset per model | |
| current_step = model_idx * len(questions) + int(p * len(questions)) | |
| overall_progress = current_step / total_steps | |
| progress(overall_progress, msg) | |
| model_results = evaluate_model_on_questions( | |
| model_path, questions, model_progress, normalization_method | |
| ) | |
| results[display_name] = model_results | |
| except Exception as e: | |
| logger.error(f"Failed to evaluate {display_name}: {str(e)}") | |
| results[display_name] = [{"error": str(e)}] * len(questions) | |
| # Clean up GPU memory | |
| if torch.cuda.is_available(): | |
| torch.cuda.empty_cache() | |
| gc.collect() | |
| # Generate outputs | |
| summary_stats = generate_summary_stats(questions, results) | |
| summary_md = create_summary_markdown(summary_stats) | |
| detailed_html = create_detailed_results_html(questions, results) | |
| accuracy_chart = create_accuracy_chart(summary_stats) | |
| confidence_chart = create_confidence_chart(results) | |
| # Generate compact summaries | |
| markdown_summary = generate_compact_summary_markdown( | |
| questions, results, summary_stats | |
| ) | |
| # csv_summary = generate_csv_summary(questions, results, summary_stats) | |
| csv_summary = generate_excel_summary(questions, results, summary_stats) | |
| slurm_id = os.environ.get("SLURM_JOB_ID", "") | |
| if save_summary and slurm_id: | |
| os.makedirs("summaries", exist_ok=True)  # make sure the output directory exists | |
| file_name = f"summaries/{slurm_id}_summary_{time()}.md" | |
| if prefix: | |
| file_name = f"summaries/{slurm_id}_{prefix}_summary_{time()}.md" | |
| with open(file_name, "w") as f: | |
| f.write(markdown_summary) | |
| return ( | |
| summary_md, | |
| detailed_html, | |
| accuracy_chart, | |
| confidence_chart, | |
| gr.update(visible=True), | |
| markdown_summary, | |
| csv_summary, | |
| ) | |
| def generate_summary_stats(questions, results): | |
| """Generate summary statistics for all models""" | |
| summary = {} | |
| for model, model_results in results.items(): | |
| if not model_results or "error" in model_results[0]: | |
| summary[model] = { | |
| "accuracy": 0.0, | |
| "correct": 0, | |
| "total": len(questions), | |
| "avg_confidence": 0.0, | |
| "error": model_results[0].get("error", "Unknown error") | |
| if model_results | |
| else "No results", | |
| } | |
| continue | |
| correct_count = sum(1 for r in model_results if r.get("correct", False)) | |
| total_count = len(model_results) | |
| accuracy = correct_count / total_count if total_count > 0 else 0 | |
| # Calculate average confidence | |
| avg_confidence = ( | |
| sum(r.get("confidence", 0) for r in model_results) / total_count | |
| if total_count > 0 | |
| else 0 | |
| ) | |
| summary[model] = { | |
| "accuracy": accuracy, | |
| "correct": correct_count, | |
| "total": total_count, | |
| "avg_confidence": avg_confidence, | |
| } | |
| return summary | |
| def create_summary_markdown(summary_stats): | |
| """Create markdown summary of results""" | |
| if not summary_stats: | |
| return "No results available" | |
| # Sort by accuracy | |
| sorted_models = sorted( | |
| summary_stats.items(), key=lambda x: x[1]["accuracy"], reverse=True | |
| ) | |
| lines = ["## π Model Performance Summary\n"] | |
| for i, (model, stats) in enumerate(sorted_models): | |
| if "error" in stats: | |
| lines.append(f"β **{model}**: Error - {stats['error']}") | |
| continue | |
| accuracy_pct = stats["accuracy"] * 100 | |
| medal = "🥇" if i == 0 else "🥈" if i == 1 else "🥉" if i == 2 else f"{i + 1}." | |
| lines.append( | |
| f"{medal} **{model}**: {accuracy_pct:.1f}% " | |
| f"({stats['correct']}/{stats['total']} correct, " | |
| f"avg confidence: {stats['avg_confidence']:.2f})" | |
| ) | |
| return "\n".join(lines) | |
| # CSS for universal text handling | |
| universal_css = """ | |
| .universal-text textarea { | |
| direction: auto !important; | |
| text-align: start !important; | |
| unicode-bidi: plaintext !important; | |
| font-family: system-ui, -apple-system, BlinkMacSystemFont, 'Segoe UI', | |
| 'Roboto', 'Arial', 'Noto Sans', sans-serif !important; | |
| } | |
| /* Better handling for mixed content */ | |
| .universal-text textarea:focus { | |
| unicode-bidi: plaintext !important; | |
| } | |
| """ | |
| def create_detailed_results_html(questions, results): | |
| """Create detailed HTML results for each question""" | |
| if not questions or not results: | |
| return "<p>No detailed results available</p>" | |
| html_parts = [ | |
| """ | |
| <style> | |
| .question-card { | |
| background: white; | |
| border-radius: 12px; | |
| padding: 20px; | |
| margin-bottom: 20px; | |
| box-shadow: 0 2px 8px rgba(0,0,0,0.1); | |
| border-left: 5px solid #667eea; | |
| } | |
| .question-header { | |
| display: flex; | |
| justify-content: space-between; | |
| align-items: center; | |
| margin-bottom: 15px; | |
| } | |
| .question-number { | |
| background: linear-gradient(135deg, #667eea, #764ba2); | |
| color: white; | |
| padding: 6px 12px; | |
| border-radius: 20px; | |
| font-weight: bold; | |
| font-size: 14px; | |
| } | |
| .question-text { | |
| font-weight: 600; | |
| font-size: 16px; | |
| margin: 15px 0; | |
| color: #2d3748; | |
| } | |
| .choices { | |
| background: #f8fafc; | |
| border-radius: 8px; | |
| padding: 15px; | |
| margin: 10px 0; | |
| } | |
| .choice { | |
| margin: 8px 0; | |
| color: #4a5568; | |
| } | |
| .correct-answer { | |
| background: linear-gradient(135deg, #c6f6d5, #9ae6b4); | |
| border-left: 4px solid #48bb78; | |
| border-radius: 6px; | |
| padding: 12px; | |
| margin: 10px 0; | |
| font-weight: 600; | |
| color: #22543d; | |
| } | |
| .model-results { | |
| display: grid; | |
| grid-template-columns: repeat(auto-fit, minmax(220px, 1fr)); | |
| gap: 12px; | |
| margin-top: 15px; | |
| } | |
| .model-result { | |
| padding: 12px; | |
| border-radius: 8px; | |
| text-align: center; | |
| font-weight: 600; | |
| transition: transform 0.2s ease; | |
| } | |
| .model-result:hover { | |
| transform: scale(1.02); | |
| } | |
| .result-correct { | |
| background: linear-gradient(135deg, #c6f6d5, #9ae6b4); | |
| color: #22543d; | |
| border: 2px solid #48bb78; | |
| } | |
| .result-incorrect { | |
| background: linear-gradient(135deg, #fed7d7, #fca5a5); | |
| color: #742a2a; | |
| border: 2px solid #e53e3e; | |
| } | |
| .result-error { | |
| background: linear-gradient(135deg, #fbb6ce, #f687b3); | |
| color: #744210; | |
| border: 2px solid #d69e2e; | |
| } | |
| .raw-response { | |
| font-size: 10px; | |
| margin-top: 4px; | |
| opacity: 0.7; | |
| font-family: monospace; | |
| } | |
| </style> | |
| """ | |
| ] | |
| for q_idx, question in enumerate(questions): | |
| html_parts.append(f""" | |
| <div class="question-card"> | |
| <div class="question-header"> | |
| <span class="question-number">Q{q_idx + 1}</span> | |
| </div> | |
| <div class="question-text">{question["question"]}</div> | |
| <div class="choices"> | |
| <strong>Choices:</strong><br> | |
| {" | ".join(f"{chr(65 + i)}) {choice}" for i, choice in enumerate(question["choices"]))} | |
| </div> | |
| <div class="correct-answer"> | |
| <strong>✅ Correct Answer:</strong> {question["correct_answer"]} | |
| </div> | |
| <div class="model-results"> | |
| """) | |
| # Add results for each model | |
| for model, model_results in results.items(): | |
| if q_idx < len(model_results): | |
| result = model_results[q_idx] | |
| if "error" in result: | |
| html_parts.append(f""" | |
| <div class="model-result result-error"> | |
| <div>⚠️ {model}</div> | |
| <div style="font-size: 12px; margin-top: 4px;"> | |
| Error occurred | |
| </div> | |
| <div class="raw-response">{result.get("raw_response", "Unknown error")}</div> | |
| </div> | |
| """) | |
| else: | |
| result_class = ( | |
| "result-correct" | |
| if result.get("correct", False) | |
| else "result-incorrect" | |
| ) | |
| icon = "β " if result.get("correct", False) else "β" | |
| html_parts.append(f""" | |
| <div class="model-result {result_class}"> | |
| <div>{icon} {model}</div> | |
| <div style="font-size: 12px; margin-top: 4px;"> | |
| "{result.get("predicted", "No prediction")}" | |
| </div> | |
| <div class="raw-response">Raw: "{result.get("raw_response", "")}"</div> | |
| </div> | |
| """) | |
| html_parts.append(""" | |
| </div> | |
| </div> | |
| """) | |
| return "".join(html_parts) | |
| def create_accuracy_chart(summary_stats): | |
| """Create accuracy comparison chart""" | |
| if not summary_stats: | |
| return None | |
| models = [] | |
| accuracies = [] | |
| for model, stats in summary_stats.items(): | |
| if "error" not in stats: | |
| models.append(model) | |
| accuracies.append(stats["accuracy"] * 100) | |
| if not models: | |
| return None | |
| fig = go.Figure( | |
| data=[ | |
| go.Bar( | |
| x=models, | |
| y=accuracies, | |
| marker_color="lightblue", | |
| text=[f"{acc:.1f}%" for acc in accuracies], | |
| textposition="auto", | |
| ) | |
| ] | |
| ) | |
| fig.update_layout( | |
| title="Model Accuracy Comparison", | |
| xaxis_title="Models", | |
| yaxis_title="Accuracy (%)", | |
| template="plotly_white", | |
| showlegend=False, | |
| ) | |
| return fig | |
| def create_confidence_chart(results): | |
| """Create confidence distribution chart""" | |
| if not results: | |
| return None | |
| data = [] | |
| for model, model_results in results.items(): | |
| for result in model_results: | |
| if "error" not in result and "confidence" in result: | |
| data.append( | |
| { | |
| "Model": model, | |
| "Confidence": result["confidence"], | |
| "Correct": "Correct" | |
| if result.get("correct", False) | |
| else "Incorrect", | |
| } | |
| ) | |
| if not data: | |
| return None | |
| df = pd.DataFrame(data) | |
| fig = px.box( | |
| df, | |
| x="Model", | |
| y="Confidence", | |
| color="Correct", | |
| title="Confidence Distribution by Model and Correctness", | |
| template="plotly_white", | |
| ) | |
| return fig | |
| def generate_compact_summary_markdown(questions, results, summary_stats): | |
| """Generate a compact markdown summary table for copy-pasting""" | |
| if not summary_stats or not questions or not results: | |
| return "No data available for summary" | |
| lines = ["# Model Performance Summary\n"] | |
| # Accuracy Summary Table | |
| lines.append("## π Accuracy Summary\n") | |
| lines.append("| Rank | Model | Accuracy | Correct | Total | Avg Confidence |") | |
| lines.append("|------|-------|----------|---------|-------|----------------|") | |
| # Sort by accuracy | |
| sorted_models = sorted( | |
| summary_stats.items(), key=lambda x: x[1].get("accuracy", 0), reverse=True | |
| ) | |
| for i, (model, stats) in enumerate(sorted_models): | |
| if "error" in stats: | |
| lines.append(f"| {i + 1} | {model} | ERROR | - | - | - |") | |
| else: | |
| accuracy_pct = stats["accuracy"] * 100 | |
| lines.append( | |
| f"| {i + 1} | {model} | {accuracy_pct:.1f}% | {stats['correct']} | {stats['total']} | {stats['avg_confidence']:.3f} |" | |
| ) | |
| lines.append("\n") | |
| # Detailed Results Table | |
| lines.append("## π Detailed Question Results\n") | |
| # Get all model names for header | |
| model_names = list(results.keys()) | |
| header = "| Q# | Question | Correct Answer |" + "".join( | |
| [f" {model} |" for model in model_names] | |
| ) | |
| separator = ( | |
| "|" | |
| + "|".join(["-" * (len(col.strip()) + 1) for col in header.split("|")[1:-1]]) | |
| + "|" | |
| ) | |
| lines.append(header) | |
| lines.append(separator) | |
| for q_idx, question in enumerate(questions): | |
| # Truncate long questions for table readability | |
| question_text = question["question"] | |
| if len(question_text) > 50: | |
| question_text = question_text[:47] + "..." | |
| row = f"| {q_idx + 1} | {question_text} | {question['correct_answer']} |" | |
| for model in model_names: | |
| if q_idx < len(results[model]) and "error" not in results[model][q_idx]: | |
| result = results[model][q_idx] | |
| predicted = result.get("predicted", "N/A") | |
| is_correct = result.get("correct", False) | |
| confidence = result.get("confidence", 0) | |
| # Add emoji for visual feedback | |
| status_emoji = "β " if is_correct else "β" | |
| row += f" {status_emoji} {predicted} ({confidence:.2f}) |" | |
| else: | |
| row += " β οΈ ERROR |" | |
| lines.append(row) | |
| lines.append("\n") | |
| # Legend | |
| lines.append("### Legend") | |
| lines.append("- β = Correct answer") | |
| lines.append("- β = Incorrect answer") | |
| lines.append("- β οΈ = Error occurred") | |
| lines.append("- Numbers in parentheses = Confidence score") | |
| return "\n".join(lines) | |
| def generate_csv_summary(questions, results, summary_stats): | |
| """Generate CSV format summary""" | |
| # TODO: add CSV file download if necessary | |
| if not summary_stats or not questions or not results: | |
| return "No data available" | |
| lines = [] | |
| # Accuracy summary header | |
| lines.append("# ACCURACY SUMMARY") | |
| lines.append("Rank,Model,Accuracy_Percent,Correct,Total,Avg_Confidence") | |
| sorted_models = sorted( | |
| summary_stats.items(), key=lambda x: x[1].get("accuracy", 0), reverse=True | |
| ) | |
| for i, (model, stats) in enumerate(sorted_models): | |
| if "error" in stats: | |
| lines.append(f"{i + 1},{model},ERROR,-,-,-") | |
| else: | |
| accuracy_pct = stats["accuracy"] * 100 | |
| lines.append( | |
| f"{i + 1},{model},{accuracy_pct:.1f},{stats['correct']},{stats['total']},{stats['avg_confidence']:.3f}" | |
| ) | |
| lines.append("") | |
| lines.append("# DETAILED RESULTS") | |
| # Header for detailed results | |
| model_names = list(results.keys()) | |
| header = "Question_ID,Question,Correct_Answer," + ",".join( | |
| [ | |
| f"{model}_Predicted,{model}_Correct,{model}_Confidence" | |
| for model in model_names | |
| ] | |
| ) | |
| lines.append(header) | |
| # Detailed results | |
| for q_idx, question in enumerate(questions): | |
| row = f'{q_idx + 1},"{question["question"]}",{question["correct_answer"]}' | |
| for model in model_names: | |
| if q_idx < len(results[model]) and "error" not in results[model][q_idx]: | |
| result = results[model][q_idx] | |
| predicted = result.get("predicted", "N/A") | |
| is_correct = str(result.get("correct", False)) | |
| confidence = result.get("confidence", 0) | |
| row += f",{predicted},{is_correct},{confidence:.3f}" | |
| else: | |
| row += ",ERROR,FALSE,0" | |
| lines.append(row) | |
| return "\n".join(lines) | |
| def generate_excel_summary(questions, results, summary_stats): | |
| """Generate Excel format summary""" | |
| # TODO: add Excel file download if necessary | |
| if not summary_stats or not questions or not results: | |
| return "No data available" | |
| lines = [] | |
| # Header for detailed results | |
| model_names = list(results.keys()) | |
| header = "\t".join(model_names) | |
| lines.append(header) | |
| # Detailed results | |
| for q_idx, question in enumerate(questions): | |
| # row = f'{q_idx + 1},"{question["question"]}",{question["correct_answer"]}' | |
| row = "" | |
| for model in model_names: | |
| if q_idx < len(results[model]) and "error" not in results[model][q_idx]: | |
| result = results[model][q_idx] | |
| predicted = result.get("predicted", "N/A") | |
| is_correct = result.get("correct", False) | |
| confidence = result.get("confidence", 0) | |
| # row += is_correct | |
| status_emoji = "β " if is_correct else "β" | |
| row += f"{status_emoji} {predicted} ({confidence:.2f})\t" | |
| else: | |
| row += "ERROR\t" | |
| lines.append(row) | |
| return "\n".join(lines) | |
| # Sample datasets for quick testing | |
| SAMPLE_DATASETS = { | |
| "Custom (enter below)": "", | |
| "LP": """In which country is Llanfairpwllgwyngyllgogerychwyrndrobwllllantysiliogogogoch located? Wales Germany France Scotland | |
| In which country is Llanfair pwllgwyngyll located? Wales Germany France Scotland | |
| In which country is Llanfair PG located? Wales Germany France Scotland""", | |
| "Simple Math": """What is 2+2? 4 3 2 5 | |
| What is 5*3? 15 12 16 18 | |
| What is 10-7? 3 7 4 2 | |
| What is 8/2? 4 3 2 5""", | |
| "World Capitals": """What is the capital of France? Paris London Berlin Rome | |
| What is the capital of Japan? Tokyo Seoul Beijing Bangkok | |
| What is the capital of Brazil? BrasΓlia Rio de Janeiro SΓ£o Paulo Salvador | |
| What is the capital of Australia? Canberra Sydney Melbourne Perth""", | |
| "Science Quiz": """What is the chemical symbol for gold?,Au,Ag,Ca,K | |
| Which planet is closest to the Sun?,Mercury,Venus,Earth,Mars | |
| What is the speed of light?,299792458 m/s,300000000 m/s,2992458 m/s,299000000 m/s | |
| What gas do plants absorb from the atmosphere?,Carbon dioxide,Oxygen,Nitrogen,Hydrogen""", | |
| } | |
| # Custom CSS | |
| css = """ | |
| .gradio-container { | |
| font-family: 'Inter', sans-serif; | |
| } | |
| .sample-text { | |
| font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace; | |
| font-size: 12px; | |
| } | |
| .universal-text textarea { | |
| direction: ltr !important; | |
| text-align: left !important; | |
| unicode-bidi: bidi-override !important; | |
| font-family: 'Courier New', monospace !important; | |
| white-space: pre !important; | |
| } | |
| /* Reset direction after paste */ | |
| .universal-text textarea:focus { | |
| direction: auto !important; | |
| unicode-bidi: plaintext !important; | |
| } | |
| /* | |
| .universal-text textarea { | |
| direction: auto !important; | |
| text-align: start !important; | |
| unicode-bidi: plaintext !important; | |
| font-family: system-ui, -apple-system, BlinkMacSystemFont, 'Segoe UI', | |
| 'Roboto', 'Arial', 'Noto Sans', sans-serif !important; | |
| } | |
| Better handling for mixed content: | |
| .universal-text textarea:focus { | |
| unicode-bidi: plaintext !important; | |
| } | |
| */ | |
| """ | |
| # Create Gradio interface | |
| with gr.Blocks( | |
| title="π€ Model Performance Comparison", theme=gr.themes.Soft(), css=css | |
| ) as demo: | |
| gr.Markdown(""" | |
| # Model Performance Comparison Tool | |
| Compare LLM performance on multiple-choice questions using Hugging Face models. | |
| """) | |
| with gr.Row(): | |
| with gr.Column(scale=1): | |
| gr.Markdown(""" | |
| **Format**: Each line should have: `Question,Correct Answer,Choice1,Choice2,Choice3` (No header). Use commas or tabs as separators. | |
| **Features**: | |
| - Model evaluation using HuggingFace transformers | |
| - Support for custom models via HF model paths | |
| - Detailed question-by-question results | |
| - Performance charts and statistics | |
| """) | |
| device_str = "a single GPU" if torch.cuda.is_available() else "CPU" | |
| info_str = ( | |
| lambda: f""" | |
| **⚠️ Note**: | |
| - Larger models require more GPU memory; evaluation currently runs on {device_str} | |
| - First run will download models (may take time) | |
| - Models are cached for subsequent runs. Currently loaded models: {list(model_cache.keys()) if model_cache else "None"} | |
| """ | |
| ) | |
| info_md = gr.Markdown(info_str()) | |
| with gr.Column(scale=2): | |
| # Sample dataset selector | |
| sample_selector = gr.Dropdown( | |
| choices=list(SAMPLE_DATASETS.keys()), | |
| value="Custom (enter below)", | |
| label="Choose sample dataset or enter your own", | |
| interactive=True, | |
| ) | |
| # Dataset input | |
| dataset_input = gr.Textbox( | |
| label="Dataset (CSV/TSV format)", | |
| placeholder="""Enter your dataset here... | |
| Example format: | |
| (Question,Correct Answer,Choice1,Choice2,Choice3) | |
| What is 2+2?,4,3,2,5 | |
| What is the capital of France?,Paris,London,Berlin,Paris""", | |
| lines=8, | |
| max_lines=15, | |
| elem_classes=["universal-text"], | |
| ) | |
| with gr.Row(): | |
| with gr.Column(scale=1): | |
| normalization_method = gr.Radio( | |
| label="Normalization Method", | |
| choices=["token-length", "byte-length", "none"], | |
| value="token-length", | |
| info="Method to normalize log-likelihoods when scoring answers", | |
| ) | |
| with gr.Column(scale=1): | |
| # with gr.Accordion("Delimiter Options"): | |
| delimiter_selector = gr.Textbox( | |
| info="Delimiter used in the dataset (e.g., comma or tab)", | |
| label="Delimiter", | |
| placeholder="Enter a delimiter, e.g., , or \\t", | |
| value="\\t", # default | |
| lines=1, | |
| ) | |
| with gr.Column(scale=1): | |
| slurm_id = os.environ.get("SLURM_JOB_ID", "") | |
| if slurm_id: | |
| save_summary_checkbox = gr.Checkbox( | |
| label="Save summary markdown to file", | |
| info=f"If checked, the summary is saved to a markdown file prefixed with the SLURM job id ({slurm_id}).", | |
| value=False, | |
| ) | |
| prefix = gr.Textbox( | |
| label="Filename Prefix", | |
| placeholder=f"By default the file is saved as summaries/{slurm_id}_<prefix>_summary_<timestamp>.md", | |
| value="", | |
| interactive=True, | |
| visible=True, | |
| ) | |
| else: | |
| save_summary_checkbox = gr.Checkbox(value=True) | |
| prefix = gr.Textbox(visible=False, value="") | |
| with gr.Row(): | |
| with gr.Column(scale=2): | |
| toksuite_selector = gr.CheckboxGroup( | |
| label="Select toksuite models", | |
| choices=TOKSUITE_MODELS, | |
| value=TOKSUITE_MODELS, | |
| interactive=True, | |
| info="These models share the same initialization and training source but differ only in their tokenizers. See [r-three/toksuite](https://huggingface.co/collections/r-three/toksuite-68ae7490c151341d78423295) for details.", | |
| ) | |
| with gr.Column(scale=3): | |
| # Model selection | |
| with gr.Tabs(): | |
| with gr.TabItem("π€ Predefined Models"): | |
| industry_selector = gr.CheckboxGroup( | |
| # choices=PREDEFINED_MODELS, | |
| choices=INDUSTRY_MODELS, | |
| # value=[x for x in PREDEFINED_MODELS if "r-three" in x], | |
| value=[], | |
| label="Select from industry models", | |
| interactive=True, | |
| ) | |
| with gr.TabItem("β Custom Models"): | |
| custom_models_input = gr.Textbox( | |
| label="Custom HuggingFace Model Paths", | |
| placeholder="""Enter HuggingFace model paths (one per line): | |
| microsoft/DialoGPT-medium | |
| bigscience/bloom-560m""", | |
| lines=5, | |
| info="Add any HuggingFace model path. One model per line.", | |
| ) | |
| gr.Markdown(""" | |
| **Examples of valid model paths**: | |
| - `microsoft/DialoGPT-medium` | |
| - `bigscience/bloom-560m` | |
| - `facebook/opt-350m` | |
| - Your own fine-tuned models! | |
| """) | |
| with gr.Row(): | |
| with gr.Column(scale=1): | |
| pass | |
| with gr.Column(scale=2): | |
| # Evaluate button | |
| evaluate_btn = gr.Button("Run Evaluation", variant="primary", scale=1) | |
| with gr.Column(scale=1): | |
| pass | |
| # Results section | |
| with gr.Column(visible=True) as results_section: | |
| gr.Markdown("## π Results") | |
| summary_output = gr.Markdown( | |
| value="Results will appear here...", label="Performance Summary" | |
| ) | |
| with gr.Row(): | |
| accuracy_plot = gr.Plot(label="Accuracy Comparison") | |
| confidence_plot = gr.Plot(label="Confidence Analysis") | |
| # NEW: Export Section | |
| gr.Markdown("## π₯ Export Results") | |
| with gr.Row(): | |
| with gr.Column(): | |
| gr.Markdown("### π Markdown Table Format") | |
| markdown_summary_output = gr.Textbox( | |
| label="Markdown Summary (Copy & Paste Ready)", | |
| lines=15, | |
| max_lines=25, | |
| show_copy_button=True, | |
| interactive=False, | |
| value="", | |
| ) | |
| with gr.Column(): | |
| gr.Markdown("### π Excel Format (Tab separated)") | |
| # gr.Markdown("### π CSV Format") | |
| csv_summary_output = gr.Textbox( | |
| label="Excel Summary (Copy & Paste Ready, No additional formatting)", | |
| # label="CSV Summary (Copy & Paste Ready)", | |
| lines=15, | |
| max_lines=25, | |
| show_copy_button=True, | |
| interactive=False, | |
| value="", | |
| ) | |
| detailed_results = gr.HTML( | |
| value="<p>Detailed results will appear here...</p>", | |
| label="Detailed Question-by-Question Results", | |
| ) | |
| # Event handlers | |
| def update_dataset_from_sample(sample_name): | |
| if sample_name in SAMPLE_DATASETS: | |
| return gr.update(value=SAMPLE_DATASETS[sample_name]) | |
| return gr.update() | |
| sample_selector.change( | |
| fn=update_dataset_from_sample, inputs=sample_selector, outputs=dataset_input | |
| ) | |
| def update_info_md(): | |
| return gr.update(value=info_str()) | |
| evaluate_btn.click(fn=update_info_md, inputs=None, outputs=info_md) | |
| evaluate_btn.click( | |
| fn=run_evaluation, | |
| inputs=[ | |
| dataset_input, | |
| industry_selector, | |
| toksuite_selector, | |
| custom_models_input, | |
| delimiter_selector, | |
| save_summary_checkbox, | |
| normalization_method, | |
| prefix, | |
| ], | |
| outputs=[ | |
| summary_output, | |
| detailed_results, | |
| accuracy_plot, | |
| confidence_plot, | |
| results_section, | |
| markdown_summary_output, | |
| csv_summary_output, | |
| ], | |
| ) | |
| gr.Markdown(""" | |
| --- | |
| ### About Model Evaluation | |
| This tool loads and runs HuggingFace models for evaluation: | |
| **How it works**: | |
| - Downloads models from the HuggingFace Hub | |
| - Formats questions as prompts for each model | |
| - Runs likelihood-based evaluation over the answer choices | |
| **Performance Tips**: | |
| - Use smaller models for testing | |
| - Larger models (7B+) require significant GPU memory | |
| - Models are cached after the first load | |
| **Supported Models**: | |
| - Any HuggingFace autoregressive language model | |
| - Both instruction-tuned and base models | |
| - Custom fine-tuned models via HF paths | |
| """) | |
| if __name__ == "__main__": | |
| import argparse | |
| argparser = argparse.ArgumentParser() | |
| argparser.add_argument("--share", action="store_true", default=True) | |
| args = argparser.parse_args() | |
| demo.launch(share=args.share) | |