"""
HuggingFace Operations: Upload data, create PRs, validate schemas.
"""
import json
from pathlib import Path

import pandas as pd
from huggingface_hub import HfApi
from jsonschema import ValidationError, validate

# Load schema once at module level
SCHEMA_PATH = Path(__file__).parent / "eval.schema.json"
with open(SCHEMA_PATH, "r") as f:
    EVAL_SCHEMA = json.load(f)


def validate_json_against_schema(json_data):
    """
    Validate a JSON object against eval.schema.json.

    Args:
        json_data: Dict containing the evaluation data

    Returns:
        (bool, str): (is_valid, error_message)
    """
    try:
        validate(instance=json_data, schema=EVAL_SCHEMA)
        return True, "Schema validation passed"
    except ValidationError as e:
        # Extract the most relevant error message
        error_path = " → ".join(str(p) for p in e.path) if e.path else "root"
        return False, f"❌ Schema validation failed at '{error_path}': {e.message}"
    except Exception as e:
        return False, f"❌ Validation error: {str(e)}"
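
# Illustrative use of validate_json_against_schema (the payload below is a
# hypothetical stub; the fields that are actually required are whatever
# eval.schema.json declares):
#
#     ok, msg = validate_json_against_schema({"schema_version": "0.0.1"})
#     if not ok:
#         print(msg)  # message pinpoints the failing path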


def upload_to_hf_dataset(parquet_file, split_name, repo_id):
    """
    Upload a parquet file as a new split to the HF dataset.

    Args:
        parquet_file: Path to parquet file
        split_name: Name of the split (leaderboard name)
        repo_id: HuggingFace dataset repository ID
    """
    # TODO: confirm final upload behavior. Minimal sketch (assumption): mirror
    # the data/<name>.parquet layout used by create_pr_for_new_leaderboard below.
    HfApi().upload_file(
        path_or_fileobj=parquet_file,
        path_in_repo=f"data/{split_name}.parquet",
        repo_id=repo_id,
        repo_type="dataset",
        commit_message=f"Add new leaderboard split: {split_name}",
    )


def check_hf_authentication():
    """
    Check if user is authenticated with HuggingFace.

    Returns:
        (bool, str): (is_authenticated, username or error_message)
    """
    try:
        api = HfApi()
        user_info = api.whoami()
        return True, user_info["name"]
    except Exception:
        return False, "Not authenticated. Run: huggingface-cli login"


def check_duplicate_pr_exists(leaderboard_name, repo_id):
    """
    Check if a PR already exists for this leaderboard.

    Args:
        leaderboard_name: Name of the leaderboard
        repo_id: HuggingFace dataset repository ID

    Returns:
        (bool, str or None): (exists, pr_url if exists)
    """
    try:
        api = HfApi()
        discussions = api.get_repo_discussions(repo_id=repo_id, repo_type="dataset")
        # Check for open PRs with matching title
        pr_title_pattern = f"add new leaderboard: {leaderboard_name.lower()}"
        for discussion in discussions:
            if discussion.is_pull_request and discussion.status == "open":
                if pr_title_pattern in discussion.title.lower():
                    pr_url = f"https://huggingface.co/datasets/{repo_id}/discussions/{discussion.num}"
                    return True, pr_url
        return False, None
    except Exception as e:
        # If we can't check, assume no duplicate (fail open)
        print(f"Warning: Could not check for duplicate PRs: {e}")
        return False, None


def create_pr_for_new_leaderboard(leaderboard_name, parquet_file, repo_id):
    """
    Create a pull request to add a new leaderboard split.

    Args:
        leaderboard_name: Name of the new leaderboard
        parquet_file: Path to parquet file
        repo_id: HuggingFace dataset repository ID

    Returns:
        (success, pr_url or error_message)
    """
    # 1. Check authentication
    is_auth, auth_result = check_hf_authentication()
    if not is_auth:
        return False, f"❌ {auth_result}"

    # 2. Check for duplicate PR
    has_duplicate, duplicate_url = check_duplicate_pr_exists(leaderboard_name, repo_id)
    if has_duplicate:
        return False, f"⚠️ PR already exists: {duplicate_url}"

    # 3. Validate parquet file exists and has data
    parquet_path = Path(parquet_file)
    if not parquet_path.exists():
        return False, "❌ Parquet file not found"
    df = pd.read_parquet(parquet_file)
    if len(df) == 0:
        return False, "❌ Parquet file is empty"

    # 4. Create PR
    try:
        api = HfApi()
        # Upload the parquet file to the branch
        commit_message = f"Add new leaderboard: {leaderboard_name}"
        # Upload file and create PR
        commit_info = api.upload_file(
            path_or_fileobj=parquet_file,
            path_in_repo=f"data/{leaderboard_name}.parquet",
            repo_id=repo_id,
            repo_type="dataset",
            commit_message=commit_message,
            create_pr=True,
        )
        # Extract PR URL from commit info
        pr_url = (
            commit_info.pr_url
            if hasattr(commit_info, "pr_url")
            else f"https://huggingface.co/datasets/{repo_id}/discussions"
        )
        return True, f"PR created ({len(df)} rows): {pr_url}"
    except Exception as e:
        return False, f"❌ Failed to create PR: {str(e)}"
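
# Note (assumption about huggingface_hub behaviour): with create_pr=True,
# upload_file titles the auto-created PR after the commit message, so the
# "Add new leaderboard: ..." text above is also what check_duplicate_pr_exists()
# matches case-insensitively. If the commit message format changes, update the
# duplicate check alongside it.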


def validate_schema(parquet_file):
    """
    Validate that a parquet file matches the expected schema.

    Args:
        parquet_file: Path to parquet file to validate

    Returns:
        (bool, str): (is_valid, error_message)
    """
    try:
        df = pd.read_parquet(parquet_file)
        # Required columns
        required_cols = [
            "_leaderboard",
            "_developer",
            "_model",
            "_uuid",
            "schema_version",
            "evaluation_id",
            "retrieved_timestamp",
            "source_data",
            "evaluation_source_name",
            "evaluation_source_type",
            "source_organization_name",
            "evaluator_relationship",
            "model_name",
            "model_id",
            "model_developer",
            "evaluation_results",
        ]
        missing = [col for col in required_cols if col not in df.columns]
        if missing:
            return False, f"Missing required columns: {', '.join(missing)}"
        # Check data types (all should be strings)
        for col in df.columns:
            if df[col].dtype not in ["object", "string"]:
                return (
                    False,
                    f"Column '{col}' has wrong type: {df[col].dtype} (expected string)",
                )
        return True, "Schema validation passed"
    except Exception as e:
        return False, f"Validation error: {str(e)}"


def export_to_json(parquet_file, output_dir):
    """
    Export parquet data back to JSON files.

    Uses the parquet_to_folder function from json_to_parquet.py

    Args:
        parquet_file: Path to parquet file
        output_dir: Directory to write JSON files to
    """
    from json_to_parquet import parquet_to_folder

    parquet_to_folder(parquet_file, output_dir)
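

if __name__ == "__main__":
    # Hedged end-to-end example of how these helpers fit together. The parquet
    # path and repo id below are illustrative placeholders, not values used by
    # the Space itself.
    example_parquet = "data/example_leaderboard.parquet"  # hypothetical path
    example_repo = "username/every-eval-ever"             # hypothetical repo id

    authed, who_or_err = check_hf_authentication()
    print(f"Authenticated: {authed} ({who_or_err})")

    valid, detail = validate_schema(example_parquet)
    print(detail)

    if authed and valid:
        created, result = create_pr_for_new_leaderboard(
            "example_leaderboard", example_parquet, example_repo
        )
        print(result)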