"""
HuggingFace Operations: Upload data, create PRs, validate schemas.
"""
import json
from pathlib import Path

import pandas as pd
from huggingface_hub import HfApi
from jsonschema import ValidationError, validate

# Load schema once at module level
SCHEMA_PATH = Path(__file__).parent / "eval.schema.json"
with open(SCHEMA_PATH, "r") as f:
    EVAL_SCHEMA = json.load(f)


def validate_json_against_schema(json_data):
    """
    Validate a JSON object against eval.schema.json.

    Args:
        json_data: Dict containing the evaluation data

    Returns:
        (bool, str): (is_valid, error_message)
    """
    try:
        validate(instance=json_data, schema=EVAL_SCHEMA)
        return True, "Schema validation passed"
    except ValidationError as e:
        # Extract the most relevant error message
        error_path = " → ".join(str(p) for p in e.path) if e.path else "root"
        return False, f"❌ Schema validation failed at '{error_path}': {e.message}"
    except Exception as e:
        return False, f"❌ Validation error: {str(e)}"
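
# Illustrative use of validate_json_against_schema (the payload below is a
# hypothetical stub; the fields that are actually required are whatever
# eval.schema.json declares):
#
#     ok, msg = validate_json_against_schema({"schema_version": "0.0.1"})
#     if not ok:
#         print(msg)  # message pinpoints the failing path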


def upload_to_hf_dataset(parquet_file, split_name, repo_id):
    """
    Upload a parquet file as a new split to the HF dataset.

    Args:
        parquet_file: Path to parquet file
        split_name: Name of the split (leaderboard name)
        repo_id: HuggingFace dataset repository ID
    """
    # TODO: confirm final upload behavior. Minimal sketch (assumption): mirror
    # the data/<name>.parquet layout used by create_pr_for_new_leaderboard below.
    HfApi().upload_file(
        path_or_fileobj=parquet_file,
        path_in_repo=f"data/{split_name}.parquet",
        repo_id=repo_id,
        repo_type="dataset",
        commit_message=f"Add new leaderboard split: {split_name}",
    )


def check_hf_authentication():
    """
    Check if user is authenticated with HuggingFace.

    Returns:
        (bool, str): (is_authenticated, username or error_message)
    """
    try:
        api = HfApi()
        user_info = api.whoami()
        return True, user_info["name"]
    except Exception:
        return False, "Not authenticated. Run: huggingface-cli login"


def check_duplicate_pr_exists(leaderboard_name, repo_id):
    """
    Check if a PR already exists for this leaderboard.

    Args:
        leaderboard_name: Name of the leaderboard
        repo_id: HuggingFace dataset repository ID

    Returns:
        (bool, str or None): (exists, pr_url if exists)
    """
    try:
        api = HfApi()
        discussions = api.get_repo_discussions(repo_id=repo_id, repo_type="dataset")
        # Check for open PRs with matching title
        pr_title_pattern = f"add new leaderboard: {leaderboard_name.lower()}"
        for discussion in discussions:
            if discussion.is_pull_request and discussion.status == "open":
                if pr_title_pattern in discussion.title.lower():
                    pr_url = f"https://huggingface.co/datasets/{repo_id}/discussions/{discussion.num}"
                    return True, pr_url
        return False, None
    except Exception as e:
        # If we can't check, assume no duplicate (fail open)
        print(f"Warning: Could not check for duplicate PRs: {e}")
        return False, None


def create_pr_for_new_leaderboard(leaderboard_name, parquet_file, repo_id):
    """
    Create a pull request to add a new leaderboard split.

    Args:
        leaderboard_name: Name of the new leaderboard
        parquet_file: Path to parquet file
        repo_id: HuggingFace dataset repository ID

    Returns:
        (success, pr_url or error_message)
    """
    # 1. Check authentication
    is_auth, auth_result = check_hf_authentication()
    if not is_auth:
        return False, f"❌ {auth_result}"

    # 2. Check for duplicate PR
    has_duplicate, duplicate_url = check_duplicate_pr_exists(leaderboard_name, repo_id)
    if has_duplicate:
        return False, f"⚠️ PR already exists: {duplicate_url}"

    # 3. Validate parquet file exists and has data
    parquet_path = Path(parquet_file)
    if not parquet_path.exists():
        return False, "❌ Parquet file not found"
    df = pd.read_parquet(parquet_file)
    if len(df) == 0:
        return False, "❌ Parquet file is empty"

    # 4. Create PR
    try:
        api = HfApi()
        # Upload the parquet file to the branch
        commit_message = f"Add new leaderboard: {leaderboard_name}"
        # Upload file and create PR
        commit_info = api.upload_file(
            path_or_fileobj=parquet_file,
            path_in_repo=f"data/{leaderboard_name}.parquet",
            repo_id=repo_id,
            repo_type="dataset",
            commit_message=commit_message,
            create_pr=True,
        )
        # Extract PR URL from commit info
        pr_url = (
            commit_info.pr_url
            if hasattr(commit_info, "pr_url")
            else f"https://huggingface.co/datasets/{repo_id}/discussions"
        )
        return True, f"PR created ({len(df)} rows): {pr_url}"
    except Exception as e:
        return False, f"❌ Failed to create PR: {str(e)}"
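
# Note (assumption about huggingface_hub behaviour): with create_pr=True,
# upload_file titles the auto-created PR after the commit message, so the
# "Add new leaderboard: ..." text above is also what check_duplicate_pr_exists()
# matches case-insensitively. If the commit message format changes, update the
# duplicate check alongside it.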


def validate_schema(parquet_file):
    """
    Validate that a parquet file matches the expected schema.

    Args:
        parquet_file: Path to parquet file to validate

    Returns:
        (bool, str): (is_valid, error_message)
    """
    try:
        df = pd.read_parquet(parquet_file)
        # Required columns
        required_cols = [
            "_leaderboard",
            "_developer",
            "_model",
            "_uuid",
            "schema_version",
            "evaluation_id",
            "retrieved_timestamp",
            "source_data",
            "evaluation_source_name",
            "evaluation_source_type",
            "source_organization_name",
            "evaluator_relationship",
            "model_name",
            "model_id",
            "model_developer",
            "evaluation_results",
        ]
        missing = [col for col in required_cols if col not in df.columns]
        if missing:
            return False, f"Missing required columns: {', '.join(missing)}"
        # Check data types (all should be strings)
        for col in df.columns:
            if df[col].dtype not in ["object", "string"]:
                return (
                    False,
                    f"Column '{col}' has wrong type: {df[col].dtype} (expected string)",
                )
        return True, "Schema validation passed"
    except Exception as e:
        return False, f"Validation error: {str(e)}"


def export_to_json(parquet_file, output_dir):
    """
    Export parquet data back to JSON files.

    Uses the parquet_to_folder function from json_to_parquet.py

    Args:
        parquet_file: Path to parquet file
        output_dir: Directory to write JSON files to
    """
    from json_to_parquet import parquet_to_folder

    parquet_to_folder(parquet_file, output_dir)
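

if __name__ == "__main__":
    # Hedged end-to-end example of how these helpers fit together. The parquet
    # path and repo id below are illustrative placeholders, not values used by
    # the Space itself.
    example_parquet = "data/example_leaderboard.parquet"  # hypothetical path
    example_repo = "username/every-eval-ever"             # hypothetical repo id

    authed, who_or_err = check_hf_authentication()
    print(f"Authenticated: {authed} ({who_or_err})")

    valid, detail = validate_schema(example_parquet)
    print(detail)

    if authed and valid:
        created, result = create_pr_for_new_leaderboard(
            "example_leaderboard", example_parquet, example_repo
        )
        print(result)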