# (Removed HuggingFace Spaces page-status residue captured with this file: "Spaces / Sleeping / Sleeping")
"""
MODEL METADATA EXTRACTOR

This script processes model evaluation output files (input_folder) produced by
the lm-eval-harness library, extracts the model identifiers, retrieves detailed
metadata from HuggingFace, and saves the information as structured JSON files
(output_folder).

Input:  directory containing .out files from lm-eval-harness
Output: directory with JSON files containing model metadata
"""

# Example input file format (lm-eval-harness output):
'''
hf (pretrained=swap-uniba/LLaMAntino-3-ANITA-8B-Inst-DPO-ITA,trust_remote_code=True), gen_kwargs: (None), limit: None, num_fewshot: 5, batch_size: 1
|         Tasks          |Version|Filter|n-shot| Metric |   |Value |   |Stderr|
|------------------------|------:|------|-----:|--------|---|-----:|---|------|
|evalita-mp              |      1|none  |      |acc     |↑  |0.5605|±  |0.0052|
...
Job completed
'''

# Example output JSON format:
'''
{
    "model": "swap-uniba/LLaMAntino-3-ANITA-8B-Inst-DPO-ITA",
    "base_model": "LlamaForCausalLM",
    "revision": "2b6e46e4c9d341dc8bf8350a167492c880116b66",
    "submitted_time": "2024-04-29 09:34:12+00:00",
    "num_params_billion": 8.030261248,
    "language": "en_it"
}
'''
import os
import re
import json

from huggingface_hub import HfApi

# Hugging Face API client. Configure a token here only if gated/private
# models need to be queried (anonymous access works for public models).
# TOKEN = "YOUR_HUGGINGFACE_API_TOKEN"
api = HfApi()

# Directory paths
# input_folder: directory containing the lm-eval-harness output files,
# including model accuracy metrics.
input_folder = "../evalita_llm_models_output/"
# output_folder: directory where the JSON files with model characteristics
# will be saved.
output_folder = "../evalita_llm_requests/"

# Create the output folder if it doesn't exist.
os.makedirs(output_folder, exist_ok=True)

# Matches the model id in the harness header line,
# e.g. "pretrained=swap-uniba/LLaMAntino-3-..." -> group(1) is the repo id.
model_pattern = re.compile(r"pretrained=([\w\-./]+)")
# Scan the input folder for lm-eval-harness output files (*.out), look up each
# referenced model on the HuggingFace Hub, and write one metadata JSON per
# model under output_folder/<org>/<model>.json.
for filename in os.listdir(input_folder):
    if not filename.endswith(".out"):
        continue
    file_path = os.path.join(input_folder, filename)

    # Read the whole harness log; the model id appears in the header line.
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()

    # Extract the model name; skip files without a "pretrained=..." header.
    match = model_pattern.search(content)
    if not match:
        continue
    model_name = match.group(1)
    print(f"Processing model: {model_name}")

    # Derive the output path FIRST so already-processed models are skipped
    # without spending a Hub API call on them (the original code queried the
    # API before checking whether the JSON file already existed).
    if "/" in model_name:
        dir_name, file_name = model_name.split("/", 1)
    else:
        dir_name, file_name = model_name, model_name  # no org prefix
    model_output_folder = os.path.join(output_folder, dir_name)
    output_file = os.path.join(model_output_folder, f"{file_name}.json")
    if os.path.exists(output_file):
        print(f"File {output_file} already exists. Skipping...")
        continue

    try:
        # Retrieve model information from HuggingFace.
        model_info = api.model_info(model_name)

        # Parameter count in billions. Prefer the BF16 bucket (original
        # behavior), but fall back to the safetensors total so models stored
        # in other dtypes (F16/F32) are not reported as null.
        # NOTE(review): fallback assumes SafeTensorsInfo exposes `total`
        # in this huggingface_hub version — guarded with getattr.
        num_params = None
        if model_info.safetensors:
            params = model_info.safetensors.parameters or {}
            if "BF16" in params:
                num_params = params["BF16"] / 1e9
            elif getattr(model_info.safetensors, "total", None):
                num_params = model_info.safetensors.total / 1e9

        # Languages from the model card. Cards may declare either a single
        # string ("en") or a list (["en", "it"]); normalize both forms so a
        # bare string is not join-ed character by character.
        language = ""
        if model_info.card_data:
            langs = model_info.card_data.get("language", []) or []
            if isinstance(langs, str):
                langs = [langs]
            language = "_".join(langs)

        # Build the dictionary with the required metadata.
        model_data = {
            "model": model_name,
            "base_model": model_info.config.get("architectures", [""])[0] if model_info.config else "",
            "revision": model_info.sha,
            "submitted_time": str(model_info.created_at),
            "num_params_billion": num_params,
            "language": language,
        }

        # Create the per-org folder only when there is something to write.
        os.makedirs(model_output_folder, exist_ok=True)
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(model_data, f, indent=4)
        print(f"Saved metadata for {model_name} in {output_file}")
    except Exception as e:
        # Best-effort batch job: log the failure and continue with the rest.
        print(f"Error retrieving info for {model_name}: {e}")

print("Process finished!")