# (Removed HuggingFace Spaces page-status residue captured with this file: "Spaces / Sleeping / Sleeping")
"""
MODEL METADATA EXTRACTOR

This script processes model evaluation output files (input_folder) produced by
the lm-eval-harness library, extracts the model identifiers, retrieves detailed
metadata from HuggingFace, and saves the information as structured JSON files
(output_folder).

Input:  directory containing .out files from lm-eval-harness
Output: directory with JSON files containing model metadata
"""

# Example input file format (lm-eval-harness output):
'''
hf (pretrained=swap-uniba/LLaMAntino-3-ANITA-8B-Inst-DPO-ITA,trust_remote_code=True), gen_kwargs: (None), limit: None, num_fewshot: 5, batch_size: 1
|         Tasks          |Version|Filter|n-shot| Metric |   |Value |   |Stderr|
|------------------------|------:|------|-----:|--------|---|-----:|---|------|
|evalita-mp              |      1|none  |      |acc     |↑  |0.5605|±  |0.0052|
...
Job completed
'''

# Example output JSON format:
'''
{
    "model": "swap-uniba/LLaMAntino-3-ANITA-8B-Inst-DPO-ITA",
    "base_model": "LlamaForCausalLM",
    "revision": "2b6e46e4c9d341dc8bf8350a167492c880116b66",
    "submitted_time": "2024-04-29 09:34:12+00:00",
    "num_params_billion": 8.030261248,
    "language": "en_it"
}
'''
import os
import re
import json

from huggingface_hub import HfApi

# Hugging Face API client. Configure a token here only if gated/private
# models need to be queried (anonymous access works for public models).
# TOKEN = "YOUR_HUGGINGFACE_API_TOKEN"
api = HfApi()

# Directory paths
# input_folder: directory containing the lm-eval-harness output files,
# including model accuracy metrics.
input_folder = "../evalita_llm_models_output/"
# output_folder: directory where the JSON files with model characteristics
# will be saved.
output_folder = "../evalita_llm_requests/"

# Create the output folder if it doesn't exist.
os.makedirs(output_folder, exist_ok=True)

# Matches the model id in the harness header line,
# e.g. "pretrained=swap-uniba/LLaMAntino-3-..." -> group(1) is the repo id.
model_pattern = re.compile(r"pretrained=([\w\-./]+)")
# Scan the input folder for lm-eval-harness output files (*.out), look up each
# referenced model on the HuggingFace Hub, and write one metadata JSON per
# model under output_folder/<org>/<model>.json.
for filename in os.listdir(input_folder):
    if not filename.endswith(".out"):
        continue
    file_path = os.path.join(input_folder, filename)

    # Read the whole harness log; the model id appears in the header line.
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()

    # Extract the model name; skip files without a "pretrained=..." header.
    match = model_pattern.search(content)
    if not match:
        continue
    model_name = match.group(1)
    print(f"Processing model: {model_name}")

    # Derive the output path FIRST so already-processed models are skipped
    # without spending a Hub API call on them (the original code queried the
    # API before checking whether the JSON file already existed).
    if "/" in model_name:
        dir_name, file_name = model_name.split("/", 1)
    else:
        dir_name, file_name = model_name, model_name  # no org prefix
    model_output_folder = os.path.join(output_folder, dir_name)
    output_file = os.path.join(model_output_folder, f"{file_name}.json")
    if os.path.exists(output_file):
        print(f"File {output_file} already exists. Skipping...")
        continue

    try:
        # Retrieve model information from HuggingFace.
        model_info = api.model_info(model_name)

        # Parameter count in billions. Prefer the BF16 bucket (original
        # behavior), but fall back to the safetensors total so models stored
        # in other dtypes (F16/F32) are not reported as null.
        # NOTE(review): fallback assumes SafeTensorsInfo exposes `total`
        # in this huggingface_hub version — guarded with getattr.
        num_params = None
        if model_info.safetensors:
            params = model_info.safetensors.parameters or {}
            if "BF16" in params:
                num_params = params["BF16"] / 1e9
            elif getattr(model_info.safetensors, "total", None):
                num_params = model_info.safetensors.total / 1e9

        # Languages from the model card. Cards may declare either a single
        # string ("en") or a list (["en", "it"]); normalize both forms so a
        # bare string is not join-ed character by character.
        language = ""
        if model_info.card_data:
            langs = model_info.card_data.get("language", []) or []
            if isinstance(langs, str):
                langs = [langs]
            language = "_".join(langs)

        # Build the dictionary with the required metadata.
        model_data = {
            "model": model_name,
            "base_model": model_info.config.get("architectures", [""])[0] if model_info.config else "",
            "revision": model_info.sha,
            "submitted_time": str(model_info.created_at),
            "num_params_billion": num_params,
            "language": language,
        }

        # Create the per-org folder only when there is something to write.
        os.makedirs(model_output_folder, exist_ok=True)
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(model_data, f, indent=4)
        print(f"Saved metadata for {model_name} in {output_file}")
    except Exception as e:
        # Best-effort batch job: log the failure and continue with the rest.
        print(f"Error retrieving info for {model_name}: {e}")

print("Process finished!")