{
    "$schema": "http://json-schema.org/draft-07/schema#",
    "version": "0.0.1",
    "type": "object",
    "description": "Schema for storing and validating LLMs evaluation data, including model configuration, prompts, instances, Output, and evaluation metrics",
    "required": [
        "schema_version",
        "evaluation_id",
        "evaluation_source",
        "retrieved_timestamp",
        "source_data",
        "source_metadata",
        "model_info",
        "evaluation_results"
    ],
    "properties": {
        "schema_version": {
            "type": "string",
            "description": "Version of the schema used for this evaluation data"
        },
        "evaluation_id": {
            "type": "string",
            "description": "Unique identifier for this specific evaluation run. Use org_name/eval_name/retrieved_timestamp format"
        },
        "retrieved_timestamp": {
            "type": "string",
            "description": "Timestamp for when this record was created"
        },
        "source_data": {
            "type": "array",
            "description": "URLs for the source of the evaluation data",
            "items": {
                "type": "string"
            }
        },
        "evaluation_source": {
            "type": "object",
            "description": "Details about evaluation origin. There are options that evaluations come from leaderboards (e.g. Live Code Bench Pro) or evaluation platforms (e.g. lm-eval, inspect ai, HELM...).",
            "required": [
                "evaluation_source_name",
                "evaluation_source_type"
            ],
            "properties": {
                "evaluation_source_name": {
                    "type": "string",
                    "description": "Name of the source (e.g. title of the source leaderboard or name of the platform used for the evaluation."
                },
                "evaluation_source_type": {
                    "type": "string",
                    "enum": [
                        "leaderboard", 
                        "evaluation_platform"
                    ],
                    "description": "Type of evaluation source, e.g., leaderboard or evaluation platform"
                }
            }
        },
        "source_metadata": {
            "type": "object",
            "description": "Metadata about the source of the leaderboard data",
            "required": [
                "source_organization_name",
                "evaluator_relationship"
            ],
            "properties": {
                "source_organization_name": {
                    "type": "string",
                    "description": "Name of the organization that provides the data"
                },
                "source_organization_url": {
                    "type": "string",
                    "description": "URL for the organization that provides the data"
                },
                "source_organization_logo_url": {
                    "type": "string",
                    "description": "URL for the Logo for the organization that provides the data"
                },
                "evaluator_relationship": {
                    "type": "string",
                    "description": "Relationship between the evaluator and the model",
                    "enum": [
                        "first_party",
                        "third_party",
                        "collaborative",
                        "other"
                    ]
                }
            }
        },
        "model_info": {
            "type": "object",
            "description": "Complete model specification including basic information, technical configuration and inference settings",
            "required": [
                "name",
                "id"
            ],
            "properties": {
                "name": {
                    "type": "string",
                    "description": "Model name provided by evaluation source"
                },
                "id": {
                    "type": "string",
                    "description": "Model name standarized to HuggingFace format (e.g. meta-llama/Llama-3.1-8B-Instruct)"
                },
                "developer": {
                    "type": "string",
                    "description": "Name of organization that provides the model (e.g. 'OpenAI')"
                },
                "inference_platform": {
                    "type": "string",
                    "description": "Description of platform used to run the evaluations (e.g. local machine, Bedrock)"
                }
            }
        },
        "evaluation_results": {
            "type": "array",
            "description": "Array of evaluation results",
            "items": {
                "type": "object",
                "required": [
                    "evaluation_name",
                    "metric_config",
                    "score_details"
                ],
                "properties": {
                    "evaluation_name": {
                        "type": "string",
                        "description": "Name of the evaluation"
                    },
                    "evaluation_timestamp": {
                        "type": "string",
                        "description": "Timestamp for when the evaluations were run"
                    },
                    "metric_config": {
                        "type": "object",
                        "description": "Details about the metric",
                        "required": [
                            "lower_is_better"
                        ],
                        "properties": {
                            "evaluation_description": {
                                "type": "string",
                                "description": "Description of the evaluation"
                            },
                            "lower_is_better": {
                                "type": "boolean",
                                "description": "Whether a lower score is better"
                            },
                            "score_type": {
                                "type": "string",
                                "description": "Type of score",
                                "enum": [
                                    "binary",
                                    "continuous",
                                    "levels"
                                ]
                            },
                            "level_names": {
                                "type": "array",
                                "description": "Names of the score levels",
                                "items": {
                                    "type": "string"
                                }
                            },
                            "level_metadata": {
                                "type": "array",
                                "description": "Additional Description for each Score Level",
                                "items": {
                                    "type": "string"
                                }
                            },
                            "has_unknown_level": {
                                "type": "boolean",
                                "description": "Indicates whether there is an Unknown Level - if True, then a score of -1 will be treated as Unknown"
                            },
                            "min_score": {
                                "type": "number",
                                "description": "Minimum possible score for continuous metric"
                            },
                            "max_score": {
                                "type": "number",
                                "description": "Maximum possible score for continuous metric"
                            }
                        },
                        "if": {
                            "properties": {
                                "score_type": {
                                    "const": "levels"
                                }
                            }
                        },
                        "then": {
                            "required": [
                                "level_names",
                                "has_unknown_level"
                            ]
                        },
                        "else": {
                            "if": {
                                "properties": {
                                    "score_type": {
                                        "const": "continuous"
                                    }
                                }
                            },
                            "then": {
                                "required": [
                                    "min_score",
                                    "max_score"
                                ]
                            }
                        }
                    },
                    "score_details": {
                        "type": "object",
                        "description": "The score for the evaluation and related details",
                        "required": [
                            "score"
                        ],
                        "properties": {
                            "score": {
                                "type": "number",
                                "description": "The score for the evaluation"
                            },
                            "details": {
                                "type": "object",
                                "description": "Any additional details about the score",
                                "additionalProperties": true
                            }
                        }
                    },
                    "detailed_evaluation_results_url": {
                        "type": "string",
                        "description": "Link to detailed evaluation data"
                    },
                    "generation_config": {
                        "type": "object",
                        "generation_args": {
                                "type": "object",
                                "description": "Parameters used to generate results - properties may vary by model type",
                                "properties": {
                                    "temperature": {
                                        "type": [
                                            "null",
                                            "number"
                                        ],
                                        "description": "Sampling temperature"
                                    },
                                    "top_p": {
                                        "type": [
                                            "null",
                                            "number"
                                        ],
                                        "description": "Nucleus sampling parameter"
                                    },
                                    "top_k": {
                                        "type": [
                                            "null",
                                            "number"
                                        ],
                                        "description": "Top-k sampling parameter"
                                    },
                                    "max_tokens": {
                                        "type": "integer",
                                        "minimum": 1,
                                        "description": "Maximum number of tokens to generate"
                                    }
                                },
                                "additionalProperties": true
                        },
                        "additional_details": {
                            "type": "string",
                            "description": "Additional details about how the results for this metric were generated."
                        }
                    }
                }
            }
        }
    }
}
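
Example: a minimal record intended to validate against this schema. This is an illustrative sketch; every organization name, URL, timestamp, and score below is a hypothetical placeholder (the model id reuses the example given for model_info.id). Because score_type is "continuous", the conditional in metric_config also requires min_score and max_score.

{
    "schema_version": "0.0.1",
    "evaluation_id": "example-org/example-benchmark/2025-01-01T00:00:00Z",
    "evaluation_source": {
        "evaluation_source_name": "Example Leaderboard",
        "evaluation_source_type": "leaderboard"
    },
    "retrieved_timestamp": "2025-01-01T00:00:00Z",
    "source_data": [
        "https://example.com/leaderboard"
    ],
    "source_metadata": {
        "source_organization_name": "Example Org",
        "evaluator_relationship": "third_party"
    },
    "model_info": {
        "name": "Llama 3.1 8B Instruct",
        "id": "meta-llama/Llama-3.1-8B-Instruct",
        "developer": "Meta"
    },
    "evaluation_results": [
        {
            "evaluation_name": "example-benchmark",
            "metric_config": {
                "lower_is_better": false,
                "score_type": "continuous",
                "min_score": 0,
                "max_score": 1
            },
            "score_details": {
                "score": 0.68
            },
            "generation_config": {
                "generation_args": {
                    "temperature": 0.0,
                    "top_p": null,
                    "max_tokens": 1024
                }
            }
        }
    ]
}

A record using score_type "levels" would instead be required to carry level_names and has_unknown_level.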