Spaces:
Running
Running
| import argparse | |
| import huggingface_hub | |
def model_has_dataset(model):
    """Return True if *model* carries at least one ``dataset:`` tag.

    Args:
        model: Any object exposing a ``tags`` attribute that is either an
            iterable of strings or ``None`` (this script passes the items
            yielded by ``huggingface_hub.list_models``).

    Returns:
        bool: True when some tag starts with ``"dataset:"``, otherwise False.
        A model whose ``tags`` is ``None`` or empty is reported as False.
    """
    # ``tags`` is optional on Hub model-info objects; treat a missing value
    # as "no tags" instead of failing with a TypeError while iterating None.
    tags = model.tags or ()
    return any(tag.startswith("dataset:") for tag in tags)
if __name__ == "__main__":
    # Command-line entry point: list Hub models of one type that declare at
    # least one dataset tag, and dump a summary table to
    # ``models_<model_type>.<ext>`` in the current working directory.

    # Kept as a local import (as in the original script) so pandas is only
    # required when the file is executed, not when it is imported.
    import pandas as pd

    DATASET_PREFIX = "dataset:"

    parser = argparse.ArgumentParser(
        prog="Giskard Retriever",
        description="Retrieves HF models that are bound to datasets.",
    )
    parser.add_argument(
        "--model_type",
        default="text-classification",
        help="Hugging Face model types. default: text-classification",
        required=False,
    )
    parser.add_argument(
        "--output_format",
        default="parquet",
        # Reject unknown formats up front; previously an unrecognised value
        # fell through every branch and the script wrote nothing, silently.
        choices=("parquet", "csv", "json"),
        help="Format of the information retrieved. Default: parquet. Options: parquet, csv, json.",
    )
    args = parser.parse_args()

    MODEL_TYPE = args.model_type

    # Ask the Hub for models matching the type, most-liked first
    # (direction=-1), and lazily keep only those with a dataset tag.
    models_with_dataset = filter(
        model_has_dataset,
        huggingface_hub.list_models(filter=MODEL_TYPE, sort="likes", direction=-1),
    )

    # One row per model; "datasets" holds the tag values with the
    # "dataset:" prefix stripped.
    df = pd.DataFrame(
        [
            {
                "modelId": m.modelId,
                "modelType": MODEL_TYPE,
                "author": m.author,
                "downloads": m.downloads,
                "likes": m.likes,
                "datasets": [
                    t[len(DATASET_PREFIX):]
                    for t in m.tags
                    if t.startswith(DATASET_PREFIX)
                ],
            }
            for m in models_with_dataset
        ]
    )

    output_format = args.output_format
    if output_format == "parquet":
        df.to_parquet(f"models_{MODEL_TYPE}.parquet", index=False)
    elif output_format == "csv":
        df.to_csv(f"models_{MODEL_TYPE}.csv", index=False)
    elif output_format == "json":
        # ``index=False`` is rejected by pandas for the default
        # orient="columns" (ValueError), so the original call could never
        # succeed. orient="records" emits a list of row objects and omits
        # the index by construction.
        df.to_json(f"models_{MODEL_TYPE}.json", orient="records")