llm_perf/common/get_top_model_from_hub.py

import json
import os
from collections import defaultdict
from typing import Dict, List

import requests
from datasets import Dataset


def get_top_text_generation_models(
    n: int, sort: str = "downloads", direction: int = -1
) -> List[Dict]:
    """Fetch the top `n` text-generation models from the Hugging Face Hub API."""
    base_url = "https://huggingface.co/api/models"
    params = {
        "sort": sort,
        "direction": direction,
        "limit": n,
        "filter": "text-generation",
        "full": "false",
    }

    headers = {}
    huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")
    if huggingface_token:
        headers["Authorization"] = f"Bearer {huggingface_token}"

    response = requests.get(base_url, params=params, headers=headers)
    response.raise_for_status()  # Raise an exception for bad responses

    models = response.json()

    return [
        {
            "organization": model["id"].split("/")[0],
            "model_name": model["id"].split("/")[-1],
            "downloads": model.get("downloads", 0),
        }
        for model in models
        if "downloads" in model
    ]


def save_to_json(data: List[Dict], filename: str):
    """Write the model list to a local JSON file."""
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    print(f"Data saved to {filename}")


def compute_org_downloads(models: List[Dict]) -> Dict[str, int]:
    """Aggregate download counts per organization."""
    org_downloads = defaultdict(int)
    for model in models:
        org_downloads[model["organization"]] += model["downloads"]
    return dict(org_downloads)


def upload_to_hf_dataset(data: List[Dict], dataset_name: str):
    """Push the model list to a Hugging Face dataset repository."""
    dataset = Dataset.from_list(data)
    dataset.push_to_hub(dataset_name)
    print(f"Data uploaded to Hugging Face dataset: {dataset_name}")


def main():
    # Set up authentication (optional, but recommended)
    huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")
    if huggingface_token:
        os.environ["HUGGINGFACE_HUB_TOKEN"] = huggingface_token
    else:
        print(
            "Warning: HUGGINGFACE_TOKEN not found in environment variables. Running without authentication."
        )

    n = 100
    top_models = get_top_text_generation_models(n)

    print(f"\nTop {n} text generation models on Hugging Face Hub:")
    for i, model in enumerate(top_models, 1):
        print(
            f"{i}. {model['organization']}/{model['model_name']}: {model['downloads']:,} downloads"
        )

    # Upload to Hugging Face dataset
    dataset_name = "optimum-benchmark/top-text-generation-models"
    upload_to_hf_dataset(top_models, dataset_name)

    # Display top 10 organizations by downloads
    print("\nTop 10 organizations by total downloads:")
    org_downloads = compute_org_downloads(top_models)
    sorted_orgs = sorted(org_downloads.items(), key=lambda x: x[1], reverse=True)[:10]
    for i, (org, downloads) in enumerate(sorted_orgs, 1):
        print(f"{i}. {org}: {downloads:,} downloads")


if __name__ == "__main__":
    main()