# src/dataset/leaderboard_dataset.py
from pydantic import BaseModel
from typing import Optional
from datasets import load_dataset, Dataset
import pandas as pd
from datetime import datetime
from loguru import logger
# Hub repository id of the leaderboard dataset (read via load_dataset, written via push_to_hub).
DATASET_NAME = "hf-ai-hardware/ai-hardware-leaderboard"
class LeaderboardData(BaseModel):
    """One leaderboard row: a single benchmark result, uniquely identified by
    (model_id, backend_type, machine) when merged into the hub dataset."""
    model_id: str  # model identifier; presumably a Hugging Face repo id — confirm with callers
    backend_type: str  # serving backend used for the benchmark
    hardware_type: str  # hardware category the benchmark ran on
    machine: str  # machine name; rows with the value "unknown" are excluded from upload
    can_serve_single_request: bool  # whether the model served at least one request successfully
    docker_command: Optional[str]  # command used to launch the benchmark container, if recorded
    benchmark_time: datetime  # when the benchmark was executed
def upload_data_to_hub(results: list[LeaderboardData]):
    """
    Upload benchmark results to the leaderboard dataset on the hub.

    Rows are keyed by (model_id, backend_type, machine): an existing row with
    the same key is overwritten in place, otherwise the row is appended. Rows
    whose machine is "unknown" are skipped. If nothing changed, no push is made.

    Args:
        results: Benchmark results to merge into the leaderboard.
    """
    # Convert results to list of dicts
    results_dict = [result.model_dump() for result in results]
    logger.debug(f"Results: {results_dict}")

    df = _load_existing_leaderboard()
    # Create DataFrame from new results
    new_df: pd.DataFrame = pd.DataFrame(results_dict)

    df, updated_entries, added_entries = _merge_results(df, new_df)

    commit_message = _build_commit_message(updated_entries, added_entries)
    # if there is no commit message, we don't need to upload as we didn't change anything
    if not commit_message:
        logger.info("No changes to upload")
        return

    # Convert back to Dataset and push to hub with a descriptive commit message
    dataset = Dataset.from_pandas(df)
    dataset.push_to_hub(DATASET_NAME, commit_message=commit_message)
    logger.info("Successfully uploaded results to the leaderboard")


def _load_existing_leaderboard() -> pd.DataFrame:
    """Download the current leaderboard as a DataFrame, or an empty frame if unavailable."""
    try:
        # BUG FIX: load_dataset() without `split` returns a DatasetDict, which has
        # no .to_pandas(); the resulting AttributeError was silently swallowed, so
        # the existing leaderboard was always discarded and overwritten. Selecting
        # the split yields a Dataset, which does support .to_pandas().
        leaderboard_dataset = load_dataset(DATASET_NAME, split="train")
        return leaderboard_dataset.to_pandas()  # type: ignore
    except Exception as e:
        # Dataset may genuinely not exist yet; log instead of failing silently.
        logger.warning(f"Could not load existing leaderboard, starting fresh: {e}")
        # Include all LeaderboardData fields so the schema matches uploaded rows.
        return pd.DataFrame(
            columns=[
                "model_id",
                "backend_type",
                "hardware_type",
                "machine",
                "can_serve_single_request",
                "docker_command",
                "benchmark_time",
            ]
        )


def _merge_results(
    df: pd.DataFrame, new_df: pd.DataFrame
) -> tuple[pd.DataFrame, list[str], list[str]]:
    """Merge new rows into df keyed by (model_id, backend_type, machine).

    Returns the merged frame plus human-readable descriptions of the updated
    and added entries (used to build the commit message).
    """
    updated_entries: list[str] = []
    added_entries: list[str] = []
    for _, row in new_df.iterrows():
        row_dict = row.to_dict()
        if row_dict["machine"] == "unknown":
            # do not upload unknown machines to the leaderboard
            continue
        mask = (
            (df["model_id"] == row_dict["model_id"])
            & (df["backend_type"] == row_dict["backend_type"])
            & (df["machine"] == row_dict["machine"])
        )
        entry_desc = f"{row_dict['model_id']} ({row_dict['backend_type']}, {row_dict['machine']})"
        if mask.any():
            # Update existing entry in place (values aligned by column name)
            df.loc[mask] = pd.Series(row_dict)
            updated_entries.append(entry_desc)
        else:
            # Add new entry
            df = pd.concat([df, pd.DataFrame([row_dict])], ignore_index=True)
            added_entries.append(entry_desc)
    return df, updated_entries, added_entries


def _build_commit_message(updated_entries: list[str], added_entries: list[str]) -> str:
    """Build the commit message; empty string means nothing changed."""
    commit_parts = []
    if updated_entries:
        commit_parts.append(f"Updated entries: {', '.join(updated_entries)}")
    if added_entries:
        commit_parts.append(f"Added entries: {', '.join(added_entries)}")
    return " | ".join(commit_parts)