in lm_eval/loggers/evaluation_tracker.py [0:0]
def recreate_metadata_card(self) -> None:
    """
    Creates a metadata card for the evaluation results dataset and pushes it to the Hugging Face hub.

    The target repository is `self.details_repo` when `self.public_repo` is truthy, otherwise
    `self.details_repo_private`. Configs are keyed by the parent directory of each file: one
    "<parent>__<task>" config per sample file group and one "<parent>__results" config for the
    aggregated results. For each config, only the file whose datetime equals the newest datetime
    recorded for that config is listed: once under a split named after the sanitized datetime and
    once under the "latest" split.

    If the repository contains no results file, or no results file whose name contains the newest
    datetime found among the sample files, a warning is logged and no card is pushed.
    """
    eval_logger.info("Recreating metadata card")
    repo_id = self.details_repo if self.public_repo else self.details_repo_private

    files_in_repo = self.api.list_repo_files(repo_id=repo_id, repo_type="dataset")
    results_files = get_results_filenames(files_in_repo)
    sample_files = get_sample_results_filenames(files_in_repo)

    # Without any aggregated results there is nothing to summarise in the card.
    if not results_files:
        eval_logger.warning(
            f"No results files found in {repo_id}, skipping metadata card creation"
        )
        return

    # Build a dictionary to store the latest evaluation datetime for:
    # - Each tested model and its aggregated results
    # - Each task and sample results, if existing
    # i.e. {
    #     "org__model_name__gsm8k": "2021-09-01T12:00:00",
    #     "org__model_name__ifeval": "2021-09-01T12:00:00",
    #     "org__model_name__results": "2021-09-01T12:00:00"
    # }
    # Unknown keys default to the smallest ISO datetime, so any real datetime string wins the max().
    latest_task_results_datetime = defaultdict(lambda: datetime.min.isoformat())

    for file_path in sample_files:
        file_path = Path(file_path)
        filename = file_path.name
        model_name = file_path.parent
        task_name = get_file_task_name(filename)
        results_datetime = get_file_datetime(filename)
        task_name_sanitized = sanitize_task_name(task_name)
        # Results and sample results for the same model and task will have the same datetime
        samples_key = f"{model_name}__{task_name_sanitized}"
        results_key = f"{model_name}__results"
        latest_datetime = max(
            latest_task_results_datetime[samples_key],
            results_datetime,
        )
        latest_task_results_datetime[samples_key] = latest_datetime
        latest_task_results_datetime[results_key] = max(
            latest_task_results_datetime[results_key],
            latest_datetime,
        )

    # Create metadata card
    card_metadata = MetadataConfigs()

    def _sanitize_split_name(eval_date: str) -> str:
        # Replace every character that is neither a word character nor a dot with "_".
        return re.sub(r"[^\w\.]", "_", eval_date)

    def _register_if_latest(config_name: str, filename: str) -> None:
        # List `filename` under `config_name` only when it carries that config's newest datetime.
        eval_date_sanitized = _sanitize_split_name(get_file_datetime(filename))
        latest_sanitized = _sanitize_split_name(
            latest_task_results_datetime[config_name]
        )
        if eval_date_sanitized != latest_sanitized:
            return
        data_path = str(Path("**") / Path(filename).name)
        config = card_metadata.get(config_name, {"data_files": []})
        # Each entry gets its own list object so the two dicts never share mutable state.
        config["data_files"].append(
            {"split": eval_date_sanitized, "path": [data_path]}
        )
        # The newest file is additionally exposed under the "latest" split.
        config["data_files"].append({"split": "latest", "path": [data_path]})
        card_metadata[config_name] = config

    # Add the latest aggregated results to the metadata card for easy access
    for file_path in results_files:
        file_path = Path(file_path)
        _register_if_latest(f"{file_path.parent}__results", file_path.name)

    # Add the tasks details configs
    for file_path in sample_files:
        file_path = Path(file_path)
        task_name = get_file_task_name(file_path.name)
        task_name_sanitized = sanitize_task_name(task_name)
        _register_if_latest(
            f"{file_path.parent}__{task_name_sanitized}", file_path.name
        )

    # Get latest results and extract info to update metadata card examples
    latest_datetime = max(latest_task_results_datetime.values())
    latest_model_name = max(
        latest_task_results_datetime, key=lambda k: latest_task_results_datetime[k]
    )
    # The lookup below matches on the datetime with ":" replaced by "-".
    latest_file_stamp = latest_datetime.replace(":", "-")
    last_results_file = next(
        (f for f in results_files if latest_file_stamp in f), None
    )
    if last_results_file is None:
        # Happens e.g. when no sample files exist, so only the default datetime was recorded.
        eval_logger.warning(
            f"No results file matching the latest evaluation datetime ({latest_datetime}) "
            f"found in {repo_id}, skipping metadata card creation"
        )
        return

    last_results_file_path = hf_hub_url(
        repo_id=repo_id, filename=last_results_file, repo_type="dataset"
    )
    latest_results_file = load_dataset(
        "json", data_files=last_results_file_path, split="train"
    )
    results_dict = latest_results_file["results"][0]
    new_dictionary = {"all": results_dict}
    new_dictionary.update(results_dict)
    results_string = json.dumps(new_dictionary, indent=4)

    model_name = self.general_config_tracker.model_name
    is_hf_model = self.general_config_tracker.model_source == "hf"

    dataset_summary = (
        "Dataset automatically created during the evaluation run of model "
    )
    if is_hf_model:
        dataset_summary += f"[{model_name}](https://huggingface.co/{model_name})\n"
    else:
        dataset_summary += f"{model_name}\n"
    dataset_summary += (
        f"The dataset is composed of {len(card_metadata)-1} configuration(s), each one corresponding to one of the evaluated task.\n\n"
        f"The dataset has been created from {len(results_files)} run(s). Each run can be found as a specific split in each "
        'configuration, the split being named using the timestamp of the run. The "latest" split is always pointing to the latest results.\n\n'
        'An additional configuration "results" store all the aggregated results of the run.\n\n'
        "To load the details from a run, you can for instance do the following:\n"
    )
    if is_hf_model:
        dataset_summary += (
            "```python\nfrom datasets import load_dataset\n"
            f'data = load_dataset(\n\t"{repo_id}",\n\tname="{latest_model_name}",\n\tsplit="latest"\n)\n```\n\n'
        )
    # Link to the browsable "blob" page rather than the raw "resolve" download URL.
    last_results_blob_url = last_results_file_path.replace("/resolve/", "/blob/")
    dataset_summary += (
        "## Latest results\n\n"
        f"These are the [latest results from run {latest_datetime}]({last_results_blob_url}) "
        "(note that there might be results for other tasks in the repos if successive evals didn't cover the same tasks. "
        'You find each in the results and the "latest" split for each eval):\n\n'
        f"```python\n{results_string}\n```"
    )

    card_data = DatasetCardData(
        dataset_summary=dataset_summary,
        repo_url=f"https://huggingface.co/{model_name}",
        pretty_name=f"Evaluation run of {model_name}",
        leaderboard_url=self.leaderboard_url,
        point_of_contact=self.point_of_contact,
    )
    card_metadata.to_dataset_card_data(card_data)
    card = DatasetCard.from_template(
        card_data,
        pretty_name=card_data.pretty_name,
    )
    card.push_to_hub(repo_id, repo_type="dataset")