in scripts/log_lighteval_to_wandb.py [0:0]
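"""Log LightEval checkpoint results to Weights & Biases.

Walks a directory of step-numbered checkpoint folders, reads each folder's
single results JSON, and logs per-task accuracies plus MMLU/ARC/overall
averages against the training step and consumed sample count.
"""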
import json
import os
from pathlib import Path

import wandb


def run(current_path: Path):
    def compute_avg_acc_of_a_benchmark(data, benchmark_prefix):
        # Average the "acc" metric over every task belonging to a benchmark,
        # i.e. every key containing "{benchmark_prefix}:".
        sum_acc, count = 0, 0
        for key, values in data.items():
            if f"{benchmark_prefix}:" in key:
                sum_acc += values["acc"]
                count += 1
        return sum_acc / count if count else 0
    def compute_avg_acc_of_all_tasks(data):
        # Average the "acc" metric over all evaluated tasks.
        sum_acc, count = 0, 0
        for _, values in data.items():
            sum_acc += values["acc"]
            count += 1
        return sum_acc / count if count else 0
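
    # Illustrative shape of the ``results`` mapping the helpers above expect
    # (hypothetical task names and values, not taken from a real run):
    #   {
    #       "mmlu:abstract_algebra": {"acc": 0.31, "acc_norm": 0.33, "acc_stderr": 0.04, ...},
    #       "arc:challenge":         {"acc": 0.45, "acc_norm": 0.48, "acc_stderr": 0.05, ...},
    #   }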
    # Checkpoint folders are named by training step, so sort them numerically.
    list_checkpoints = os.listdir(current_path)
    sorted_list_checkpoints = sorted(list_checkpoints, key=int)
    for item in sorted_list_checkpoints:
        item_path = os.path.join(current_path, item)
        if os.path.isdir(item_path):
            json_files = [f for f in os.listdir(item_path) if f.endswith(".json")]
            if len(json_files) == 1:
                json_file_path = os.path.join(item_path, json_files[0])
                with open(json_file_path, "r") as file:
                    eval_data = json.load(file)

                iteration_step = eval_data["config_general"]["config"]["general"]["step"]
                consumed_train_samples = eval_data["config_general"]["config"]["general"]["consumed_train_samples"]

                # Collect each task's accuracy plus per-benchmark and overall averages.
                logging_results = {}
                for name, data in eval_data["results"].items():
                    logging_results[f"{name}_acc"] = data["acc"]
                logging_results["mmlu:average_acc"] = compute_avg_acc_of_a_benchmark(eval_data["results"], "mmlu")
                logging_results["arc:average_acc"] = compute_avg_acc_of_a_benchmark(eval_data["results"], "arc")
                logging_results["all:average_acc"] = compute_avg_acc_of_all_tasks(eval_data["results"])

                wandb.log(
                    {
                        **logging_results,
                        "iteration_step": iteration_step,
                        "consumed_train_samples": consumed_train_samples,
                    }
                )
            elif len(json_files) > 1:
                print(f"More than one JSON file found in {item_path}. Skipping.")
            else:
                print(f"No JSON file found in {item_path}.")
        print(f"Checkpoint {item} is done.\n")