in method_comparison/processing.py [0:0]
def load_df(path, task_name, print_fn=print):
    """Load result records into a typed DataFrame with one row per experiment.

    The records are obtained via ``load_jsons(path)`` and
    ``preprocess(..., task_name=task_name, print_fn=print_fn)``; both helpers
    are defined outside this function, so what exactly they read and filter
    should be checked there.

    Args:
        path: Forwarded unchanged to ``load_jsons``.
        task_name: Forwarded to ``preprocess`` as ``task_name``.
        print_fn: Forwarded to ``preprocess`` as ``print_fn``. Defaults to the
            built-in ``print``.

    Returns:
        A ``pandas.DataFrame`` where:

        * every column listed in the dtype table below is cast to that dtype,
        * ``created_at`` is converted to datetime,
        * ``train_time`` and ``total_time`` are rounded to whole numbers and
          stored as ``int``,
        * the "important" columns come first, followed by all remaining ones,
        * rows are sorted by ``created_at`` (ascending) and only the most
          recent row is kept for each
          ``(experiment_name, model_id, peft_type)`` combination.

    Raises:
        KeyError: Propagated from ``DataFrame.astype`` if a column named in
            the dtype table is missing from the preprocessed records.
    """
    jsons = load_jsons(path)
    preprocessed = preprocess(jsons, task_name=task_name, print_fn=print_fn)
    dtype_dict = {
        "task_name": "string",
        "experiment_name": "string",
        "model_id": "string",
        "train_config": "string",
        "peft_type": "string",
        "peft_config": "string",
        "cuda_memory_reserved_avg": int,
        "cuda_memory_max": int,
        "cuda_memory_reserved_99th": int,
        "total_time": float,
        "train_time": float,
        "file_size": int,
        "test_accuracy": float,
        "train_loss": float,
        "train_samples": int,
        "train_total_tokens": int,
        "peft_version": "string",
        "peft_branch": "string",
        "transformers_version": "string",
        "datasets_version": "string",
        "torch_version": "string",
        "bitsandbytes_version": "string",
        "package_info": "string",
        "system_info": "string",
        "created_at": "string",
    }
    df = pd.DataFrame(preprocessed)
    df = df.astype(dtype_dict)
    df["created_at"] = pd.to_datetime(df["created_at"])
    # round training time to nearest second
    df["train_time"] = df["train_time"].round().astype(int)
    df["total_time"] = df["total_time"].round().astype(int)

    # reorder columns for better viewing, pinned_columns arg in Gradio seems
    # not to work correctly
    important_columns = [
        "experiment_name",
        "peft_type",
        "total_time",
        "train_time",
        "test_accuracy",
        "train_loss",
        "cuda_memory_max",
        "cuda_memory_reserved_99th",
        "cuda_memory_reserved_avg",
        "file_size",
        "created_at",
        "task_name",
    ]
    other_columns = [col for col in df if col not in important_columns]
    df = df[important_columns + other_columns]

    # We want to keep only the most recent run for each experiment. The
    # timestamp must NOT be part of the duplicate key: if it were, two runs of
    # the same experiment at different times would never compare equal and
    # the older run would survive. Sorting by ``created_at`` first makes
    # ``keep="last"`` select the newest row of each group.
    experiment_key = ["experiment_name", "model_id", "peft_type"]
    df = df.sort_values("created_at").drop_duplicates(experiment_key, keep="last")
    return df