def load_df()

in method_comparison/processing.py [0:0]


def load_df(path, task_name, print_fn=print):
    """Load experiment results from ``path`` into a typed, de-duplicated DataFrame.

    The records returned by ``load_jsons(path)`` are passed through
    ``preprocess`` and turned into a ``pandas.DataFrame``. The frame is then
    cast to fixed column dtypes, ``created_at`` is parsed into datetimes, the
    timing columns are rounded to integers, the most relevant columns are moved
    to the front, and only the most recent run of each experiment is kept.

    Args:
        path: Passed unchanged to ``load_jsons``.
        task_name: Forwarded to ``preprocess`` as ``task_name``.
        print_fn: Callable forwarded to ``preprocess`` as ``print_fn``
            (defaults to the builtin ``print``).

    Returns:
        A ``pandas.DataFrame`` sorted by ``created_at`` in ascending order that
        contains one row per ``(experiment_name, model_id, peft_type)``
        combination, namely the row with the latest ``created_at``.
    """
    jsons = load_jsons(path)
    preprocessed = preprocess(jsons, task_name=task_name, print_fn=print_fn)
    # Target dtype of every column; ``astype`` expects each key listed here to
    # be an existing column of the frame.
    dtype_dict = {
        "task_name": "string",
        "experiment_name": "string",
        "model_id": "string",
        "train_config": "string",
        "peft_type": "string",
        "peft_config": "string",
        "cuda_memory_reserved_avg": int,
        "cuda_memory_max": int,
        "cuda_memory_reserved_99th": int,
        "total_time": float,
        "train_time": float,
        "file_size": int,
        "test_accuracy": float,
        "train_loss": float,
        "train_samples": int,
        "train_total_tokens": int,
        "peft_version": "string",
        "peft_branch": "string",
        "transformers_version": "string",
        "datasets_version": "string",
        "torch_version": "string",
        "bitsandbytes_version": "string",
        "package_info": "string",
        "system_info": "string",
        "created_at": "string",
    }
    df = pd.DataFrame(preprocessed)
    df = df.astype(dtype_dict)
    # Parsed into datetimes so that sorting below is chronological rather than
    # lexicographic.
    df["created_at"] = pd.to_datetime(df["created_at"])
    # round training time to nearest second
    df["train_time"] = df["train_time"].round().astype(int)
    df["total_time"] = df["total_time"].round().astype(int)

    # reorder columns for better viewing, pinned_columns arg in Gradio seems not to work correctly
    important_columns = [
        "experiment_name",
        "peft_type",
        "total_time",
        "train_time",
        "test_accuracy",
        "train_loss",
        "cuda_memory_max",
        "cuda_memory_reserved_99th",
        "cuda_memory_reserved_avg",
        "file_size",
        "created_at",
        "task_name",
    ]
    other_columns = [col for col in df if col not in important_columns]
    df = df[important_columns + other_columns]

    # We want to keep only the most recent run for each experiment. The
    # timestamp must therefore NOT be part of the duplicate key: if it were,
    # two runs of the same experiment would only count as duplicates when
    # their ``created_at`` values are identical, and older runs would never be
    # dropped.
    experiment_key = ["experiment_name", "model_id", "peft_type"]
    # Sorting ascending by ``created_at`` and keeping the last occurrence
    # retains the newest row of every experiment.
    df = df.sort_values("created_at").drop_duplicates(experiment_key, keep="last")
    return df