def preprocess()

in method_comparison/processing.py [0:0]


def preprocess(rows, task_name: str, print_fn=print):
    """Flatten raw benchmark rows into one summary dict per successful run.

    Each row must be a mapping with the keys ``"run_info"``, ``"train_info"``
    and ``"meta_info"``. Rows whose ``train_info["status"]`` is not
    ``"success"`` are skipped; the reported training metrics are taken from
    the last entry of ``train_info["metrics"]``.

    Args:
        rows: Iterable of row mappings. Any iterable works (including
            generators); it is consumed exactly once.
        task_name: Value stored under ``"task_name"`` in every result.
        print_fn: Callable that receives the one-line summary message when at
            least one row was skipped. Defaults to ``print``.

    Returns:
        A list of dicts, one per successful row, in input order.

    Raises:
        KeyError: If a row is missing one of the expected keys.
        IndexError: If a successful row has an empty ``"metrics"`` list.
    """
    results = []
    skipped = 0
    # Count rows while iterating instead of calling len(rows) afterwards, so
    # that one-shot iterables (generators, iterators) are supported as well.
    total = 0
    for total, row in enumerate(rows, start=1):
        run_info = row["run_info"]
        train_info = row["train_info"]
        meta_info = row["meta_info"]

        # Guard clause: nothing is extracted from unsuccessful runs.
        if train_info["status"] != "success":
            skipped += 1
            continue

        peft_config = run_info["peft_config"]
        # A falsy (missing/empty) PEFT config is labelled as full fine-tuning.
        peft_type = peft_config["peft_type"] if peft_config else "full-finetuning"

        # Only the last recorded metrics entry is reported.
        train_metrics = train_info["metrics"][-1]
        package_info = meta_info["package_info"]

        # extract the fields that make most sense
        results.append(
            {
                "task_name": task_name,
                "experiment_name": run_info["experiment_name"],
                "model_id": run_info["train_config"]["model_id"],
                "train_config": run_info["train_config"],
                "peft_type": peft_type,
                "peft_config": peft_config,
                "cuda_memory_reserved_avg": train_info["cuda_memory_reserved_avg"],
                "cuda_memory_max": train_info["cuda_memory_max"],
                "cuda_memory_reserved_99th": train_info["cuda_memory_reserved_99th"],
                "total_time": run_info["total_time"],
                "train_time": train_info["train_time"],
                "file_size": train_info["file_size"],
                "test_accuracy": train_metrics["test accuracy"],
                "train_loss": train_metrics["train loss"],
                "train_samples": train_metrics["train samples"],
                "train_total_tokens": train_metrics["train total tokens"],
                "peft_version": package_info["peft-version"],
                "peft_branch": run_info["peft_branch"],
                "transformers_version": package_info["transformers-version"],
                "datasets_version": package_info["datasets-version"],
                "torch_version": package_info["torch-version"],
                "bitsandbytes_version": package_info["bitsandbytes-version"],
                "package_info": package_info,
                "system_info": meta_info["system_info"],
                "created_at": run_info["created_at"],
            }
        )

    if skipped:
        print_fn(f"Skipped {skipped} of {total} entries because the train status != success")

    return results