def eval_models()

in scripts/regression.py [0:0]
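
Evaluates each model in args.models by shelling out to main.py, optionally after checking out a given git branch; collects each run's JSON output and returns a (results, elapsed_seconds) tuple.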


import json
import os
import time
from pathlib import Path

# Assumes module-level globals defined elsewhere in scripts/regression.py:
# initial_branch, causal_models, seq2seq_models, perplexity_tasks.


def eval_models(args, branch=None):
    # Check out the requested branch; give up with empty results if the
    # checkout fails.
    if branch is not None:
        if os.system(f"git checkout {branch}") != 0:
            return {}, 0

    # Fall back to the branch the script started on.
    branch = branch or initial_branch

    start_time = time.time()

    results = {}

    for model in args.models:
        # Known models map to a fixed backend; anything else falls back to
        # the model type given on the command line.
        model_type = (
            "hf-causal"
            if model in causal_models
            else "hf-seq2seq"
            if model in seq2seq_models
            else args.model
        )
        model_args = f"pretrained={model},{args.model_args}"
        # TODO: split_and_pad_windows in AutoSeq2SeqLM doesn't exist, #527
        tasks = (
            args.tasks
            if model in causal_models or model_type == "hf-causal"
            else list(filter(lambda task: task not in perplexity_tasks, args.tasks))
        )
        # TODO: OOM with auto for seq2seq models, also can OOM with llama
        # Causal models keep the requested batch size; for seq2seq models,
        # "auto" is replaced by a fixed batch size of 64 to avoid OOM.
        batch_size = (
            args.batch_size
            if model in causal_models or model_type == "hf-causal"
            else 64
            if args.batch_size == "auto"
            else args.batch_size
        )
        output_path = (
            f"data/regression/{int(start_time)}-{branch}-{Path(model).name}.json"
        )

        command = (
            f"python3 main.py --model {model_type} --model_args {model_args} --tasks {','.join(tasks)} "
            f"--num_fewshot {args.num_fewshot}{'' if args.limit is None else f' --limit {args.limit}'} "
            f"--batch_size {batch_size} --no_cache --output_path {output_path}"
        )
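        # With hypothetical values (model="gpt2", tasks=["lambada_openai"],
        # args.model_args empty, no --limit), the command built above looks
        # roughly like:
        #   python3 main.py --model hf-causal --model_args pretrained=gpt2,
        #     --tasks lambada_openai --num_fewshot 0 --batch_size auto
        #     --no_cache --output_path data/regression/<start>-<branch>-gpt2.json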

        print(
            f"{'=' * 80}\nEvaluating {model} on {', '.join(tasks)} at {branch} with:\n\n{command}\n{'=' * 80}"
        )

        # os.system returns the shell exit status; a nonzero status means
        # the run failed, so record empty results for that model.
        ret = os.system(command)

        if ret == 0:
            with open(output_path, encoding="utf-8") as f:
                results[model] = json.load(f)
        else:
            results[model] = {"results": {}}

    end_time = time.time()

    return results, end_time - start_time
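
A minimal sketch of how eval_models might be driven, assuming an argparse-style namespace carrying the attributes the body reads; the field values below are hypothetical and the real CLI wiring in scripts/regression.py may differ:

import argparse

# Hypothetical arguments; every field below is read inside eval_models.
args = argparse.Namespace(
    models=["gpt2"],            # models to evaluate
    model="hf-causal",          # fallback type for models not in the known lists
    model_args="",              # extra comma-separated model args
    tasks=["lambada_openai"],   # hypothetical task list
    num_fewshot=0,
    limit=None,                 # or an int to cap examples per task
    batch_size="auto",
)

# Evaluate on the current branch, then on another branch for comparison.
results, elapsed = eval_models(args)
other_results, _ = eval_models(args, branch="main")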