in evals/cli/oaieval.py
def run(args: OaiEvalArguments, registry: Optional[Registry] = None) -> str:
    if args.debug:
        logging.getLogger().setLevel(logging.DEBUG)
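
    # Progress is visible by default unless an explicit --visible flag or a --max_samples cap says otherwise.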
    visible = args.visible if args.visible is not None else (args.max_samples is None)

    if args.max_samples is not None:
        evals.eval.set_max_samples(args.max_samples)
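
    # Use the caller-supplied registry or build a fresh one, then add any extra registry paths from the CLI.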
    registry = registry or Registry()
    if args.registry_path:
        registry.add_registry_paths(args.registry_path)
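
    # Look up the eval specification and fail with the list of known evals if the name is unknown.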
    eval_spec = registry.get_eval(args.eval)
    assert (
        eval_spec is not None
    ), f"Eval {args.eval} not found. Available: {list(sorted(registry._evals.keys()))}"
    def parse_extra_eval_params(
        param_str: Optional[str],
    ) -> Mapping[str, Union[str, int, float]]:
        """Parse a string of the form "key1=value1,key2=value2" into a dict."""
        if not param_str:
            return {}

        def to_number(x: str) -> Union[int, float, str]:
            try:
                return int(x)
            except (ValueError, TypeError):
                pass
            try:
                return float(x)
            except (ValueError, TypeError):
                pass
            return x

        str_dict = dict(kv.split("=") for kv in param_str.split(","))
        return {k: to_number(v) for k, v in str_dict.items()}
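
    # Parse the extra params and merge them into any args already defined on the eval spec.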
    extra_eval_params = parse_extra_eval_params(args.extra_eval_params)
    if eval_spec.args is None:
        eval_spec.args = extra_eval_params
    else:
        eval_spec.args.update(extra_eval_params)

    # If the user provided an argument to --completion_args, parse it into a dict here, to be passed to the completion_fn creation **kwargs
    completion_args = args.completion_args.split(",")
    additional_completion_args = {k: v for k, v in (kv.split("=") for kv in completion_args if kv)}

    completion_fns = args.completion_fn.split(",")
    completion_fn_instances = [
        registry.make_completion_fn(url, **additional_completion_args) for url in completion_fns
    ]
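
    # Capture the run configuration (including the exact command line) so it is stored alongside the results.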
    run_config = {
        "completion_fns": completion_fns,
        "eval_spec": eval_spec,
        "seed": args.seed,
        "max_samples": args.max_samples,
        "command": " ".join(map(shlex.quote, sys.argv)),
        "initial_settings": {
            "visible": visible,
        },
    }

    eval_name = eval_spec.key
    if eval_name is None:
        raise Exception("you must provide an eval name")
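
    # The eval key is dot-separated; its first component is the base eval and its second is the split.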
    run_spec = evals.base.RunSpec(
        completion_fns=completion_fns,
        eval_name=eval_name,
        base_eval=eval_name.split(".")[0],
        split=eval_name.split(".")[1],
        run_config=run_config,
        created_by=args.user,
    )
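
    # Default the record path to a per-run JSONL file under /tmp/evallogs unless one was given explicitly.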
    record_path = (
        f"/tmp/evallogs/{run_spec.run_id}_{args.completion_fn}_{args.eval}.jsonl"
        if args.record_path is None
        else args.record_path
    )
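
    # --http_run and --local_run are mutually exclusive; whichever was requested overrides the other.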
    if args.http_run:
        args.local_run = False
    elif args.local_run:
        args.http_run = False

    recorder = build_recorder(args, run_spec, record_path)
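
    # Disable response caching for API calls when caching was turned off on the CLI.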
    api_extra_options: dict[str, Any] = {}
    if not args.cache:
        api_extra_options["cache_level"] = 0

    run_url = f"{run_spec.run_id}"
    logger.info(_purple(f"Run started: {run_url}"))
    eval_class = registry.get_class(eval_spec)
    eval: Eval = eval_class(
        completion_fns=completion_fn_instances,
        seed=args.seed,
        name=eval_name,
        eval_registry_path=eval_spec.registry_path,
        registry=registry,
        **extra_eval_params,
    )
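
    # Run the eval; token-usage accounting is best-effort and must not prevent results from being reported.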
    result = eval.run(recorder)
    try:
        add_token_usage_to_result(result, recorder)
    except Exception as e:
        logger.error(f"Failed to add token usage to result: {e}. Eval results will be reported and are not affected.")
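
    # Record the final report and log a summary of the results.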
    recorder.record_final_report(result)

    if not (args.dry_run or args.local_run):
        logger.info(_purple(f"Run completed: {run_url}"))

    logger.info("Final report:")
    for key, value in result.items():
        logger.info(f"{key}: {value}")

    return run_spec.run_id