in evals/cli/oaieval.py
def run(args: OaiEvalArguments, registry: Optional[Registry] = None) -> str:
    if args.debug:
        logging.getLogger().setLevel(logging.DEBUG)
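
    # Progress is visible by default unless an explicit --visible flag or a --max_samples cap says otherwise.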
    visible = args.visible if args.visible is not None else (args.max_samples is None)

    if args.max_samples is not None:
        evals.eval.set_max_samples(args.max_samples)
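
    # Use the caller-supplied registry or build a fresh one, then add any extra registry paths from the CLI.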
    registry = registry or Registry()
    if args.registry_path:
        registry.add_registry_paths(args.registry_path)
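
    # Look up the eval specification and fail with the list of known evals if the name is unknown.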
    eval_spec = registry.get_eval(args.eval)
    assert (
        eval_spec is not None
    ), f"Eval {args.eval} not found. Available: {list(sorted(registry._evals.keys()))}"
    def parse_extra_eval_params(
        param_str: Optional[str],
    ) -> Mapping[str, Union[str, int, float]]:
        """Parse a string of the form "key1=value1,key2=value2" into a dict."""
        if not param_str:
            return {}

        def to_number(x: str) -> Union[int, float, str]:
            try:
                return int(x)
            except (ValueError, TypeError):
                pass
            try:
                return float(x)
            except (ValueError, TypeError):
                pass
            return x

        str_dict = dict(kv.split("=") for kv in param_str.split(","))
        return {k: to_number(v) for k, v in str_dict.items()}
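
    # Parse the extra params and merge them into any args already defined on the eval spec.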
    extra_eval_params = parse_extra_eval_params(args.extra_eval_params)
    if eval_spec.args is None:
        eval_spec.args = extra_eval_params
    else:
        eval_spec.args.update(extra_eval_params)

    # If the user provided an argument to --completion_args, parse it into a dict here, to be passed to the completion_fn creation **kwargs
    completion_args = args.completion_args.split(",")
    additional_completion_args = {k: v for k, v in (kv.split("=") for kv in completion_args if kv)}

    completion_fns = args.completion_fn.split(",")
    completion_fn_instances = [
        registry.make_completion_fn(url, **additional_completion_args) for url in completion_fns
    ]
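
    # Capture the run configuration (including the exact command line) so it is stored alongside the results.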
    run_config = {
        "completion_fns": completion_fns,
        "eval_spec": eval_spec,
        "seed": args.seed,
        "max_samples": args.max_samples,
        "command": " ".join(map(shlex.quote, sys.argv)),
        "initial_settings": {
            "visible": visible,
        },
    }

    eval_name = eval_spec.key
    if eval_name is None:
        raise Exception("you must provide an eval name")
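
    # The eval key is dot-separated; its first component is the base eval and its second is the split.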
    run_spec = evals.base.RunSpec(
        completion_fns=completion_fns,
        eval_name=eval_name,
        base_eval=eval_name.split(".")[0],
        split=eval_name.split(".")[1],
        run_config=run_config,
        created_by=args.user,
    )
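
    # Default the record path to a per-run JSONL file under /tmp/evallogs unless one was given explicitly.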
    record_path = (
        f"/tmp/evallogs/{run_spec.run_id}_{args.completion_fn}_{args.eval}.jsonl"
        if args.record_path is None
        else args.record_path
    )
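
    # --http_run and --local_run are mutually exclusive; whichever was requested overrides the other.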
    if args.http_run:
        args.local_run = False
    elif args.local_run:
        args.http_run = False

    recorder = build_recorder(args, run_spec, record_path)
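
    # Disable response caching for API calls when caching was turned off on the CLI.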
    api_extra_options: dict[str, Any] = {}
    if not args.cache:
        api_extra_options["cache_level"] = 0

    run_url = f"{run_spec.run_id}"
    logger.info(_purple(f"Run started: {run_url}"))
    eval_class = registry.get_class(eval_spec)
    eval: Eval = eval_class(
        completion_fns=completion_fn_instances,
        seed=args.seed,
        name=eval_name,
        eval_registry_path=eval_spec.registry_path,
        registry=registry,
        **extra_eval_params,
    )
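
    # Run the eval; token-usage accounting is best-effort and must not prevent results from being reported.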
    result = eval.run(recorder)
    try:
        add_token_usage_to_result(result, recorder)
    except Exception as e:
        logger.error(f"Failed to add token usage to result: {e}. Eval results will be reported and are not affected.")
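
    # Record the final report and log a summary of the results.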
    recorder.record_final_report(result)

    if not (args.dry_run or args.local_run):
        logger.info(_purple(f"Run completed: {run_url}"))

    logger.info("Final report:")
    for key, value in result.items():
        logger.info(f"{key}: {value}")

    return run_spec.run_id