in evaluation.py [0:0]
def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> tuple:
    selected_task_list = args.tasks.split(",") if args.tasks else None

    if args.include_path is not None:
        eval_logger.info(f"Including path: {args.include_path}")
    task_manager = TaskManager(args.verbosity, include_path=args.include_path, model_name=args.model)
    # update the evaluation tracker args with the output path and the HF token
    if args.output_path:
        args.hf_hub_log_args += f",output_path={args.output_path}"
    if os.environ.get("HF_TOKEN", None):
        args.hf_hub_log_args += f",token={os.environ.get('HF_TOKEN')}"

    evaluation_tracker_args = simple_parse_args_string(args.hf_hub_log_args)
    eval_logger.info(f"Evaluation tracker args: {evaluation_tracker_args}")
    evaluation_tracker = EvaluationTracker(**evaluation_tracker_args)
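    # Sanity-check CLI flag combinations before any expensive work is done.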
    if args.predict_only:
        args.log_samples = True
    if (args.log_samples or args.predict_only) and not args.output_path:
        raise ValueError("Specify --output_path if providing --log_samples or --predict_only")

    if args.fewshot_as_multiturn and args.apply_chat_template is False:
        raise ValueError("If fewshot_as_multiturn is set, apply_chat_template must be set to True.")

    if (args.num_fewshot is None or args.num_fewshot == 0) and args.fewshot_as_multiturn:
        raise ValueError("If fewshot_as_multiturn is set, num_fewshot must be greater than 0.")
if "push_samples_to_hub" in evaluation_tracker_args and not args.log_samples:
eval_logger.warning("Pushing samples to the Hub requires --log_samples to be set. Samples will not be pushed to the Hub.")
if args.limit:
eval_logger.warning(" --limit SHOULD ONLY BE USED FOR TESTING." "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
if os.environ.get("LMMS_EVAL_PLUGINS", None):
args.include_path = [args.include_path] if args.include_path else []
for plugin in os.environ["LMMS_EVAL_PLUGINS"].split(","):
package_tasks_location = importlib.util.find_spec(f"{plugin}.tasks").submodule_search_locations[0]
args.include_path.append(package_tasks_location)
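    # Dispatch on --tasks: the special "list*" values only print task information and exit;
    # anything else is treated as the set of tasks to evaluate.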
    if args.tasks is None:
        eval_logger.error("Need to specify task to evaluate.")
        sys.exit()
    elif args.tasks == "list":
        eval_logger.info("Available Tasks:\n - {}".format("\n - ".join(sorted(task_manager.all_tasks))))
        sys.exit()
    elif args.tasks == "list_groups":
        eval_logger.info(task_manager.list_all_tasks(list_subtasks=False, list_tags=False))
        sys.exit()
    elif args.tasks == "list_tags":
        eval_logger.info(task_manager.list_all_tasks(list_groups=False, list_subtasks=False))
        sys.exit()
    elif args.tasks == "list_subtasks":
        eval_logger.info(task_manager.list_all_tasks(list_groups=False, list_tags=False))
        sys.exit()
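    # "list_with_num" additionally loads every task to count its evaluation documents,
    # which downloads the full datasets.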
elif args.tasks == "list_with_num":
log_message = (
"\n" + "=" * 70 + "\n" + "\n\tYou are trying to check all the numbers in each task." + "\n\tThis action will download the complete dataset." + "\n\tIf the results are not clear initially, call this again." + "\n\n" + "=" * 70
)
eval_logger.info(log_message)
for task_name in sorted(task_manager.list_all_tasks()):
try:
task_dict = get_task_dict([task_name], model_name="llava")
task_obj = task_dict[task_name]
if type(task_obj) == tuple:
group, task_obj = task_obj
if task_obj is None:
continue
eval_logger.info(f"\nTask : {task_obj.config.task}\n - #num : {len(task_obj.test_docs()) if task_obj.has_test_docs() else len(task_obj.validation_docs())}")
except Exception as e:
eval_logger.debug(f"\nTask : {task_name} fail to load \n Exception : \n {e}")
sys.exit()
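    # Resolve the requested tasks: a directory of YAML configs, registered task names,
    # or paths to individual YAML files.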
    else:
        if os.path.isdir(args.tasks):
            import glob

            task_names = []
            yaml_path = os.path.join(args.tasks, "*.yaml")
            for yaml_file in glob.glob(yaml_path):
                config = utils.load_yaml_config(yaml_file)
                task_names.append(config)
        else:
            task_list = args.tasks.split(",")
            task_names = task_manager.match_tasks(task_list)
            for task in [task for task in task_list if task not in task_names]:
                if os.path.isfile(task):
                    config = utils.load_yaml_config(task)
                    task_names.append(config)
            task_missing = [task for task in task_list if task not in task_names and "*" not in task]  # we don't want errors if a wildcard ("*") task name was used

            if task_missing:
                missing = ", ".join(task_missing)
                eval_logger.error(
                    f"Tasks were not found: {missing}\n" f"{utils.SPACING}Try `lmms-eval --tasks list` for list of available tasks",
                )
                raise ValueError(
                    f"Tasks not found: {missing}. Try `lmms-eval --tasks {{list_groups,list_subtasks,list_tags,list}}` to list out all available names for task groupings; only (sub)tasks; tags; or all of the above, or pass '--verbosity DEBUG' to troubleshoot task registration issues."
                )
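    # From here on the selected tasks are fixed; set up request caching, the run timestamp,
    # and the model wrapper. Note: int(args.batch_size) assumes a numeric batch size; a value
    # such as "auto" would raise ValueError here.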
eval_logger.info(f"Selected Tasks: {task_names}")
request_caching_args = request_caching_arg_to_dict(cache_requests=args.cache_requests)
datetime_str = utils.get_datetime_str(timezone=args.timezone)
wrapped_model = NanoVLMWrapper(
model=args.model,
device=args.device,
batch_size=int(args.batch_size),
)
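    # Run the evaluation itself; the four --seed values are unpacked into the python,
    # numpy, torch, and few-shot sampling seeds respectively.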
    results = evaluator.simple_evaluate(
        model=wrapped_model,
        model_args=args.model_args,
        tasks=task_names,
        num_fewshot=args.num_fewshot,
        batch_size=args.batch_size,
        max_batch_size=args.max_batch_size,
        device=args.device,
        use_cache=args.use_cache,
        limit=args.limit,
        check_integrity=args.check_integrity,
        write_out=args.write_out,
        log_samples=args.log_samples,
        evaluation_tracker=evaluation_tracker,
        system_instruction=args.system_instruction,
        apply_chat_template=args.apply_chat_template,
        fewshot_as_multiturn=args.fewshot_as_multiturn,
        gen_kwargs=args.gen_kwargs,
        task_manager=task_manager,
        verbosity=args.verbosity,
        predict_only=args.predict_only,
        random_seed=args.seed[0],
        numpy_random_seed=args.seed[1],
        torch_random_seed=args.seed[2],
        fewshot_random_seed=args.seed[3],
        cli_args=args,
        datetime_str=datetime_str,
        distributed_executor_backend="torchrun" if (torch.distributed.is_available() and torch.distributed.is_initialized()) else "accelerate",
        **request_caching_args,
    )
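    # Persist aggregated results (plus per-task samples when --log_samples is set) and,
    # if configured, refresh the metadata card on the Hub.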
    if results is not None:
        if args.log_samples:
            samples = results.pop("samples")
        else:
            samples = None
        dumped = json.dumps(results, indent=4, default=_handle_non_serializable)
        if args.show_config:
            print(dumped)

        batch_sizes = ",".join(map(str, results["config"]["batch_sizes"]))

        evaluation_tracker.save_results_aggregated(results=results, samples=samples if args.log_samples else None, datetime_str=datetime_str)

        if args.log_samples:
            for task_name, config in results["configs"].items():
                evaluation_tracker.save_results_samples(task_name=task_name, samples=samples[task_name])

        if evaluation_tracker.push_results_to_hub or evaluation_tracker.push_samples_to_hub:
            evaluation_tracker.recreate_metadata_card()

        return results, samples

    return None, None