in evaluation.py [0:0]
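# Standard-library and third-party imports used by cli_evaluate below.
# Repo-internal helpers (parse_eval_args, cli_evaluate_single, utils,
# eval_logger, WandbLogger, simple_parse_args_string, make_table) are
# expected to come from elsewhere in this project.
import argparse
import datetime
import os
import sys
import traceback
from typing import Union

import torch
import yaml
from accelerate import Accelerator
from accelerate.utils import InitProcessGroupKwargs
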
def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> list:
    """Run one or more evaluations from CLI args (or a YAML config) and return the per-run results."""
    default_args = parse_eval_args()

    if args is None and len(sys.argv) == 1:
        print("┌───────────────────────────────────────────────────────────────────────────────┐")
        print("│ Please provide arguments to evaluate the model. e.g. │")
        print("│ `python evaluation.py --model lusxvr/nanoVLM-450M --tasks mmstar` │")
        print("└───────────────────────────────────────────────────────────────────────────────┘")
        sys.exit(1)

    # If args were provided, override the defaults
    if args:
        for key, value in vars(args).items():
            setattr(default_args, key, value)
    args = default_args
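
    # Configure Weights & Biases logging when --wandb_args is given; auto-generate
    # a run name from the model and timestamp if none was provided.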
    if args.wandb_args:
        if "name" not in args.wandb_args:
            name = f"{args.model}_{args.model_args}_{utils.get_datetime_str(timezone=args.timezone)}"
            name = utils.sanitize_long_string(name)
            args.wandb_args += f",name={name}"
        wandb_logger = WandbLogger(**simple_parse_args_string(args.wandb_args))

    # reset logger
    eval_logger.remove()
    eval_logger.add(sys.stdout, colorize=True, level=args.verbosity)
    eval_logger.info(f"Verbosity set to {args.verbosity}")
    os.environ["VERBOSITY"] = args.verbosity
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    args_list = []
    results_list = []
    if args.config:
        if not os.path.exists(args.config):
            raise ValueError(f"Config file does not exist: {args.config}")

        with open(args.config, "r") as file:
            config_args = yaml.safe_load(file)
        config_args = [config_args] if not isinstance(config_args, list) else config_args
        # multiple configs, create args list first
        for config in config_args:
            args_copy = argparse.Namespace(**vars(args))
            for key, value in config.items():
                setattr(args_copy, key, value)
            args_list.append(args_copy)
    else:
        args_list.append(args)

    # initialize Accelerator only if not already in a distributed context
    if torch.distributed.is_available() and torch.distributed.is_initialized():
        accelerator = None
        is_main_process = torch.distributed.get_rank() == 0
    else:
        kwargs_handler = InitProcessGroupKwargs(timeout=datetime.timedelta(seconds=6000))
        accelerator = Accelerator(kwargs_handlers=[kwargs_handler])
        is_main_process = accelerator.is_main_process
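
    # Run each evaluation sequentially; synchronize all ranks after every run
    # and report to Weights & Biases from the main process only.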
    for args in args_list:
        try:
            results, samples = cli_evaluate_single(args)
            results_list.append(results)

            if accelerator:
                accelerator.wait_for_everyone()
            elif torch.distributed.is_available() and torch.distributed.is_initialized():
                torch.distributed.barrier()

            if is_main_process and args.wandb_args:
                try:
                    wandb_logger.post_init(results)
                    wandb_logger.log_eval_result()
                    if args.wandb_log_samples and samples is not None:
                        wandb_logger.log_eval_samples(samples)
                except Exception as e:
                    eval_logger.info(f"Logging to Weights and Biases failed due to {e}")
        except Exception as e:
            if args.verbosity == "DEBUG":
                raise e
            else:
                traceback.print_exc()
                eval_logger.error(f"Error during evaluation: {e}. Please set `--verbosity=DEBUG` to get more information.")
                results_list.append(None)
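
    # After all runs, print a summary table (plus per-group tables when present)
    # for each evaluation that produced results.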
    for args, results in zip(args_list, results_list):
        # cli_evaluate_single returns None if the process is not the main process (rank 0)
        if results is not None:
            print(f"{args.model} ({args.model_args}), gen_kwargs: ({args.gen_kwargs}), limit: {args.limit}, num_fewshot: {args.num_fewshot}, batch_size: {args.batch_size}")
            print(make_table(results))
            if "groups" in results:
                print(make_table(results, "groups"))
    if args.wandb_args:
        wandb_logger.run.finish()

    return results_list