lmms_eval/evaluator.py (403 lines of code) (raw):

import random
import itertools
import json
import collections
import sys
import inspect

from tqdm import tqdm

import torch
import logging
import numpy as np
from datasets import Image, Sequence

import lmms_eval.api
import lmms_eval.tasks
import lmms_eval.models
import lmms_eval.api.metrics
import lmms_eval.api.registry

from lmms_eval.utils import (
    positional_deprecated,
    run_task_tests,
    make_table,
    create_iterator,
    get_git_commit_hash,
    simple_parse_args_string,
)

eval_logger = logging.getLogger("lmms-eval")


@positional_deprecated
def simple_evaluate(
    model,
    model_args=None,
    tasks=[],
    num_fewshot=None,
    batch_size=None,
    device=None,
    limit=None,
    bootstrap_iters: int = 100000,
    check_integrity: bool = False,
    show_task_to_terminal: bool = False,
    log_samples: bool = True,
    gen_kwargs: str = None,
    cli_args=None,  # Bo: put args into more functions (cost 48 Bytes per call)
):
    """Instantiate and evaluate a model on a list of tasks.

    :param model: Union[str, LMM]
        Name of model or LMM object, see lmms_eval.models.get_model
    :param model_args: Optional[str]
        String arguments for each model class, see LMM.create_from_arg_string.
        Ignored if `model` argument is a LMM object.
    :param tasks: list[Union[str, Task]]
        List of task names or Task objects. Task objects will be taken to have name
        task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise.
    :param num_fewshot: int
        Number of examples in few-shot context
    :param batch_size: int or str, optional
        Batch size for model
    :param device: str, optional
        PyTorch device (e.g. "cpu" or "cuda:0") for running models
    :param limit: int or float, optional
        Limit the number of examples per task (only use this for testing).
        If <1, limit is a percentage of the total number of examples.
    :param bootstrap_iters:
        Number of iterations for bootstrap statistics
    :param check_integrity: bool
        Whether to run the relevant part of the test suite for the tasks
    :param show_task_to_terminal: bool
        If True, write out an example document and model input for checking task integrity
    :param log_samples: bool
        If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
    :param gen_kwargs: str
        String arguments for model generation.
        Ignored for all tasks with loglikelihood output_type.
    :return
        Dictionary of results
    """
    random.seed(0)
    np.random.seed(1234)
    torch.manual_seed(1234)  # TODO: this may affect training runs that are run with evaluation mid-run.

    assert tasks != [], "No tasks specified, or no tasks found. Please verify the task names."

    if gen_kwargs:
        gen_kwargs = simple_parse_args_string(gen_kwargs)
        eval_logger.warning(f"generation_kwargs specified through cli, these settings will be used over set parameters in yaml tasks.")
        if gen_kwargs == "":
            gen_kwargs = None

    if model_args is None:
        model_args = ""

    lm = lmms_eval.api.registry.get_model(model).create_from_arg_string(
        model_args,
        {
            "batch_size": batch_size,
            "device": device,
        },
    )

    task_dict = lmms_eval.tasks.get_task_dict(tasks, model_name=model)
    for task_name in task_dict.keys():
        task_obj = task_dict[task_name]
        if type(task_obj) == tuple:
            group, task_obj = task_obj
            if task_obj is None:
                continue
        lm.task_dict[task_name] = task_obj.dataset

        config = task_obj._config
        if config["output_type"] == "generate_until" and gen_kwargs:
            config["generation_kwargs"].update(gen_kwargs)

        if num_fewshot is not None:
            if config["num_fewshot"] == 0:
                eval_logger.info(f"num_fewshot has been set to 0 for {task_name} in its config. Manual configuration will be ignored.")
            else:
                default_num_fewshot = config["num_fewshot"]
                eval_logger.warning(f"Overwriting default num_fewshot of {task_name} from {default_num_fewshot} to {num_fewshot}")
                task_obj._config["num_fewshot"] = num_fewshot

    if check_integrity:
        run_task_tests(task_list=tasks)

    results = evaluate(
        lm=lm,
        task_dict=task_dict,
        limit=limit,
        bootstrap_iters=bootstrap_iters,
        show_task_to_terminal=show_task_to_terminal,
        log_samples=log_samples,
        cli_args=cli_args,
    )

    if lm.rank == 0:
        # add info about the model and few shot config
        results["model_configs"] = {
            "model": model if isinstance(model, str) else model.model.config._name_or_path,
            "model_args": model_args,
            "batch_size": batch_size,
            "device": device,
            "limit": limit,
            "bootstrap_iters": bootstrap_iters,
            "gen_kwargs": gen_kwargs,
        }
        results["git_hash"] = get_git_commit_hash()
        return results
    else:
        return None
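
# Example usage (illustrative sketch only): a programmatic caller might invoke
# simple_evaluate roughly as below. The model name, model_args string, and task
# name are placeholders and depend on what is registered with lmms_eval on the
# installed system; they are not prescribed by this module.
#
#   results = simple_evaluate(
#       model="llava",                                      # placeholder registered model name
#       model_args="pretrained=liuhaotian/llava-v1.5-7b",   # placeholder arg string
#       tasks=["mme"],                                      # placeholder task name
#       batch_size=1,
#       device="cuda:0",
#       limit=8,                                            # small limit for smoke-testing
#       log_samples=True,
#   )
#   if results is not None:  # non-zero ranks return None
#       print(results["results"])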

decontaminate_suffix = "_decontaminate"


@positional_deprecated
def evaluate(
    lm,
    task_dict,
    limit=None,
    bootstrap_iters: int = 100000,
    show_task_to_terminal: bool = False,
    log_samples: bool = True,
    cli_args=None,
):
    """Instantiate and evaluate a model on a list of tasks.

    :param lm: obj
        Language Model
    :param task_dict: dict[str, Task]
        Dictionary of tasks. Tasks will be taken to have name type(task).config.task.
    :param limit: int, optional
        Limit the number of examples per task (only use this for testing)
    :param bootstrap_iters:
        Number of iterations for bootstrap statistics
    :param show_task_to_terminal: bool
        If True, write out an example document and model input for checking task integrity
    :param log_samples: bool
        If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
    :return
        Dictionary of results
    """
    # stores the final result for each task, for each metric/filter pair.
    results = collections.defaultdict(dict)
    # Tracks each task's version.
    versions = collections.defaultdict(dict)
    # Tracks the YAML configs of all chosen tasks.
    configs = collections.defaultdict(dict)
    # logs info about each document evaluated.
    samples = collections.defaultdict(list)
    # tracks all Instances/requests a model must generate output on.
    requests = collections.defaultdict(list)
    # Aggregated task scores presented with groups
    results_agg = collections.defaultdict(dict)
    # Aggregated groups scores only
    groups_agg = collections.defaultdict(dict)
    # stores the amount to pad out reqs per req. type so that
    # number of fwd passes per distributed rank is equal
    padding_requests = collections.defaultdict(int)
    # store the hierarchy to do proper ordering
    task_hierarchy = collections.defaultdict(list)
    # store the ordering of tasks and groups
    task_order = collections.defaultdict(int)
    task_group_alias = collections.defaultdict(dict)
    # store num-fewshot value per task
    num_fewshot = collections.defaultdict(int)

    # get lists of each type of request
    for task_name, task in task_dict.items():
        if type(task) == tuple:
            group_name, task = task
            task_hierarchy[group_name].append(task_name)
            versions[group_name] = "N/A"
        else:
            group_name = None
            task_hierarchy[task_name] = []

        if task is None:
            continue

        versions[task_name] = task.VERSION
        configs[task_name] = dict(task.dump_config())

        if "num_fewshot" in configs[task_name]:
            n_shot = configs[task_name]["num_fewshot"]
        else:
            n_shot = 0
        num_fewshot[task_name] = n_shot

        if "task_alias" in configs[task_name]:
            task_group_alias[task_name] = configs[task_name]["task_alias"]

        if ("group_alias" in configs[task_name]) and (group_name not in task_group_alias) and (group_name is not None):
            task_group_alias[group_name] = configs[task_name]["group_alias"]

        if limit is not None:
            if task.has_test_docs():
                task_docs = task.test_docs()
            elif task.has_validation_docs():
                task_docs = task.validation_docs()
            else:
                raise RuntimeError("Task has neither test_docs nor validation_docs")
            limit = int(len(task_docs) * limit) if limit < 1.0 else int(limit)

        task.build_all_requests(limit=limit, rank=lm.rank, world_size=lm.world_size)

        eval_logger.debug(f"Task: {task_name}; number of requests on rank {lm.rank}: {len(task.instances)}")

        if show_task_to_terminal:
            for inst in task.instances:
                # print the prompt for the first few documents
                if inst.doc_id < 1:
                    eval_logger.info(
                        f"Task: {task_name}; document {inst.doc_id}; context prompt (starting on next line):\
\n{inst.args[0]}\n(end of prompt on previous line)\ntarget string or answer choice index (starting on next line):\n{task.doc_to_target(inst.doc)}\n(end of target on previous line)"
                    )
                    eval_logger.info(f"Request: {str(inst)}")

        # aggregate Instances by LMM method requested to get output.
        for instance in task.instances:
            reqtype = instance.request_type
            requests[reqtype].append(instance)

        if lm.world_size > 1:
            instances_rnk = torch.tensor(len(task._instances), device=lm.device)
            gathered_item = lm.accelerator.gather(instances_rnk).cpu().detach().numpy().tolist()
            # compute number of pseudobatches to pad with (FSDP/DDP require even batches among ranks)
            numpad = max(gathered_item) - gathered_item[lm.rank]
            padding_requests[task.OUTPUT_TYPE] += numpad

    ### Run LMM on inputs, get all outputs ###
    # execute each type of request
    for reqtype, reqs in requests.items():
        eval_logger.info("Running {} requests".format(reqtype))
        # create `K` copies of each request `req` based off `K = req.repeats`
        cloned_reqs = []
        for req in reqs:
            cloned_reqs.extend([req] * req.repeats)

        if (lm.world_size > 1) and (padding_requests[reqtype] > 0):
            for _ in range(padding_requests[reqtype]):
                cloned_reqs.extend([req] * req.repeats)

        # run requests through model
        resps = getattr(lm, reqtype)(cloned_reqs)

        # put responses from model into a list of length K for each request.
        for x, req in zip(resps, cloned_reqs):
            req.resps.append(x)

        if lm.world_size > 1:
            lm.accelerator.wait_for_everyone()
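
    # Note on the padding above (illustrative numbers, not from the original source): with
    # world_size == 2, if one rank built 12 instances of a given output type and the other
    # built 10, gathering the per-rank counts yields [12, 10], so the smaller rank computes
    # numpad = 12 - 10 = 2 and appends two extra cloned requests. Every rank then performs
    # the same number of forward passes, which the FSDP/DDP collectives noted above require.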

    ### Postprocess outputs ###
    # TODO: del model here, maybe (idea: allow user to specify device of e.g. reward model separately)
    for task_name, task in task_dict.items():
        if type(task) == tuple:
            group, task = task
        if task is None:
            continue
        task.apply_filters()

    ### Collect values of metrics on all datapoints ###
    vals = collections.defaultdict(list)

    # unpack results and sort back in order and return control to Task
    for task_name, task in task_dict.items():
        if type(task) == tuple:
            group, task = task
        if task is None:
            continue

        # TODO: make it possible to use a different metric per filter
        # iterate over different filters used
        for key in task.instances[0].filtered_resps.keys():
            # hack: remove image columns to avoid loading images and speed up postprocessing
            # reason: doc_iterator will actually load the image if it's in the doc.
            docs = task.test_docs() if task.has_test_docs() else task.validation_docs()
            if "d170" not in task_name and "dc100" not in task_name and "dc200" not in task_name:
                remove_cols = []
                features = docs.features
                # If it is an Image instance or a Sequence of Image instances, remove it
                for feature in features:
                    if isinstance(features[feature], Image):
                        remove_cols.append(feature)
                    elif isinstance(features[feature], Sequence) and isinstance(features[feature].feature, Image):
                        remove_cols.append(feature)
                if remove_cols:
                    docs = docs.remove_columns(remove_cols)
            doc_iterator = itertools.islice(enumerate(docs), lm.rank, limit, lm.world_size)
            # Instead of converting the iterator to a list, use `itertools.tee` to create a parallel iterator for counting:
            # doc_iterator, doc_iterator_for_counting = itertools.tee(doc_iterator)
            # Don't use the above: it would crash if doc_iterator_for_counting contains too many objects, and it is very slow.
            doc_iterator_for_counting = itertools.islice(range(len(task.test_docs())), lm.rank, limit, lm.world_size) if task.has_test_docs() else itertools.islice(range(len(task.validation_docs())), lm.rank, limit, lm.world_size)
            total_docs = sum(1 for _ in doc_iterator_for_counting)
            pbar = tqdm(total=total_docs, desc=f"Postprocessing", disable=(lm.rank != 0))
            for doc_id, doc in doc_iterator:
                # subset instances to only this document id ; sort by idx
                requests = list(filter(lambda x: x.doc_id == doc_id, task.instances))
                requests.sort(key=lambda x: x.idx)
                metrics = task.process_results(doc, [req.filtered_resps[key] for req in requests])
                if log_samples:
                    target = task.doc_to_target(doc)
                    example = {
                        "doc_id": doc_id,
                        "target": target,
                        "doc": doc,
                        "arguments": [tuple(a for a in req.args if isinstance(a, (int, str))) for req in requests],  # do not include image
                        "resps": [req.resps for req in requests],
                        "filtered_resps": [req.filtered_resps[key] for req in requests],
                    }
                    example.update(metrics)
                    samples[task_name].append(example)
                for metric, value in metrics.items():
                    vals[(task_name, key, metric)].append(value)
                pbar.update(1)

            pbar.close()

    if lm.world_size > 1:
        # if multigpu, then gather data across all ranks
        # first gather logged samples across all ranks
        for task_name, task_samples in list(samples.items()):
            full_samples = [None] * lm.world_size
            torch.distributed.all_gather_object(full_samples, task_samples)
            samples[task_name] = list(itertools.chain.from_iterable(full_samples))

        # then collect metrics across all ranks
        vals_torch = collections.defaultdict(list)
        for (task_name, key, metric), items in vals.items():
            numitem = 0
            if type(items[0]) == tuple:
                numitem = len(items[0])

            if isinstance(items[0], (str, list, dict)):
                # handle the string case
                gathered_items = [None] * lm.accelerator.num_processes
                torch.distributed.all_gather_object(gathered_items, items)

                gathered_item = list(itertools.chain.from_iterable(gathered_items))
            else:
                # distributed gather requires all ranks to have the same dimensions,
                # so we pad out with the float32 min value
                pad_value = torch.finfo(torch.float32).min
                metrics_tensor = torch.tensor(items, device=lm.device)

                original_dtype = metrics_tensor.dtype  # store original dtype
                torch_device_tensor = lm.accelerator.pad_across_processes(metrics_tensor.to(torch.float32), pad_index=pad_value)
                gathered_item = lm.accelerator.gather(torch_device_tensor)

                if numitem > 0:
                    gathered_filtered = gathered_item[gathered_item[:, 0] != pad_value]
                else:
                    gathered_filtered = gathered_item[gathered_item != pad_value]

                gathered_item = gathered_filtered.to(original_dtype).cpu().detach().numpy().tolist()
                # reconvert if we were passed a tuple of values
                if numitem > 0:
                    gathered_item = [tuple(g) for g in gathered_item]

            if lm.rank == 0:
                vals_torch[(task_name, key, metric)] = gathered_item

        vals = vals_torch
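
    # At this point, on the aggregating rank (rank 0), vals maps
    # (task_name, filter_key, metric_name) -> list of per-document metric values, gathered
    # from all ranks when world_size > 1. Downstream, aggregated scores are stored under
    # "<metric>,<filter_key>" keys (for example a hypothetical results["some_task"]["exact_match,none"]
    # if the default filter is named "none"), with matching "<metric>_stderr,<filter_key>"
    # keys for bootstrap standard errors.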

    if lm.rank == 0:
        ### Get task ordering for correct sample-wide aggregation
        group_to_task = {}
        for group in task_hierarchy.keys():
            if group not in task_order:
                task_order[group] = 0

            if len(task_hierarchy[group]) > 0:
                group_to_task[group] = task_hierarchy[group].copy()

            for task in task_hierarchy[group]:
                if task in task_order:
                    task_order[task] += 1
                else:
                    task_order[task] = 1 + task_order[group]

                if task in task_hierarchy:
                    group_to_task[group].remove(task)
                    group_to_task[group].extend(task_hierarchy[task])

        task_to_group = {}
        for group in group_to_task:
            for task in group_to_task[group]:
                if task in task_to_group:
                    task_to_group[task].append(group)
                else:
                    task_to_group[task] = [group]

        ### Aggregate results over all datapoints ###
        # aggregate results ; run bootstrap CIs
        for (task_name, key, metric), items in vals.items():
            task = task_dict[task_name]
            metric_key = metric + "," + key

            if type(task) == tuple:
                group_name, task = task
            else:
                group_name = None

            if metric not in task.aggregation():
                continue

            agg_fn = task.aggregation()[metric]

            # Bo: for aggregation functions that need to know the args (e.g. to save to the correct path)
            if inspect.getfullargspec(agg_fn).args == ["results", "args"]:
                results[task_name][metric_key] = agg_fn(items, cli_args)
            else:
                # Bo: for aggregation functions that only need the items
                results[task_name][metric_key] = agg_fn(items)

            results[task_name]["samples"] = len(items)

            # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap,
            # so we run them for fewer iterations. still looking for a cleaner way to do this
            if bootstrap_iters > 0:
                stderr = lmms_eval.api.metrics.stderr_for_metric(
                    metric=task.aggregation()[metric],
                    bootstrap_iters=min(bootstrap_iters, 100) if metric in ["bleu", "chrf", "ter"] else bootstrap_iters,
                )

                if stderr is not None and len(items) > 1:
                    results[task_name][metric + "_stderr" + "," + key] = stderr(items)
                else:
                    results[task_name][metric + "_stderr" + "," + key] = "N/A"

        if bool(results):
            for group, task_list in reversed(task_hierarchy.items()):
                if task_list == []:
                    total_size = results[group]["samples"]
                else:
                    total_size = 0

                    for task in task_list:
                        metrics = results[task]

                        current_size = metrics.pop("samples")
                        # TODO: There should be a way for users
                        #       to toggle between weighted and
                        #       unweighted averaging
                        # For unweighted averaging, use:
                        #     current_size = 1

                        all_stderr = []
                        for metric in [key for key in metrics.keys() if "_stderr" not in key]:
                            stderr = "_stderr,".join(metric.split(","))
                            stderr_score = results[task][stderr]
                            var_score = stderr_score**2 if stderr_score != "N/A" else 0
                            metric_score = results[task][metric]

                            all_stderr.append(stderr)

                            if metric_score is None:
                                results[group][metric] = None
                                results[group][stderr] = 0
                                continue

                            if metric in results[group]:
                                results[group][metric] = (results[group][metric] * total_size + metric_score * current_size) / (total_size + current_size)
                                # $$s_z^2 = \frac{(n-1) s_x^2 + (m-1) s_y^2}{n+m-1} + \frac{nm(\bar x - \bar y)^2}{(n+m)(n+m-1)}.$$
                                results[group][stderr] = ((total_size - 1) * results[group][stderr] + (current_size - 1) * var_score) / (total_size + current_size - 1) + total_size * current_size / (
                                    (total_size + current_size) * (total_size + current_size - 1)
                                ) * (results[group][metric] - metric_score) ** 2
                            else:
                                results[group][metric] = metric_score
                                results[group][stderr] = var_score

                        total_size += current_size

                    for stderr in all_stderr:
                        results[group][stderr] = np.sqrt(results[group][stderr])

                results[group]["samples"] = total_size
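
        # Reading the pooled-variance formula above: n = total_size (samples pooled so far),
        # m = current_size (this task's samples), s_x^2 = the running value held in
        # results[group][stderr], s_y^2 = var_score (the task's squared standard error), and
        # \bar x, \bar y are the group and task means. Note the cross term is evaluated with
        # results[group][metric] after it has already been updated to the combined mean, and
        # np.sqrt at the end converts the pooled variance back into a standard error.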

        def print_tasks(task_hierarchy, task_order, task_version, task_group_alias):
            results_agg = collections.defaultdict(dict)
            groups_agg = collections.defaultdict(dict)
            for group_name, task_list in task_hierarchy.items():
                order = task_order[group_name]
                results_agg[group_name] = results[group_name].copy()
                results_agg[group_name]["tab"] = order

                if (order < max(task_order.values())) and (len(task_list) > 0):
                    groups_agg[group_name] = results[group_name].copy()
                    groups_agg[group_name]["tab"] = order

                if task_list != []:
                    for task in sorted(task_list):
                        if task in task_hierarchy:
                            _task_hierarchy = {task: task_hierarchy[task]}
                        else:
                            _task_hierarchy = {task: []}

                        _results_agg, _groups_agg, task_version = print_tasks(_task_hierarchy, task_order, task_version, task_group_alias)

                        results_agg = {**results_agg, **_results_agg}
                        groups_agg = {**groups_agg, **_groups_agg}

            return results_agg, groups_agg, task_version

        results_agg, groups_agg, versions = print_tasks(task_hierarchy, task_order, versions, task_group_alias)

        for task in results_agg:
            task_results = results_agg[task]

            if "samples" in task_results:
                task_results.pop("samples")

            tab_string = ""
            if "tab" in task_results:
                tab = task_results.pop("tab")
                tab_string = " " * tab + "- " if tab > 0 else ""

            if task in task_group_alias:
                task_alias = task_group_alias[task]
                results_agg[task]["alias"] = tab_string + task_alias
            else:
                results_agg[task]["alias"] = tab_string + task

        for group in groups_agg:
            group_results = groups_agg[group]

            if "samples" in group_results:
                group_results.pop("samples")

            tab_string = ""
            if "tab" in group_results:
                tab = group_results.pop("tab")
                tab_string = " " * tab + "- " if tab > 0 else ""

            if group in task_group_alias:
                group_alias = task_group_alias[group]
                groups_agg[group]["alias"] = tab_string + group_alias
            else:
                groups_agg[group]["alias"] = tab_string + group

        for group_name, task_list in task_hierarchy.items():
            if task_list != []:
                num_fewshot[group_name] = num_fewshot[task_list[0]]

        results_dict = {
            "results": dict(results_agg.items()),
            **({"groups": dict(groups_agg.items())} if bool(groups_agg) else {}),
            "configs": dict(sorted(configs.items())),
            "versions": dict(sorted(versions.items())),
            "n-shot": dict(sorted(num_fewshot.items())),
        }
        if log_samples:
            results_dict["samples"] = dict(samples)

        return results_dict

    else:
        return None
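
# Downstream usage sketch (illustrative; make_table is imported from lmms_eval.utils above,
# and the exact rendering call is assumed rather than guaranteed by this module). A caller
# that wants a human-readable summary of the returned dictionary could do something like:
#
#   output = simple_evaluate(model="llava", tasks=["mme"], limit=8)  # placeholder model/task names
#   if output is not None:  # only rank 0 receives the results dictionary
#       print(make_table(output))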