in eval/lmms_eval_wrapper.py [0:0]
def generate_until(self, requests: List[Instance]) -> List[str]:
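    """Generate free-form continuations for a batch of lmms-eval ``generate_until`` requests.

    Requests are sorted by descending context length, grouped by their generation kwargs,
    batched, run through ``self.model.generate``, and returned in the original request order.
    """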
    res = []

    def _collate(x):
        # The negative sign on len(toks) sorts descending - this has a few advantages:
        # - time estimates will always be overestimates rather than underestimates, which is more useful for planning
        # - the first request in a batch always has the longest (i.e. the padded) context length,
        #   which simplifies the batching logic and, more importantly, makes automatic adaptive batching much easier to implement
        # - any OOMs will happen right away rather than near the end
        toks = self.tokenizer.encode(x[0])
        return -len(toks), x[0]
    pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding")

    # We group requests by their generation_kwargs, so that we don't try to execute
    # e.g. greedy sampling and temp=0.8 sampling in the same batch.
    re_ords = utils.Collator([reg.args for reg in requests], _collate, grouping=True)
    chunks = re_ords.get_batched(n=self.batch_size, batch_fn=None)
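
    # Each chunk is a batch of up to self.batch_size requests that share the same generation kwargs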
    for chunk in chunks:
        contexts, all_gen_kwargs, doc_to_visual, doc_id, task, split = zip(*chunk)
        visuals = [doc_to_visual[0](self.task_dict[task][split][ids]) for ids, task, split in zip(doc_id, task, split)]
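        # Flatten the per-sample visuals into one list and preprocess them;
        # _prepare_visual_input returns a stacked image tensor, or None if the batch has no images.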
        images = self._prepare_visual_input(self.flatten(visuals))
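
        # Build one chat message list per sample; each of its images contributes
        # cfg.mp_image_token_length image tokens at the start of the prompt.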
        messages = []
        for i in range(len(contexts)):
            current_context_str = contexts[i]
            current_visuals_list = visuals[i]  # List of PIL Images for this sample, or None

            num_images_for_item = 0
            if current_visuals_list:  # Check that the list is neither None nor empty
                num_images_for_item = len(current_visuals_list)

            # Prepend image tokens based on the number of images for the current item
            image_tokens_prefix = self.tokenizer.image_token * num_images_for_item * self.model.cfg.mp_image_token_length
            prompt_content = image_tokens_prefix + current_context_str

            # Format the text data as a list of message dictionaries
            messages_for_item = [{"role": "user", "content": prompt_content}]
            messages.append(messages_for_item)

        prompts = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
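        # Left padding keeps every prompt flush with the end of the sequence,
        # so newly generated tokens directly follow the prompt for every sample in the batch.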
        inputs = self.tokenizer(
            prompts,
            return_tensors="pt",
            padding="longest",
            padding_side="left",
            truncation=True,
            max_length=self.max_length,
        )

        input_ids = inputs["input_ids"].to(self.device)
        attention_mask = inputs["attention_mask"].to(self.device)
        # images can be None for text-only batches, so guard the device transfer
        images = images.to(self.device) if images is not None else None

        # Extract generation parameters for the batch.
        # We use the gen_kwargs from the first item in the chunk, assuming they are uniform for the batch;
        # lmms-eval groups requests by gen_kwargs, so this assumption should hold.
        current_gen_kwargs = all_gen_kwargs[0] if all_gen_kwargs else {}
        max_new_tokens = current_gen_kwargs.get("max_new_tokens", 50)
        temperature = current_gen_kwargs.get("temperature", 0.0)  # Default to greedy
        top_p = current_gen_kwargs.get("top_p", 1.0)

        # Greedy decoding is either requested explicitly (do_sample=False) or implied by temperature == 0
        greedy = not current_gen_kwargs.get("do_sample", False) or temperature == 0.0
        # Pass None for temperature/top_p when greedy, as some HF models expect this
        gen_temperature = temperature if not greedy else None
        gen_top_p = top_p if not greedy else None

        # Generate
        generated_ids_batch = self.model.generate(
            input_ids,
            images,
            attention_mask,
            max_new_tokens=max_new_tokens,
            greedy=greedy,
            temperature=gen_temperature,
            top_p=gen_top_p,
        )

        # Decode generated sequences.
        # generated_ids_batch from model.generate usually contains only the generated tokens (the prompt is excluded).
        generated_texts = self.tokenizer.batch_decode(
            generated_ids_batch,
            skip_special_tokens=True,
        )

        res.extend(generated_texts)
        pbar.update(len(contexts))

    pbar.close()
    # re_ords.get_original() sorts the results back into the original order of the requests
    return re_ords.get_original(res)