# lm_eval/models/neuralmagic.py
#
# Module-level imports this method relies on (assumed from the surrounding
# module, which is not shown in this excerpt):
import copy
from typing import List

from tqdm import tqdm

from lm_eval import utils
from lm_eval.api.instance import Instance

def generate_until(self, requests: List[Instance]) -> List[str]:
    """
    Generate up to ``max_gen_toks`` new tokens for each request's context,
    stopping early at any of the request's stop sequences.

    This function is an adapted version of the original function from
    https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/openai_completions.py
    """
    if not requests:
        return []
    res = []
    requests = [req.args for req in requests]
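    # Each Instance's ``args`` is a (context, gen_kwargs) tuple, so from here
    # on ``requests`` is a list of (str, dict) pairs.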

    def _collate(x):
        toks = self.tok_encode(x[0])
        return len(toks), x[0]

    re_ord = utils.Reorderer(requests, _collate)
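    # ``Reorderer`` reorders the requests by the ``_collate`` key (token
    # length, then text) so that similarly-sized prompts tend to land in the
    # same batch; ``get_original`` at the end undoes the permutation on the
    # results.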

    def sameuntil_chunks(xs, size):
        ret = []
        lastuntil = xs[0][1]
        for x in xs:
            if len(ret) >= size or x[1] != lastuntil:
                yield ret, lastuntil
                ret = []
                lastuntil = x[1]
            ret.append(x)

        if ret:
            yield ret, lastuntil
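    # ``sameuntil_chunks`` yields batches of at most ``size`` consecutive
    # requests that share identical gen_kwargs, so every sequence in a batch
    # can be generated with one set of stop sequences. E.g. with size=2:
    # [(a, k1), (b, k1), (c, k1), (d, k2)]
    #   -> ([(a, k1), (b, k1)], k1), ([(c, k1)], k1), ([(d, k2)], k2)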

    pbar = tqdm(total=len(requests))
    for chunk, request_args in tqdm(
        list(sameuntil_chunks(re_ord.get_reordered(), self.batch_size))
    ):
        inps = []

        # make a deepcopy since we are changing arguments
        request_args = copy.deepcopy(request_args)

        self._max_gen_toks = request_args.pop("max_gen_toks", self.max_gen_toks)

        for context, _ in chunk:
            # add context (prompts) to the list
            inps.append(context)

        until = request_args.pop("until", ["<|endoftext|>"])
        request_args.pop("do_sample", None)
        request_args["temperature"] = request_args.get("temperature", 0)

        # run inference (generate up to max_gen_toks tokens)
        out = self.model(
            sequences=inps,
            max_new_tokens=self.max_gen_toks - 1,
            stop=until,
            **request_args,
        )
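        # ``out.generations`` is assumed to preserve the order of ``sequences``;
        # the zip below relies on this to pair each generation with its prompt.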

        for resp, (context, args_) in zip(out.generations, chunk):
            text = resp.text
            until_ = until

            # split the text at the first occurrence of any of the until tokens
            for term in until_:
                if len(term) > 0:
                    text = text.split(term)[0]
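            # e.g. with until_ = ["\n\n"], "answer\n\nQ: next" -> "answer";
            # the empty-string check guards against ``str.split("")`` raising
            # a ValueError on an empty separator.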

            res.append(text)

            self.cache_hook.add_partial(
                "generate_until", (context, {"until": until_}), text
            )
            pbar.update(1)

    pbar.close()

    return re_ord.get_original(res)
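
# A minimal usage sketch. Hedged: ``DeepSparseLM`` and the ``Instance``
# constructor arguments below are illustrative guesses, not confirmed by
# this excerpt.
#
#   lm = DeepSparseLM(pretrained="zoo:some/model-stub", batch_size=4)
#   req = Instance(
#       request_type="generate_until",
#       doc={},
#       arguments=("Q: What is 2 + 2?\nA:", {"until": ["\n"], "max_gen_toks": 16}),
#       idx=0,
#   )
#   print(lm.generate_until([req]))  # e.g. [" 4"] (model-dependent)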