in optimum/intel/openvino/modeling_decoder.py [0:0]
def _expand_outputs_for_generation(self, indicies, logits: torch.Tensor, past_key_values: Tuple):
    """Re-index model outputs with ``indicies`` so they line up with the expanded batch.

    Args:
        indicies: Index array applied to ``logits``, to every cached state and
            to the stored beam index. When it holds a single entry nothing is
            re-indexed.
        logits: Model logits; ``logits.shape[0]`` is used as the pre-expansion
            batch size.
        past_key_values: Cached states. Only re-indexed when the model is not
            stateful and the value is truthy.

    Returns:
        A ``(logits, past_key_values)`` pair, re-indexed where applicable.

    Side effects:
        For stateful models, ``self.next_beam_idx`` is re-indexed (or created
        from ``arange(batch_size)``) and ``self._second_iter_beam_search`` is
        set to ``True``.
    """
    original_batch = logits.shape[0]

    # A single index entry means there is nothing to expand.
    if indicies.shape[0] == 1:
        return logits, past_key_values

    logits = logits[indicies]

    if past_key_values and not self.stateful:
        config = self.config
        model_type = config.model_type
        nested_per_layer = model_type not in MULTI_QUERY_ATTN_MODELS or (
            model_type == "falcon" and config.new_decoder_architecture
        )

        if nested_per_layer:
            # chatglm configs without ``rope_ratio`` are indexed on the second
            # axis instead of the first (presumably where the batch axis lives
            # for that layout -- confirm against the exporter).
            index_second_axis = model_type == "chatglm" and not hasattr(config, "rope_ratio")

            def _select(state):
                if index_second_axis:
                    return state[:, indicies, ...]
                return state[indicies]

            past_key_values = tuple(tuple(_select(state) for state in layer) for layer in past_key_values)
        else:
            # Flat layout: one state per entry rather than a tuple per layer.
            past_key_values = tuple(state[indicies] for state in past_key_values)

    if self.stateful:
        # No beam index recorded yet: start from the identity order of the
        # original batch, then apply the same expansion.
        if self.next_beam_idx is None:
            self.next_beam_idx = np.arange(original_batch, dtype=int)[indicies]
        else:
            self.next_beam_idx = self.next_beam_idx[indicies]
        self._second_iter_beam_search = True

    return logits, past_key_values