def concatenate()

in server/text_generation_server/models/mamba.py [0:0]
95 lines of code
12 McCabe index (conditional complexity)

    def concatenate(cls, batches: List["MambaBatch"]) -> "MambaBatch":
        """Merge several batches into one new ``MambaBatch``.

        Requests keep the order of ``batches``: the rows of each batch are
        written right after the rows of the batches that precede it, both in
        the merged tensors and in the merged ``requests_idx_mapping``.

        Args:
            batches: Non-empty list of batches to merge. The merged state
                buffers take their block count, inner/conv/state sizes, dtype
                and device from ``batches[0]``; the other batches are assumed
                to match (not checked here).

        Returns:
            A batch built with ``cls(...)`` that reuses ``batch_id`` and
            ``keys_head_dim_last`` from ``batches[0]``. The input batches'
            ``requests_idx_mapping`` dicts are left untouched.

        Raises:
            IndexError: If ``batches`` is empty.
            AssertionError: If a batch has ``inference_params.seqlen_offset``
                equal to 0 (only when assertions are enabled).
        """
        # Used for padding
        total_batch_size = 0
        max_input_length = 0
        padding_right_offset = 0
        for batch in batches:
            total_batch_size += len(batch)
            max_input_length = max(max_input_length, batch.max_input_length)
            padding_right_offset = max(padding_right_offset, batch.padding_right_offset)

        # Batch attributes
        requests = []
        # Always a fresh dict: reusing the first batch's mapping would make the
        # offset writes below leak into that input batch.
        requests_idx_mapping = {}
        input_lengths = []
        prefix_offsets = []
        read_offsets = []
        all_input_ids = []
        next_token_choosers = []
        stopping_criterias = []
        top_n_tokens = []
        max_tokens = 0
        seqlen_offset = 0

        # The first batch serves as the template for the merged state buffers;
        # dimension 1 of both state tensors is the batch dimension.
        first_params = batches[0].inference_params
        (n_blocks, _, d_inner, d_conv) = first_params.conv_states.shape
        (_, _, _, d_state) = first_params.ssm_states.shape
        dtype = first_params.conv_states.dtype
        device = first_params.conv_states.device
        inference_params = new_inference_params(
            n_blocks=n_blocks,
            batch_size=total_batch_size,
            d_state=d_state,
            d_conv=d_conv,
            d_inner=d_inner,
            seqlen_offset=seqlen_offset,
            device=device,
            dtype=dtype,
        )

        # Batch tensors
        # input_ids is always of shape [batch_size, 1]
        # We do not need to pad it
        input_ids = batches[0].input_ids.new_empty((total_batch_size, 1))
        top_n_tokens_tensor = batches[0].top_n_tokens_tensor.new_zeros(
            total_batch_size,
        )

        # Used for slicing correctly inside the tensors
        # Equivalent to a cumsum on batch sizes
        start_index = 0
        for batch in batches:
            requests.extend(batch.requests)
            input_lengths.extend(batch.input_lengths)
            prefix_offsets.extend(batch.prefix_offsets)
            read_offsets.extend(batch.read_offsets)
            all_input_ids.extend(batch.all_input_ids)
            next_token_choosers.extend(batch.next_token_choosers)
            stopping_criterias.extend(batch.stopping_criterias)
            top_n_tokens.extend(batch.top_n_tokens)

            # We need to offset the mapping for each batch by the cumulative
            # batch size (which is 0 for the first batch)
            for k, v in batch.requests_idx_mapping.items():
                requests_idx_mapping[k] = v + start_index

            # Slicing end index for this batch
            end_index = start_index + len(batch)

            # Copy to correct indices
            input_ids[start_index:end_index] = batch.input_ids
            top_n_tokens_tensor[start_index:end_index] = batch.top_n_tokens_tensor

            # Add eventual padding tokens that were added while concatenating
            max_tokens += batch.max_tokens + (
                max_input_length - batch.max_input_length
            ) * len(batch)

            # The merged batch keeps the largest max_seqlen / seqlen_offset
            # seen across the inputs.
            inference_params.max_seqlen = max(
                inference_params.max_seqlen, batch.inference_params.max_seqlen
            )
            assert batch.inference_params.seqlen_offset != 0, "Invalid seqlen offset"
            inference_params.seqlen_offset = max(
                inference_params.seqlen_offset, batch.inference_params.seqlen_offset
            )

            # Copy this batch's recurrent states into its slice of the merged
            # buffers.
            inference_params.conv_states[:, start_index:end_index] = (
                batch.inference_params.conv_states
            )
            inference_params.ssm_states[:, start_index:end_index] = (
                batch.inference_params.ssm_states
            )

            start_index = end_index

        return cls(
            batch_id=batches[0].batch_id,
            requests=requests,
            requests_idx_mapping=requests_idx_mapping,
            input_ids=input_ids,
            all_input_ids=all_input_ids,
            input_lengths=input_lengths,
            prefix_offsets=prefix_offsets,
            read_offsets=read_offsets,
            next_token_choosers=next_token_choosers,
            stopping_criterias=stopping_criterias,
            top_n_tokens=top_n_tokens,
            top_n_tokens_tensor=top_n_tokens_tensor,
            max_input_length=max_input_length,
            padding_right_offset=padding_right_offset,
            keys_head_dim_last=batches[0].keys_head_dim_last,
            max_tokens=max_tokens,
            inference_params=inference_params,
        )