in src/transformers/pipelines/question_answering.py
def preprocess(self, example, padding="do_not_pad", doc_stride=None, max_question_len=64, max_seq_len=None):
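"""
Tokenize a single `example` (a `SquadExample` or a `{"question": ..., "context": ...}` dict) and yield one
feature dict per span. Long contexts are split into overlapping windows of at most `max_seq_len` tokens with
an overlap of `doc_stride` tokens; each yielded dict contains the framework tensors for one span, the
original `example`, and an `is_last` flag marking the final span.
"""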
# XXX: This is special: args_parser will not handle anything generator- or dataset-like.
# For those we expect the user to send a single valid example, either directly as a SquadExample or as a simple dict,
# so we still need a little sanitization here.
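# e.g. a plain dict of the form {"question": "...", "context": "..."} is wrapped into a SquadExample below.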
if isinstance(example, dict):
example = SquadExample(None, example["question"], example["context"], None, None, None)
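# Resolve defaults: cap the window at 384 tokens (or the model's maximum, if smaller) and use a stride of
# half the window, capped at 128 tokens of overlap between consecutive spans.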
if max_seq_len is None:
max_seq_len = min(self.tokenizer.model_max_length, 384)
if doc_stride is None:
doc_stride = min(max_seq_len // 2, 128)
if doc_stride > max_seq_len:
raise ValueError(f"`doc_stride` ({doc_stride}) is larger than `max_seq_len` ({max_seq_len})")
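# Slow (pure Python) tokenizers go through the legacy squad_convert_examples_to_features helper;
# fast tokenizers are handled below with overflowing tokens and offset mappings.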
if not self.tokenizer.is_fast:
features = squad_convert_examples_to_features(
examples=[example],
tokenizer=self.tokenizer,
max_seq_length=max_seq_len,
doc_stride=doc_stride,
max_query_length=max_question_len,
padding_strategy=PaddingStrategy.MAX_LENGTH,
is_training=False,
tqdm_enabled=False,
)
else:
# Define the side on which we truncate / pad and the question/context ordering
question_first = self.tokenizer.padding_side == "right"
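# With right padding the question is passed as `text` and the context as `text_pair`; with left padding the
# order is reversed. Truncation is always applied to the context so that overlong contexts are split into
# overlapping spans rather than the question being cut.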
encoded_inputs = self.tokenizer(
text=example.question_text if question_first else example.context_text,
text_pair=example.context_text if question_first else example.question_text,
padding=padding,
truncation="only_second" if question_first else "only_first",
max_length=max_seq_len,
stride=doc_stride,
return_token_type_ids=True,
return_overflowing_tokens=True,
return_offsets_mapping=True,
return_special_tokens_mask=True,
)
# When the input is too long, it is converted into a batch of inputs with overflowing tokens
# and a stride of overlap between them. If a batch of inputs were given, a special output
# "overflow_to_sample_mapping" would indicate which member of the encoded batch belongs to which original sample.
# Here we tokenize examples one by one, so we don't need "overflow_to_sample_mapping".
# "num_spans" is the number of output samples generated from the overflowing tokens.
num_spans = len(encoded_inputs["input_ids"])
# p_mask: mask with 1 for tokens that cannot be in the answer (0 for tokens that can be in the answer).
# We put 0 on the tokens from the context and 1 everywhere else (question and special tokens).
p_mask = [
[(tok != 1) if question_first else (tok != 0) for tok in encoded_inputs.sequence_ids(span_id)]
for span_id in range(num_spans)
]
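# Note: sequence_ids() returns None for special tokens, so they are masked (1) here as well;
# the CLS token is unmasked again further below.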
features = []
for span_idx in range(num_spans):
input_ids_span_idx = encoded_inputs["input_ids"][span_idx]
attention_mask_span_idx = (
encoded_inputs["attention_mask"][span_idx] if "attention_mask" in encoded_inputs else None
)
token_type_ids_span_idx = (
encoded_inputs["token_type_ids"][span_idx] if "token_type_ids" in encoded_inputs else None
)
# keep the cls_token unmasked (some models use it to indicate unanswerable questions)
if self.tokenizer.cls_token_id is not None:
cls_indices = np.nonzero(np.array(input_ids_span_idx) == self.tokenizer.cls_token_id)[0]
for cls_index in cls_indices:
p_mask[span_idx][cls_index] = 0
submask = p_mask[span_idx]
features.append(
SquadFeatures(
input_ids=input_ids_span_idx,
attention_mask=attention_mask_span_idx,
token_type_ids=token_type_ids_span_idx,
p_mask=submask,
encoding=encoded_inputs[span_idx],
# We don't use the rest of the values; with a fast tokenizer we could avoid using SquadFeatures and SquadExample entirely.
cls_index=None,
token_to_orig_map={},
example_index=0,
unique_id=0,
paragraph_len=0,
token_is_max_context=0,
tokens=[],
start_position=0,
end_position=0,
is_impossible=False,
qas_id=None,
)
)
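# Convert each span's features into framework tensors (with a batch dimension of 1) and yield them one at a time.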
for i, feature in enumerate(features):
fw_args = {}
others = {}
model_input_names = self.tokenizer.model_input_names + ["p_mask", "token_type_ids"]
for k, v in feature.__dict__.items():
if k in model_input_names:
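# Model inputs get a leading batch dimension and normalized integer dtypes (int32 for TF, int64 for PyTorch).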
if self.framework == "tf":
tensor = tf.constant(v)
if tensor.dtype == tf.int64:
tensor = tf.cast(tensor, tf.int32)
fw_args[k] = tf.expand_dims(tensor, 0)
elif self.framework == "pt":
tensor = torch.tensor(v)
if tensor.dtype == torch.int32:
tensor = tensor.long()
fw_args[k] = tensor.unsqueeze(0)
else:
others[k] = v
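# Everything else (e.g. the tokenizer Encoding and the unused SquadFeatures fields) is passed through as-is;
# `is_last` marks the final span so downstream postprocessing knows when all spans of the example have been seen.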
is_last = i == len(features) - 1
yield {"example": example, "is_last": is_last, **fw_args, **others}