in src/transformers/pipelines/question_answering.py
def preprocess(self, example, padding="do_not_pad", doc_stride=None, max_question_len=64, max_seq_len=None):
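"""
Tokenize a single `example` (a `SquadExample` or a `{"question": ..., "context": ...}` dict) and yield one
feature dict per span. Long contexts are split into overlapping windows of at most `max_seq_len` tokens with
an overlap of `doc_stride` tokens; each yielded dict contains the framework tensors for one span, the
original `example`, and an `is_last` flag marking the final span.
"""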
# XXX: This is special: args_parser will not handle anything generator- or dataset-like.
# For those we expect the user to send a single valid example, either directly as a SquadExample or as a simple dict,
# so we still need a little sanitization here.
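# e.g. a plain dict of the form {"question": "...", "context": "..."} is wrapped into a SquadExample below.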
if isinstance(example, dict):
example = SquadExample(None, example["question"], example["context"], None, None, None)
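# Resolve defaults: cap the window at 384 tokens (or the model's maximum, if smaller) and use a stride of
# half the window, capped at 128 tokens of overlap between consecutive spans.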
if max_seq_len is None:
max_seq_len = min(self.tokenizer.model_max_length, 384)
if doc_stride is None:
doc_stride = min(max_seq_len // 2, 128)
if doc_stride > max_seq_len:
raise ValueError(f"`doc_stride` ({doc_stride}) is larger than `max_seq_len` ({max_seq_len})")
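# Slow (pure Python) tokenizers go through the legacy squad_convert_examples_to_features helper;
# fast tokenizers are handled below with overflowing tokens and offset mappings.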
if not self.tokenizer.is_fast:
features = squad_convert_examples_to_features(
examples=[example],
tokenizer=self.tokenizer,
max_seq_length=max_seq_len,
doc_stride=doc_stride,
max_query_length=max_question_len,
padding_strategy=PaddingStrategy.MAX_LENGTH,
is_training=False,
tqdm_enabled=False,
)
else:
# Define the side on which we truncate / pad and the question/context ordering
question_first = self.tokenizer.padding_side == "right"
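# With right padding the question is passed as `text` and the context as `text_pair`; with left padding the
# order is reversed. Truncation is always applied to the context so that overlong contexts are split into
# overlapping spans rather than the question being cut.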
encoded_inputs = self.tokenizer(
text=example.question_text if question_first else example.context_text,
text_pair=example.context_text if question_first else example.question_text,
padding=padding,
truncation="only_second" if question_first else "only_first",
max_length=max_seq_len,
stride=doc_stride,
return_token_type_ids=True,
return_overflowing_tokens=True,
return_offsets_mapping=True,
return_special_tokens_mask=True,
)
# When the input is too long, it is converted into a batch of inputs with overflowing tokens
# and a stride of overlap between them. If a batch of inputs were given, a special output
# "overflow_to_sample_mapping" would indicate which member of the encoded batch belongs to which original sample.
# Here we tokenize examples one by one, so we don't need "overflow_to_sample_mapping".
# "num_spans" is the number of output samples generated from the overflowing tokens.
num_spans = len(encoded_inputs["input_ids"])
# p_mask: mask with 1 for tokens that cannot be in the answer (0 for tokens that can be in the answer).
# We put 0 on the tokens from the context and 1 everywhere else (question and special tokens).
p_mask = [
[(tok != 1) if question_first else (tok != 0) for tok in encoded_inputs.sequence_ids(span_id)]
for span_id in range(num_spans)
]
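# Note: sequence_ids() returns None for special tokens, so they are masked (1) here as well;
# the CLS token is unmasked again further below.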
features = []
for span_idx in range(num_spans):
input_ids_span_idx = encoded_inputs["input_ids"][span_idx]
attention_mask_span_idx = (
encoded_inputs["attention_mask"][span_idx] if "attention_mask" in encoded_inputs else None
)
token_type_ids_span_idx = (
encoded_inputs["token_type_ids"][span_idx] if "token_type_ids" in encoded_inputs else None
)
# keep the cls_token unmasked (some models use it to indicate unanswerable questions)
if self.tokenizer.cls_token_id is not None:
cls_indices = np.nonzero(np.array(input_ids_span_idx) == self.tokenizer.cls_token_id)[0]
for cls_index in cls_indices:
p_mask[span_idx][cls_index] = 0
submask = p_mask[span_idx]
features.append(
SquadFeatures(
input_ids=input_ids_span_idx,
attention_mask=attention_mask_span_idx,
token_type_ids=token_type_ids_span_idx,
p_mask=submask,
encoding=encoded_inputs[span_idx],
# We don't use the rest of the values; with a fast tokenizer we could avoid using SquadFeatures and SquadExample entirely.
cls_index=None,
token_to_orig_map={},
example_index=0,
unique_id=0,
paragraph_len=0,
token_is_max_context=0,
tokens=[],
start_position=0,
end_position=0,
is_impossible=False,
qas_id=None,
)
)
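# Convert each span's features into framework tensors (with a batch dimension of 1) and yield them one at a time.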
for i, feature in enumerate(features):
fw_args = {}
others = {}
model_input_names = self.tokenizer.model_input_names + ["p_mask", "token_type_ids"]
for k, v in feature.__dict__.items():
if k in model_input_names:
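# Model inputs get a leading batch dimension and normalized integer dtypes (int32 for TF, int64 for PyTorch).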
if self.framework == "tf":
tensor = tf.constant(v)
if tensor.dtype == tf.int64:
tensor = tf.cast(tensor, tf.int32)
fw_args[k] = tf.expand_dims(tensor, 0)
elif self.framework == "pt":
tensor = torch.tensor(v)
if tensor.dtype == torch.int32:
tensor = tensor.long()
fw_args[k] = tensor.unsqueeze(0)
else:
others[k] = v
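# Everything else (e.g. the tokenizer Encoding and the unused SquadFeatures fields) is passed through as-is;
# `is_last` marks the final span so downstream postprocessing knows when all spans of the example have been seen.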
is_last = i == len(features) - 1
yield {"example": example, "is_last": is_last, **fw_args, **others}