in nlp_amazon_review/GluonNLP_BERT/src/bert/data/transform.py [0:0]
def __call__(self, line):
    """Turn one raw sample into the inputs expected by a BERT model.

    The text fields are handed to ``self._bert_xform``, which (per the
    examples below) is expected to:

    - tokenize the input sequence(s)
    - insert [CLS] and [SEP] where needed
    - build type ids marking each token as part of the first or the
      second sequence
    - compute the valid length

    When ``self.has_label`` is true, the last element of ``line`` is
    treated as the label and every element before it as text.

    Example (sequence pair, a tuple of 3 strings)::

        Inputs:
            text_a: 'is this jacksonville ?'
            text_b: 'no it is not .'
            label:  '0'
        Tokenization:
            text_a: 'is this jack ##son ##ville ?'
            text_b: 'no it is not .'
        Processed:
            tokens: '[CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]'
            type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
            valid_length: 14
            label: 0

    Example (single sequence, a tuple of 2 strings)::

        Inputs:
            text_a: 'the dog is hairy .'
            label:  '1'
        Tokenization:
            text_a: 'the dog is hairy .'
        Processed:
            tokens: '[CLS] the dog is hairy . [SEP]'
            type_ids: 0 0 0 0 0 0 0
            valid_length: 7
            label: 1

    Parameters
    ----------
    line: tuple of str
        Input strings. With labels: (text_a, text_b, label) for sequence
        pairs or (text_a, label) for single sequences. Without labels the
        whole tuple is passed on as text.

    Returns
    -------
    input token ids, valid length, input token type ids
        Exactly what ``self._bert_xform`` returns (presumably 'int32'
        numpy arrays -- confirm against that transform).
    np.array
        Only when ``self.has_label`` is true: a one-element array holding
        the label, created with dtype ``self._label_dtype``. For
        classification this is the id looked up in ``self._label_map``;
        otherwise it is the raw label value.
    """
    # Unlabelled samples contain text only, so the sentence transform's
    # result is returned untouched.
    if not self.has_label:
        return self._bert_xform(line)

    # The label is always the final field; everything before it is text.
    token_ids, length, type_ids = self._bert_xform(line[:-1])
    raw_label = line[-1]
    # Translate to the integer class id only when class labels were
    # configured; otherwise keep the raw value (e.g. a regression target).
    target = self._label_map[raw_label] if self.class_labels else raw_label
    label_array = np.array([target], dtype=self._label_dtype)
    return token_ids, length, type_ids, label_array