in code/scripts/utils.py [0:0]
def icsl_transform(sample, vocabulary, label2idx, intent2idx, bert_tokenizer):
    """Convert one intent-classification / slot-labelling sample to model inputs.

    Every whitespace-separated word of ``sample[0]`` is split into sub-tokens
    by ``bert_tokenizer`` and the word's slot tag (same position in
    ``sample[1]``) is spread over those sub-tokens: a tag starting with ``B``
    stays on the first sub-token and the remaining sub-tokens get the same tag
    with the leading ``B`` replaced by ``I``; any other tag is repeated
    unchanged. The sequence is wrapped in ``[CLS]`` / ``[SEP]``, both of which
    are labelled with the module-level ``PAD`` tag.

    Parameters
    ----------
    sample : sequence
        ``sample[0]`` is the utterance, ``sample[1]`` the space-separated slot
        tags, ``sample[2]`` the intent label and ``sample[3]`` the example id
        (converted with ``int``).
    vocabulary : mapping
        Indexed with each token string to obtain its id.
    label2idx
        Slot-label mapping, handed to ``label2index`` together with each tag.
    intent2idx
        Intent mapping, handed to ``label2index`` together with ``sample[2]``.
    bert_tokenizer : callable
        Called with a single word; must return a sized sequence of sub-tokens.

    Returns
    -------
    out_sample : list
        ``[eid, token_ids, slot_label_ids, intent_id, valid_len]`` where
        ``token_ids`` and ``slot_label_ids`` both have ``valid_len`` entries.
    tag_alignment : list of int
        For every word, the position of its first sub-token in the
        token / label sequences (position 0 is ``[CLS]``).

    Notes
    -----
    Words and tags are paired with ``zip``; if their counts differ, the
    surplus items of the longer sequence are ignored.
    """
    eid = int(sample[3])
    tag_alignment = []
    bert_tokens = ['[CLS]']
    # Labels must stay position-aligned with bert_tokens, so [CLS] needs a
    # placeholder label exactly like [SEP] gets below. Starting from an empty
    # list would shift every label (and tag_alignment) one slot to the left.
    bert_tags = [PAD]
    for word, tag in zip(sample[0].split(), sample[1].split()):
        tag_alignment.append(len(bert_tags))
        sub_tokens = bert_tokenizer(word)
        bert_tokens.extend(sub_tokens)
        # Exactly one tag per sub-token, even when the tokenizer yields none,
        # so the two sequences never drift apart.
        word_tags = [tag] * len(sub_tokens)
        if tag.startswith('B') and len(word_tags) > 1:
            # Only the first piece opens the slot; the rest continue it.
            word_tags[1:] = ['I' + tag[1:]] * (len(word_tags) - 1)
        bert_tags.extend(word_tags)
    bert_tokens.append('[SEP]')
    bert_tags.append(PAD)

    out_sample = [
        # example id
        eid,
        # token ids
        [vocabulary[tok] for tok in bert_tokens],
        # slot labels
        [label2index(label2idx, tag) for tag in bert_tags],
        # intent label
        label2index(intent2idx, sample[2]),
        # valid length
        len(bert_tokens),
    ]
    return out_sample, tag_alignment