in code/scripts/utils.py [0:0]
def parallel_icsl_transform(sample, vocabulary, label2idx, intent2idx, bert_tokenizer):
    """Turn one parallel sample into id sequences, labels and lengths.

    Parameters
    ----------
    sample : sequence
        Indexed as ``sample[0]`` (source text, passed whole to
        ``bert_tokenizer``), ``sample[1]`` (target text, split on
        whitespace into words), ``sample[2]`` (whitespace-separated slot
        tags, paired word-by-word with ``sample[1]``) and ``sample[3]``
        (intent label).
    vocabulary : mapping
        Looked up with ``vocabulary[tok]`` for every source/target token.
    label2idx, intent2idx
        Passed through to ``label2index`` together with a slot tag or the
        intent label respectively.
    bert_tokenizer : callable
        Called with a string; must return a list of sub-tokens.

    Returns
    -------
    list
        ``[source_ids, target_ids, slot_label_ids, intent_id,
        len(source), len(target)]`` where ``source`` and ``target`` include
        the surrounding ``'[CLS]'`` / ``'[SEP]'`` markers.

    Notes
    -----
    The slot tag list gets no entry for the leading ``'[CLS]'`` and a
    ``PAD`` entry for the trailing ``'[SEP]'``, so it is one element
    shorter than ``target``.
    NOTE(review): this looks like a deliberate one-position shift between
    target tokens and their labels -- confirm against the consumer.
    """
    target = ['[CLS]']
    bert_tags = []
    # zip() stops at the shorter sequence, so surplus words or tags are
    # silently ignored.
    for w, tag in zip(sample[1].split(), sample[2].split()):
        bert_toks = bert_tokenizer(w)
        if not bert_toks:
            # The word produced no sub-tokens, so it must contribute no tag
            # either. Without this guard the 'B' branch below would still
            # emit one tag (a negative repeat count yields an empty list),
            # leaving the tag sequence longer than the tokens it labels.
            continue
        target.extend(bert_toks)
        if tag.startswith('B'):
            # Only the first sub-token keeps the 'B' prefix; the remaining
            # sub-tokens of the same word get the matching 'I' tag.
            cont_tag = 'I' + tag[1:]
            bert_tags.extend([tag] + [cont_tag] * (len(bert_toks) - 1))
        else:
            bert_tags.extend([tag] * len(bert_toks))
    target.append('[SEP]')
    bert_tags.append(PAD)

    source = ['[CLS]'] + bert_tokenizer(sample[0]) + ['[SEP]']

    return [
        # source ids
        [vocabulary[tok] for tok in source],
        # target ids
        [vocabulary[tok] for tok in target],
        # slot labels
        [label2index(label2idx, tag) for tag in bert_tags],
        # intent label
        label2index(intent2idx, sample[3]),
        # source valid length
        len(source),
        # target valid length
        len(target),
    ]