in code/run_tacred.py [0:0]
def convert_examples_to_features(examples, label2id, max_seq_length, tokenizer, special_tokens, mode='text'):
    """Loads a data file into a list of `InputBatch`s."""
    # CLS, SEP, logger, and InputFeatures come from the enclosing run_tacred.py module.

    def get_special_token(w):
        # Map each marker (e.g. "SUBJ_START") to a reserved "[unusedN]" vocab token,
        # reusing the same token for repeated markers.
        if w not in special_tokens:
            special_tokens[w] = "[unused%d]" % (len(special_tokens) + 1)
        return special_tokens[w]

    num_tokens = 0
    num_fit_examples = 0
    num_shown_examples = 0
    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d of %d" % (ex_index, len(examples)))

        tokens = [CLS]
        SUBJECT_START = get_special_token("SUBJ_START")
        SUBJECT_END = get_special_token("SUBJ_END")
        OBJECT_START = get_special_token("OBJ_START")
        OBJECT_END = get_special_token("OBJ_END")
        SUBJECT_NER = get_special_token("SUBJ=%s" % example.ner1)
        OBJECT_NER = get_special_token("OBJ=%s" % example.ner2)

        if mode.startswith("text"):
            # "text" / "text_ner": keep the full sentence and bracket the subject and
            # object spans with start/end marker tokens.
            for i, token in enumerate(example.sentence):
                if i == example.span1[0]:
                    tokens.append(SUBJECT_START)
                if i == example.span2[0]:
                    tokens.append(OBJECT_START)
                for sub_token in tokenizer.tokenize(token):
                    tokens.append(sub_token)
                if i == example.span1[1]:
                    tokens.append(SUBJECT_END)
                if i == example.span2[1]:
                    tokens.append(OBJECT_END)
            if mode == "text_ner":
                tokens = tokens + [SEP, SUBJECT_NER, SEP, OBJECT_NER, SEP]
            else:
                tokens.append(SEP)
        else:
            # "ner" / "ner_text": replace the subject and object spans with their
            # NER-type tokens; the original span text is collected separately.
            subj_tokens = []
            obj_tokens = []
            for i, token in enumerate(example.sentence):
                if i == example.span1[0]:
                    tokens.append(SUBJECT_NER)
                if i == example.span2[0]:
                    tokens.append(OBJECT_NER)
                if (i >= example.span1[0]) and (i <= example.span1[1]):
                    for sub_token in tokenizer.tokenize(token):
                        subj_tokens.append(sub_token)
                elif (i >= example.span2[0]) and (i <= example.span2[1]):
                    for sub_token in tokenizer.tokenize(token):
                        obj_tokens.append(sub_token)
                else:
                    for sub_token in tokenizer.tokenize(token):
                        tokens.append(sub_token)
            if mode == "ner_text":
                # Append the original subject and object text after the masked sentence.
                tokens.append(SEP)
                for sub_token in subj_tokens:
                    tokens.append(sub_token)
                tokens.append(SEP)
                for sub_token in obj_tokens:
                    tokens.append(sub_token)
            tokens.append(SEP)
        num_tokens += len(tokens)

        # Truncate to max_seq_length and count how many examples fit without truncation.
        if len(tokens) > max_seq_length:
            tokens = tokens[:max_seq_length]
        else:
            num_fit_examples += 1

        segment_ids = [0] * len(tokens)
        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_ids)
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids += padding
        input_mask += padding
        segment_ids += padding
        label_id = label2id[example.label]

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        # Log up to 20 examples (the first 5, plus any with label_id > 0) for inspection.
        if num_shown_examples < 20:
            if (ex_index < 5) or (label_id > 0):
                num_shown_examples += 1
                logger.info("*** Example ***")
                logger.info("guid: %s" % (example.guid))
                logger.info("tokens: %s" % " ".join([str(x) for x in tokens]))
                logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
                logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
                logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
                logger.info("label: %s (id = %d)" % (example.label, label_id))

        features.append(
            InputFeatures(input_ids=input_ids,
                          input_mask=input_mask,
                          segment_ids=segment_ids,
                          label_id=label_id))
    logger.info("Average #tokens: %.2f" % (num_tokens * 1.0 / len(examples)))
    logger.info("%d (%.2f %%) examples can fit max_seq_length = %d" % (
        num_fit_examples, num_fit_examples * 100.0 / len(examples), max_seq_length))
    return features
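
A minimal, hypothetical driver for the function above, sketching the shapes it expects. ToyTokenizer and ToyExample are illustrative stand-ins rather than part of run_tacred.py, the sample sentence and labels are made up, and running it assumes the module-level CLS, SEP, logger, and InputFeatures from run_tacred.py are in scope.

# Hypothetical usage sketch; ToyTokenizer / ToyExample are stand-ins, not real run_tacred.py classes.
from collections import namedtuple

ToyExample = namedtuple(
    "ToyExample", ["guid", "sentence", "span1", "span2", "ner1", "ner2", "label"])

class ToyTokenizer:
    """Minimal stand-in exposing the two methods the function relies on."""
    def __init__(self):
        self.vocab = {}

    def tokenize(self, word):
        return [word.lower()]          # no real WordPiece splitting

    def convert_tokens_to_ids(self, tokens):
        return [self.vocab.setdefault(t, len(self.vocab)) for t in tokens]

example = ToyExample(
    guid="train-0",
    sentence=["Bill", "Gates", "founded", "Microsoft", "."],
    span1=(0, 1),                      # subject token span, inclusive
    span2=(3, 3),                      # object token span, inclusive
    ner1="PERSON",
    ner2="ORGANIZATION",
    label="org:founded_by",
)
label2id = {"no_relation": 0, "org:founded_by": 1}
special_tokens = {}                    # filled in place with "[unusedN]" markers

features = convert_examples_to_features(
    [example], label2id, max_seq_length=64,
    tokenizer=ToyTokenizer(), special_tokens=special_tokens, mode="text")
print(special_tokens)                  # e.g. {'SUBJ_START': '[unused1]', ...}

With mode="text" the single example becomes [CLS] [unused1] bill gates [unused2] founded [unused3] microsoft [unused4] . [SEP], padded out to max_seq_length.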