in atis.py [0:0]
def _split_intents(raw_intents, language):
    """Split a raw intent label string into a list of intent names.

    Multiple intents are separated by ``#``.  For ``hi`` and ``tr`` the
    string is first rewritten so that a space-separated short name
    (e.g. ``' airfare'``) becomes ``'#atis_airfare'``; presumably those two
    data files use that shorthand for additional intents -- confirm against
    the data.
    """
    if language in ['hi', 'tr']:
        # Longest suffix first, so the result does not depend on ' flight'
        # happening to be a prefix of ' flight_no'.
        for short_name in ('airfare', 'airline', 'flight_no', 'flight'):
            raw_intents = raw_intents.replace(' ' + short_name, '#atis_' + short_name)
    return raw_intents.split('#')


def _collect_slot_spans(words, tags, prefix=''):
    """Build the context string and the character spans of every slot.

    ``words`` and ``tags`` must have the same length.  Each word is appended
    to ``prefix`` followed by a single space.  Consecutive words whose tags
    carry the same slot name (the tag with its first two characters, i.e.
    ``B-``/``I-``, removed) are merged into one span; an ``O`` tag or a
    different slot name closes the current span.

    Returns:
        A ``(utterance, slot_annotations)`` tuple.  ``utterance`` is the
        context with trailing whitespace stripped.  ``slot_annotations`` maps
        a slot name to a list of dicts with keys ``slot``, ``answer_start``
        and ``answer_stop``; ``answer_stop`` includes the space following the
        last word of the span.
    """
    slot_annotations = {}
    annotation = {}
    prev_slot = ''
    utterance = prefix
    for word, tag in zip(words, tags):
        slot = ''
        if tag != 'O':
            # Remove B- or I-
            slot = tag[2:]
            stop = len(utterance) + len(word) + 1
            if slot != prev_slot:
                # A new span starts here: save the previous one, if any.
                if annotation:
                    slot_annotations.setdefault(annotation['slot'], []).append(annotation)
                annotation = {
                    'slot': slot,
                    'answer_start': len(utterance),
                    'answer_stop': stop
                }
            else:
                # Same slot as the previous word: extend the open span.
                annotation['answer_stop'] = stop
        utterance += word + ' '
        prev_slot = slot
    # Save last annotation
    if annotation:
        slot_annotations.setdefault(annotation['slot'], []).append(annotation)
    return utterance.rstrip(), slot_annotations


def parse(data_file_name, out_file_name, language, single_q):
    """Convert a slot/intent-tagged TSV file into a QA-style JSON file.

    For every utterance returned by ``load_tsv`` a paragraph is created whose
    context is the space-joined utterance (prefixed with ``'yes. no. '`` when
    ``USE_INTENTS`` is set).  ``append_question`` is then called once for each
    slot present in the utterance (with its answer spans), once for each slot
    of ``language`` that is absent (with no answers) and, when
    ``USE_INTENTS`` is set, once per intent with a ``yes``/``no`` answer that
    points into the context prefix.

    Rows whose word and tag counts differ are skipped and counted; the count
    is printed at the end.

    Args:
        data_file_name: Path passed to ``load_tsv``.
        out_file_name: Path of the JSON file to write.
        language: Key used to select questions; ``'hi'`` and ``'tr'`` also
            get special intent-label handling.
        single_q: Forwarded unchanged to ``append_question``.

    Note:
        The question file path is read from the module-level ``args.qas_file``
        rather than from a parameter.
    """
    # JSON text is UTF-8; do not depend on the platform default encoding.
    with open(args.qas_file, 'r', encoding='utf-8') as f:
        questions = json.load(f)
    intent_questions = questions['intents']
    slot_questions = questions['slots']
    qas = {'title': "MultiATIS++", 'paragraphs': []}
    misalignments = 0

    # Load data (the data id column is ignored, as we want an id per question)
    _, utterances, tag_rows, intent_rows = load_tsv(data_file_name)

    # The prefix provides fixed answer spans for intent questions:
    # 'yes' at offset 0 and 'no' at offset 5.
    prefix = 'yes. no. ' if USE_INTENTS else ''

    rows = zip(utterances, tag_rows, intent_rows)
    for words, tags, raw_intents in tqdm(rows, total=len(utterances)):
        if len(tags) != len(words):
            misalignments += 1
            continue

        utterance, slot_annotations = _collect_slot_spans(words, tags, prefix)
        paragraph = {
            'context': utterance,
            'qas': [],
            'slots': []
        }

        # Positive slot questions
        for slot, spans in slot_annotations.items():
            answers = [
                {
                    'text': utterance[span['answer_start']:span['answer_stop']].rstrip(),
                    'answer_start': span['answer_start']
                }
                for span in spans
            ]
            append_question(paragraph, slot_questions, language, False, slot, answers, single_q)

        # Negative slot questions
        for slot in slot_questions[language]:
            if slot not in slot_annotations:
                append_question(paragraph, slot_questions, language, True, slot, [], single_q)

        if USE_INTENTS:
            intents = _split_intents(raw_intents, language)

            # Positive intent questions
            for intent in intents:
                append_question(
                    paragraph, intent_questions, language, False, intent, [{'text': 'yes', 'answer_start': 0}],
                    single_q, is_intent=True
                )

            # Negative intent questions: skip EVERY gold intent, not only the
            # last one, so a multi-intent utterance never receives both a
            # 'yes' and a 'no' question for the same intent.
            for candidate in intent_questions[language]:
                if candidate in intents:
                    continue
                append_question(
                    paragraph, intent_questions, language, True, candidate, [{'text': 'no', 'answer_start': 5}],
                    single_q, is_intent=True
                )

        qas['paragraphs'].append(paragraph)

    print(f'Saving {language}...')
    with open(out_file_name, 'w', encoding='utf-8') as f:
        json.dump(qas, f, indent=4)
    print(f'{language} misalignments: {misalignments} out of {len(utterances)}')