in mm_dst/gpt2_dst/utils/convert.py [0:0]
def _ensure_parent_dir(file_path):
    """Create the parent directory of *file_path* if it has one and it is missing."""
    directory = os.path.dirname(file_path)
    # Guard: os.makedirs('') raises FileNotFoundError when the path is a bare
    # filename with no directory component.
    if directory:
        os.makedirs(directory, exist_ok=True)


def convert_json_to_flattened(
        input_path_json,
        output_path_predict,
        output_path_target,
        len_context=2,
        use_multimodal_contexts=True,
        input_path_special_tokens='',
        output_path_special_tokens=''):
    """Flatten a JSON dialog corpus into line-per-turn prediction/target files.

    Input: JSON representation of the dialogs.
    Output: line-by-line stringified representation of each turn, written to
    ``output_path_predict`` (model inputs) and ``output_path_target``
    (model outputs).

    Args:
        input_path_json: path to the JSON file holding ``dialogue_data``.
        output_path_predict: output text file for the flattened inputs.
        output_path_target: output text file for the flattened targets.
        len_context: number of most-recent context turns to keep per example.
        use_multimodal_contexts: if True, append the stringified visual
            objects of each turn to its context.
        input_path_special_tokens: optional JSON file with an existing
            special-token mapping; when empty, a default mapping is built.
        output_path_special_tokens: optional path; when non-empty, newly seen
            acts and slot names (OOVs) are appended to the special tokens and
            the mapping is dumped there.
    """
    with open(input_path_json, 'r') as f_in:
        data = json.load(f_in)['dialogue_data']

    # Load the special-token mapping, or build the default one.
    if input_path_special_tokens != '':
        with open(input_path_special_tokens, 'r') as f_in:
            special_tokens = json.load(f_in)
    else:
        special_tokens = {
            "eos_token": END_OF_SENTENCE,
            "additional_special_tokens": [END_OF_BELIEF],
        }
        if use_multimodal_contexts:
            # Extend the default set in place instead of rebuilding the whole
            # dict with the base entries duplicated.
            # NOTE(review): the original (whitespace-mangled) source made it
            # ambiguous whether this applied only to the default mapping or
            # also overwrote one loaded from file; assuming default-only, which
            # preserves user-supplied token files — confirm against upstream.
            special_tokens["additional_special_tokens"].extend(
                [START_OF_MULTIMODAL_CONTEXTS, END_OF_MULTIMODAL_CONTEXTS])

    # If a new output path for special tokens is given, we track new OOVs.
    track_oov = output_path_special_tokens != ''
    oov = set()

    predicts = []
    targets = []
    for dialog in data:
        prev_asst_uttr = None
        lst_context = []
        for turn in dialog[FIELDNAME_DIALOG]:
            user_uttr = turn[FIELDNAME_USER_UTTR].replace('\n', ' ').strip()
            user_belief = turn[FIELDNAME_BELIEF_STATE]
            asst_uttr = turn[FIELDNAME_ASST_UTTR].replace('\n', ' ').strip()

            # Format main input context: previous system turn (if any)
            # followed by the current user turn.
            context = ''
            if prev_asst_uttr:
                context += f'System : {prev_asst_uttr} '
            context += f'User : {user_uttr}'
            prev_asst_uttr = asst_uttr

            # Add multimodal contexts.
            if use_multimodal_contexts:
                visual_objects = turn[FIELDNAME_VISUAL_OBJECTS]
                context += ' ' + represent_visual_objects(visual_objects)

            # Concat with the last `len_context` turns of history.
            lst_context.append(context)
            context = ' '.join(lst_context[-len_context:])

            # Format belief state: one "<act> [ k = v, ... ]" chunk per frame.
            belief_state = []
            for bs_per_frame in user_belief:
                str_belief_state_per_frame = "{act} [ {slot_values} ]".format(
                    act=bs_per_frame['act'].strip(),
                    slot_values=', '.join(
                        f'{kv[0].strip()} = {kv[1].strip()}'
                        for kv in bs_per_frame['slots']))
                belief_state.append(str_belief_state_per_frame)

                # Track OOVs: raw (unstripped) acts and slot names, matching
                # the original behavior.
                if track_oov:
                    oov.add(bs_per_frame['act'])
                    for kv in bs_per_frame['slots']:
                        oov.add(kv[0])

            str_belief_state = ' '.join(belief_state)

            # Format the main input.
            predicts.append(TEMPLATE_PREDICT.format(
                context=context,
                START_BELIEF_STATE=START_BELIEF_STATE,
            ))
            # Format the main output.
            targets.append(TEMPLATE_TARGET.format(
                context=context,
                START_BELIEF_STATE=START_BELIEF_STATE,
                belief_state=str_belief_state,
                END_OF_BELIEF=END_OF_BELIEF,
                response=asst_uttr,
                END_OF_SENTENCE=END_OF_SENTENCE,
            ))

    # Output into text files, creating parent directories as needed.
    _ensure_parent_dir(output_path_predict)
    with open(output_path_predict, 'w') as f_predict:
        f_predict.write('\n'.join(predicts))

    _ensure_parent_dir(output_path_target)
    with open(output_path_target, 'w') as f_target:
        f_target.write('\n'.join(targets))

    if track_oov:
        _ensure_parent_dir(output_path_special_tokens)
        with open(output_path_special_tokens, 'w') as f_special_tokens:
            # Add OOVs (acts and slot names, etc.) to special tokens as well.
            special_tokens['additional_special_tokens'].extend(list(oov))
            json.dump(special_tokens, f_special_tokens)