in parsers/Spouse/Spouse_Preprocess.py [0:0]
def _anonymize_pair(text, candidate1, candidate2):
    """Replace the two candidate names in *text* with "Alex" and "Chris"."""
    pairs = (("Alex", candidate1), ("Chris", candidate2))

    # First pass: replace each full candidate span with its placeholder,
    # keeping a trailing space or a possessive "'s" that the span included.
    for placeholder, candidate in pairs:
        if not candidate:
            # An empty span has nothing to replace; str.replace('', x) would
            # otherwise insert the placeholder between every character.
            continue
        replacement = placeholder
        if candidate.endswith(' '):
            replacement += ' '
        # Assumes that, if the 's is present, the full name is used.
        if candidate.endswith("'s"):
            replacement += "'s"
        text = text.replace(candidate, replacement)

    # Second pass: the text sometimes mentions only part of a name (e.g. the
    # surname alone), so replace every capitalised token of each candidate.
    for placeholder, candidate in pairs:
        for token in candidate.split(' '):
            if token != '' and token[0].isupper():
                text = text.replace(token, ' ' + placeholder + ' ')
    return text


def load_spouse(data_folder):
    """Build the anonymised train/validation/test splits of the spouse data.

    Reads ``spouse_data/spans.csv`` (columns ``id``, ``char_start``,
    ``char_end``) and ``spouse_data/spouse_table.csv`` (columns ``text``,
    ``label``, ``split``, ``person1``, ``person2_id``) from *data_folder*.
    For every table row the two person spans are cut out of the text and
    replaced with the placeholders "Alex" and "Chris".

    Each split is a dict with the keys ``'positive'`` (label == 1) and
    ``'negative'`` (any other label); both lists are shuffled and the split is
    written to ``spouse_<train|validation|test>_set.json`` in *data_folder*.

    Args:
        data_folder: Directory that contains the ``spouse_data`` folder and
            receives the generated JSON files.

    Returns:
        A ``(train_dataset, validation_dataset, test_dataset)`` tuple.
    """
    spans_df = pd.read_csv(Path(data_folder, 'spouse_data/spans.csv'))
    spouse_df = pd.read_csv(Path(data_folder, 'spouse_data/spouse_table.csv'))

    train_dataset = {'positive': [], 'negative': []}
    validation_dataset = {'positive': [], 'negative': []}
    test_dataset = {'positive': [], 'negative': []}
    # Value of the 'split' column -> dataset that receives the sample.
    dataset_by_split = {0: train_dataset, 1: validation_dataset, 2: test_dataset}

    # 'char_end' is stored inclusive; +1 turns it into an exclusive slice bound.
    spans_map = {
        int(span_id): {'char_start': int(start), 'char_end': int(end) + 1}
        for span_id, start, end in zip(
            spans_df['id'], spans_df['char_start'], spans_df['char_end']
        )
    }

    for _, sample in spouse_df.iterrows():
        original_text = sample['text']
        label = int(sample['label'])
        append_to = 'positive' if label == 1 else 'negative'
        split = int(sample['split'])

        # NOTE(review): the column names are asymmetric ('person1' vs
        # 'person2_id') - confirm against the header of spouse_table.csv.
        span1 = spans_map[sample['person1']]
        span2 = spans_map[sample['person2_id']]
        candidate1 = original_text[span1['char_start']:span1['char_end']]
        candidate2 = original_text[span2['char_start']:span2['char_end']]

        dict_to_append = {
            'parsed': _anonymize_pair(original_text, candidate1, candidate2),
            'original': original_text,
            'candidate-1': candidate1,
            'candidate-2': candidate2,
            'target': label,
            'highlighted': 0
        }

        destination = dataset_by_split.get(split)
        if destination is None:
            # Unknown split code: report it and drop the sample.
            print("ERROR")
        else:
            destination[append_to].append(dict_to_append)

    for dataset, dataset_type in [(train_dataset, 'train'),
                                  (validation_dataset, 'validation'),
                                  (test_dataset, 'test')]:
        shuffle(dataset['positive'])
        shuffle(dataset['negative'])
        with open(Path(data_folder, f'spouse_{dataset_type}_set.json'), 'w') as f:
            json.dump(dataset, f)
    return train_dataset, validation_dataset, test_dataset