def load_spouse()

in parsers/Spouse/Spouse_Preprocess.py [0:0]


def _placeholder_for(candidate, placeholder):
    """Return *placeholder* with the trailing space or ``'s`` of *candidate* kept."""
    if candidate.endswith(' '):
        return placeholder + ' '
    if candidate.endswith("'s"):
        return placeholder + "'s"
    return placeholder


def _anonymize(text, candidate1, candidate2, replace1="Alex", replace2="Chris"):
    """Replace both candidate names in *text* with neutral placeholder names."""
    pairs = ((candidate1, replace1), (candidate2, replace2))

    # First replace the full names with Alex and Chris, respectively.
    # The assumption is that, if the 's is present, the full name is used.
    for candidate, placeholder in pairs:
        # An empty candidate must be skipped: str.replace('', x) would insert
        # the placeholder between every character of the text.
        if candidate:
            text = text.replace(candidate, _placeholder_for(candidate, placeholder))

    # Then replace every capitalised part of each name (first name, surname)
    # on its own, because sometimes the full name is not used in the text.
    # NOTE: this is plain substring replacement, so a name part that occurs
    # inside another word is replaced as well.
    for candidate, placeholder in pairs:
        for name in candidate.split(' '):
            if name and name[0].isupper():
                text = text.replace(name, ' ' + placeholder + ' ')

    return text


def load_spouse(data_folder):
    """Build the train/validation/test sets of the spouse data and save them.

    Reads ``spouse_data/spans.csv`` (columns ``id``, ``char_start``,
    ``char_end``) and ``spouse_data/spouse_table.csv`` (columns ``text``,
    ``label``, ``split``, ``person1``, ``person2_id``) from *data_folder*.
    For every row the two person mentions are cut out of the text using
    their spans and replaced by the placeholders "Alex" and "Chris".

    Each split is shuffled and written to
    ``<data_folder>/spouse_<train|validation|test>_set.json``.

    Args:
        data_folder: Directory that contains the ``spouse_data`` folder and
            that receives the generated JSON files.

    Returns:
        A ``(train, validation, test)`` tuple. Each element is a dict with
        the keys ``'positive'`` (label equal to 1) and ``'negative'`` (any
        other label), both holding lists of sample dicts with the keys
        ``parsed``, ``original``, ``candidate-1``, ``candidate-2``,
        ``target`` and ``highlighted``.

    Raises:
        KeyError: If a person id of a sample has no entry in ``spans.csv``.
    """
    spans_df = pd.read_csv(Path(data_folder, 'spouse_data/spans.csv'))
    spouse_df = pd.read_csv(Path(data_folder, 'spouse_data/spouse_table.csv'))

    train_dataset = {'positive': [], 'negative': []}
    validation_dataset = {'positive': [], 'negative': []}
    test_dataset = {'positive': [], 'negative': []}
    # The ``split`` column encodes the target set: 0 train, 1 validation, 2 test.
    dataset_by_split = {0: train_dataset, 1: validation_dataset, 2: test_dataset}

    # Span id -> (start, end) slice bounds. ``char_end`` is incremented by
    # one so that the character at ``char_end`` is part of the slice.
    spans_map = {}
    for _, span in spans_df.iterrows():
        spans_map[int(span['id'])] = (int(span['char_start']), int(span['char_end']) + 1)

    for _, sample in spouse_df.iterrows():
        original_text = sample['text']
        label = int(sample['label'])
        append_to = 'positive' if label == 1 else 'negative'
        split = int(sample['split'])

        # NOTE(review): the two id columns are named asymmetrically
        # ('person1' vs 'person2_id') - confirm against the CSV header.
        char_start1, char_end1 = spans_map[sample['person1']]
        char_start2, char_end2 = spans_map[sample['person2_id']]

        candidate1 = original_text[char_start1:char_end1]
        candidate2 = original_text[char_start2:char_end2]

        dict_to_append = {
            'parsed': _anonymize(original_text, candidate1, candidate2),
            'original': original_text,
            'candidate-1': candidate1,
            'candidate-2': candidate2,
            'target': label,
            'highlighted': 0
        }

        target_dataset = dataset_by_split.get(split)
        if target_dataset is None:
            # Unknown split code: the sample is reported and dropped.
            print("ERROR")
        else:
            target_dataset[append_to].append(dict_to_append)

    for dataset, dataset_type in [(train_dataset, 'train'), (validation_dataset, 'validation'), (test_dataset, 'test')]:
        shuffle(dataset['positive'])
        shuffle(dataset['negative'])

        with open(Path(data_folder, f'spouse_{dataset_type}_set.json'), 'w') as f:
            json.dump(dataset, f)

    return train_dataset, validation_dataset, test_dataset