# split_data.py
def split_data(data, senses):
    """Shuffle and split labeled sense data into train / dev / test splits.

    Eval examples are divided into a "zero-shot" pool (labels seen exactly
    once in the data) and a "few-shot" pool (labels seen more than once),
    each then halved into dev and test. A label contributes at most one
    eval example; all remaining examples go to train.

    Args:
        data: list of (sentence, label, attrib) triples. Shuffled in place.
        senses: mapping label -> sense dict; sense['tags'] is a single tag
            string, comma-space-separated when the sense has multiple tags.

    Returns:
        (train_split, fs_dev_split, zs_dev_split, fs_test_split, zs_test_split),
        each a list of (sentence, label, attrib) triples.
    """
    # size of dataset before splitting
    print(len(data))
    # filter out these words from dev/test sets
    ignored_tags = set(['obsolete', 'rare', 'archaic', 'dated', 'nonstandard', 'vulgar'])  # historical?
    random.shuffle(data)
    # get label support (occurrence count per label) for data
    label_support = {}
    for _, l, _ in data:
        label_support[l] = label_support.get(l, 0) + 1
    train_split = []
    zero_split = []
    few_split = []
    test_labels = set()
    for sent, label, attrib in data:
        # check for ignored tags in that word's sense
        sense = senses[label]
        # BUG FIX: original used `', '.split(sense['tags'])`, which splits the
        # literal string ', ' by the tags string and so never yields the real
        # tag list — split the tag string on ', ' instead.
        if ', ' in sense['tags']:
            sense_tags = sense['tags'].split(', ')
        else:
            sense_tags = [sense['tags']]
        # NOTE(review): removed dead local `key = get_key(label, use_pos=USE_POS)`
        # — its value was never used.
        # if not ignored tags, and this label is not already in an eval set...
        if set(sense_tags).isdisjoint(ignored_tags) and label not in test_labels:
            # put into zero shot or few shot test sets if not full
            # note: only including polysemous words (word+pos) in eval sets
            if label_support[label] == 1 and len(zero_split) < ZERO_SIZE:
                sent = clean_eval_sent(sent)  # so there is only one labeled example per eval sent
                zero_split.append((sent, label, attrib))
                test_labels.add(label)
            elif label_support[label] > 1 and len(few_split) < FEW_SIZE:
                sent = clean_eval_sent(sent)  # so there is only one labeled example per eval sent
                few_split.append((sent, label, attrib))
                test_labels.add(label)
            else:
                train_split.append((sent, label, attrib))
        else:
            train_split.append((sent, label, attrib))
    # split fewshot and zero-shot examples between dev and test
    fs_dev_split = few_split[:FEW_SIZE // 2]
    zs_dev_split = zero_split[:ZERO_SIZE // 2]
    fs_test_split = few_split[FEW_SIZE // 2:]
    zs_test_split = zero_split[ZERO_SIZE // 2:]
    # size of dataset after splitting
    print(len(train_split), len(fs_dev_split), len(zs_dev_split), len(fs_test_split), len(zs_test_split))
    return train_split, fs_dev_split, zs_dev_split, fs_test_split, zs_test_split