in XLM/src/evaluation/glue.py [0:0]
def load_data(self, task):
    """
    Load regression / classification data for single-sentence and
    sentence-pair GLUE tasks.
    """
    params = self.params
    data = {splt: {} for splt in ['train', 'valid', 'test']}
    dpath = os.path.join(params.data_path, 'eval', task)

    # SST-2 and CoLA are single-sentence tasks; the remaining GLUE tasks are sentence pairs
    self.n_sent = 1 if task in ['SST-2', 'CoLA'] else 2
    for splt in ['train', 'valid', 'test']:

        # load data and dictionary
        data1 = load_binarized(os.path.join(dpath, '%s.s1.pth' % splt), params)
        data2 = load_binarized(os.path.join(dpath, '%s.s2.pth' % splt), params) if self.n_sent == 2 else None
        data['dico'] = data.get('dico', data1['dico'])

        # set dictionary parameters
        set_dico_parameters(params, data, data1['dico'])
        if self.n_sent == 2:
            set_dico_parameters(params, data, data2['dico'])

        # create dataset
        if self.n_sent == 1:
            data[splt]['x'] = Dataset(data1['sentences'], data1['positions'], params)
        else:
            data[splt]['x'] = ParallelDataset(
                data1['sentences'], data1['positions'],
                data2['sentences'], data2['positions'],
                params
            )
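
        # ParallelDataset pairs the two streams by index: example i combines the
        # i-th sentence of s1 with the i-th sentence of s2.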
        # load labels (in this setup, test labels are only available for MRPC)
        if splt != 'test' or task in ['MRPC']:
            # read labels from file
            with open(os.path.join(dpath, '%s.label' % splt), 'r') as f:
                lines = [l.rstrip() for l in f]
            # STS-B is a regression task with similarity scores in [0, 5]
            if task == 'STS-B':
                assert all(0 <= float(x) <= 5 for x in lines)
                y = [float(l) for l in lines]
            # QQP: build the label vocabulary from non-empty labels and map
            # missing / unknown labels to a default class
            elif task == 'QQP':
                UNK_LABEL = 0
                lab2id = {x: i for i, x in enumerate(sorted(set(lines) - set([''])))}
                y = [lab2id.get(x, UNK_LABEL) for x in lines]
            # other tasks: map each label string to a class index
            else:
                lab2id = {x: i for i, x in enumerate(sorted(set(lines)))}
                y = [lab2id[x] for x in lines]
            # keep STS-B scores as floats (regression); store class indices otherwise
            data[splt]['y'] = torch.FloatTensor(y) if task == 'STS-B' else torch.LongTensor(y)
            assert len(data[splt]['x']) == len(data[splt]['y'])
    # compute inverse-frequency class weights for weighted training
    # (lab2id is the label mapping built while loading the labels above)
    if task != 'STS-B' and params.weighted_training:
        weights = torch.FloatTensor([
            1.0 / (data['train']['y'] == i).sum().item()
            for i in range(len(lab2id))
        ]).cuda()
        self.weights = weights / weights.sum()
    else:
        self.weights = None

    return data
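
A minimal sketch of the structure returned by load_data, reconstructed from the code above; the layout below is descriptive only and not part of the source file:

    data = {
        'dico':  dictionary loaded from the binarized .pth files (shared across splits),
        'train': {'x': Dataset or ParallelDataset, 'y': LongTensor (FloatTensor for STS-B)},
        'valid': {'x': ..., 'y': ...},
        'test':  {'x': ...},   # 'y' is also present for MRPC, whose test labels are provided
    }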