in senteval/sst.py [0:0]
def run(self, params, batcher):
    """Embed every SST split with ``batcher`` and fit a classifier on top.

    Each split in ``self.sst_data`` is sorted in place by ``len`` of the
    input (ties broken by label) so that a batch holds inputs of similar
    length, then embedded ``params.batch_size`` inputs at a time. The
    stacked embeddings and labels are handed to ``SplitClassifier``.

    Args:
        params: object exposing ``batch_size``, ``usepytorch`` and
            ``classifier``; it is also forwarded as-is to ``batcher``.
        batcher: callable invoked as ``batcher(params, batch)``; the
            values it returns are row-stacked with ``np.vstack``.

    Returns:
        dict with ``devacc`` and ``acc`` (the two values returned by
        ``SplitClassifier.run``) plus ``ndev`` and ``ntest`` (number of
        embedded rows in the dev and test splits).

    Raises:
        ValueError: if a split holds no examples.
    """
    sst_embed = {'train': {}, 'dev': {}, 'test': {}}
    bsize = params.batch_size

    for key in self.sst_data:
        logging.info('Computing embedding for %s', key)
        split = self.sst_data[key]

        # Sort to reduce padding: neighbours in a batch have similar length.
        sorted_data = sorted(zip(split['X'], split['y']),
                             key=lambda z: (len(z[0]), z[1]))
        if not sorted_data:
            # zip(*[]) yields nothing, which would otherwise surface as an
            # opaque "not enough values to unpack" error below.
            raise ValueError(
                "SST split '{0}' is empty; cannot compute "
                "embeddings".format(key))

        sentences, labels = map(list, zip(*sorted_data))
        # The sorted order is written back so labels stay aligned with the
        # embedding rows built below.
        split['X'], split['y'] = sentences, labels

        batches = [batcher(params, sentences[start:start + bsize])
                   for start in range(0, len(labels), bsize)]
        sst_embed[key] = {'X': np.vstack(batches), 'y': np.array(labels)}
        logging.info('Computed %s embeddings', key)

    config_classifier = {'nclasses': self.nclasses, 'seed': self.seed,
                         'usepytorch': params.usepytorch,
                         'classifier': params.classifier}

    clf = SplitClassifier(X={'train': sst_embed['train']['X'],
                             'valid': sst_embed['dev']['X'],
                             'test': sst_embed['test']['X']},
                          y={'train': sst_embed['train']['y'],
                             'valid': sst_embed['dev']['y'],
                             'test': sst_embed['test']['y']},
                          config=config_classifier)

    devacc, testacc = clf.run()
    # Single literal (no backslash continuation inside the string) so the
    # source indentation can never leak into the logged message.
    logging.debug(
        '\nDev acc : %s Test acc : %s for SST %s classification\n',
        devacc, testacc, self.task_name)

    return {'devacc': devacc, 'acc': testacc,
            'ndev': len(sst_embed['dev']['X']),
            'ntest': len(sst_embed['test']['X'])}