in senteval/sts.py [0:0]
def run(self, params, batcher):
    """Evaluate the batcher on every STS dataset and return correlation scores.

    For each dataset in ``self.datasets``, sentence pairs are encoded in
    mini-batches, a per-pair similarity is computed with
    ``self.similarity``, and Pearson/Spearman correlations against the
    gold scores are recorded.  An aggregate ``'all'`` entry holds the
    unweighted and nsamples-weighted means across datasets.

    Args:
        params: evaluation settings; only ``params.batch_size`` is read here.
        batcher: callable ``(params, batch) -> 2-D array`` of sentence
            encodings, one row per input sentence.

    Returns:
        dict mapping dataset name -> {'pearson', 'spearman', 'nsamples'}
        (the correlation entries are the (statistic, p-value) tuples
        returned by scipy), plus an 'all' entry with 'mean'/'wmean'
        aggregates for each correlation.
    """
    results = {}
    for dataset in self.datasets:
        sys_scores = []
        input1, input2, gs_scores = self.data[dataset]
        for ii in range(0, len(gs_scores), params.batch_size):
            batch1 = input1[ii:ii + params.batch_size]
            batch2 = input2[ii:ii + params.batch_size]
            # we assume get_batch already threw out the faulty ones,
            # so the two sides of a batch must stay aligned
            if len(batch1) == len(batch2) and len(batch1) > 0:
                enc1 = batcher(params, batch1)
                enc2 = batcher(params, batch2)
                # iterate the two encodings in lockstep rather than
                # indexing enc1 by enc2's length
                for e1, e2 in zip(enc1, enc2):
                    sys_scores.append(self.similarity(e1, e2))
        results[dataset] = {'pearson': pearsonr(sys_scores, gs_scores),
                            'spearman': spearmanr(sys_scores, gs_scores),
                            'nsamples': len(sys_scores)}
        # lazy %-args: formatting only happens when DEBUG is enabled
        logging.debug('%s : pearson = %.4f, spearman = %.4f',
                      dataset, results[dataset]['pearson'][0],
                      results[dataset]['spearman'][0])

    # aggregate across datasets, weighting by number of scored pairs
    weights = [results[dset]['nsamples'] for dset in results]
    list_prs = np.array([results[dset]['pearson'][0] for dset in results])
    list_spr = np.array([results[dset]['spearman'][0] for dset in results])

    avg_pearson = np.average(list_prs)
    avg_spearman = np.average(list_spr)
    wavg_pearson = np.average(list_prs, weights=weights)
    wavg_spearman = np.average(list_spr, weights=weights)
    results['all'] = {'pearson': {'mean': avg_pearson,
                                  'wmean': wavg_pearson},
                      'spearman': {'mean': avg_spearman,
                                   'wmean': wavg_spearman}}
    # single-line message strings: the original backslash-continued
    # literals leaked the source indentation into the log output
    logging.debug('ALL (weighted average) : Pearson = %.4f, '
                  'Spearman = %.4f', wavg_pearson, wavg_spearman)
    logging.debug('ALL (average) : Pearson = %.4f, '
                  'Spearman = %.4f\n', avg_pearson, avg_spearman)
    return results