in 07_sparkml/experiment.py [0:0]
def eval(labelpred, threshold=0.7):
    '''
    Summarize how well predictions agree with labels on each side of a cutoff.

    Every record of labelpred is indexed as data = (label, pred):
        data[0] = label
        data[1] = pred

    Records with pred < threshold form the "cancel" group and records with
    pred >= threshold form the "nocancel" group. A record is counted as
    correct when label == int(pred >= threshold).

    Args:
        labelpred: collection of (label, pred) records exposing filter(fn),
            map(fn), count() and sum() -- presumably a Spark RDD; confirm
            against the caller.
        threshold: cutoff applied to pred. Defaults to 0.7.

    Returns:
        dict with keys:
            'rmse': square root of the summed squared (label - pred)
                differences divided by the number of records in the two
                groups.
            'total_cancel' / 'total_noncancel': size of each group.
            'correct_cancel' / 'correct_noncancel': fraction of each group
                counted as correct.
        Any ratio whose denominator would be zero is computed with a
        denominator of 1 instead, so an empty input yields 0.0 ratios and an
        rmse of 0.0 rather than raising ZeroDivisionError.
    '''
    cancel = labelpred.filter(lambda data: data[1] < threshold)
    nocancel = labelpred.filter(lambda data: data[1] >= threshold)

    def is_correct(data):
        return data[0] == int(data[1] >= threshold)

    corr_cancel = cancel.filter(is_correct).count()
    corr_nocancel = nocancel.filter(is_correct).count()

    # Each count() is a separate pass over the data, so compute the group
    # sizes once and reuse them for the totals and every denominator.
    total_cancel = cancel.count()
    total_nocancel = nocancel.count()
    total = total_cancel + total_nocancel

    totsqe = labelpred.map(
        lambda data: (data[0] - data[1]) * (data[0] - data[1])
    ).sum()

    # max(n, 1) swaps a zero denominator for 1 (counts are never negative),
    # applying the same guard to the RMSE as to the per-group ratios.
    rmse = np.sqrt(totsqe / float(max(total, 1)))
    return {
        'rmse': rmse,
        'total_cancel': total_cancel,
        'correct_cancel': float(corr_cancel) / max(total_cancel, 1),
        'total_noncancel': total_nocancel,
        'correct_noncancel': float(corr_nocancel) / max(total_nocancel, 1),
    }