in cost-based-ml/cost_based_ml.py [0:0]
def main():
    """Run the cost-based threshold analysis from the command line.

    Steps, in order:

    1. Parse the command-line options and collect the four outcome costs
       into a dict keyed by 'tn', 'tp', 'fn' and 'fp'.
    2. Resolve the S3 bucket/key of the batch prediction output: when
       ``batch_predictions_already_evaluated`` reports an existing batch
       prediction, derive the location from its id; otherwise run a new
       batch prediction with the given model and test datasource.
    3. Read the predictions, sort them by their 'score' field and build a
       two-column array of (second field, int(first field)) per record.
       NOTE(review): presumably the first field is the true label and the
       second is the score -- confirm against ``read_test_predictions``.
    4. Plot the class histograms, search for the threshold with the lowest
       cost, print it to stdout, plot the cost curve and show the figures.

    Progress messages are written to stderr.

    Returns:
        The ``threshold_costs`` value produced by ``find_optimal_threshold``.
    """
    parser, options = parse_options()
    output_uri_s3 = options.output_uri_s3
    costs = {
        'tn': options.true_neg,
        'tp': options.true_pos,
        'fn': options.false_neg,
        'fp': options.false_pos,
    }

    # Every progress message below gets an extra "\n" appended: that is the
    # line terminator the print statement would have added after the text.
    if batch_predictions_already_evaluated(parser, options):
        # Reuse the output of an existing batch prediction.
        batch_prediction_id = options.batch_prediction_id
        bucket, key = batch_prediction_data_bucket_key(
            output_uri_s3, batch_prediction_id)
    else:
        # No prior results: run a fresh batch prediction first.
        ml_model_id = options.ml_model_id
        test_datasource_id = options.test_datasource_id
        generating_msg = "Generating batch predictions with model {} and datasource {} => {}\n".format(
            ml_model_id, test_datasource_id, output_uri_s3)
        sys.stderr.write(generating_msg + "\n")
        sys.stderr.write("This may take a few minutes, please, wait ...\n" + "\n")
        bucket, key = complete_batch_prediction(
            ml_model_id, test_datasource_id, output_uri_s3)

    reading_msg = "Reading prediction data from s3://{}/{}\n".format(bucket, key)
    sys.stderr.write(reading_msg + "\n")

    raw_predictions = read_test_predictions(bucket, key)
    ranked_predictions = np.sort(raw_predictions, order='score')

    # Swap the two fields of every record and force the first one to int.
    swapped_records = [
        (second, int(first)) for first, second in ranked_predictions
    ]
    score_n_true_label = np.array(swapped_records)

    plot_class_histograms(score_n_true_label)

    search_result = find_optimal_threshold(score_n_true_label, costs)
    best_threshold, lowest_cost, threshold_costs = search_result
    summary = "best_threshold = {}, lowest cost = {}\n".format(
        best_threshold, lowest_cost)
    print(summary)

    plot_threshold_costs(threshold_costs, best_threshold, lowest_cost)
    plt.show()
    return threshold_costs