def main()

in cost-based-ml/cost_based_ml.py [0:0]


def main():
	"""Locate batch prediction output, pick the lowest-cost threshold and plot it.

	Steps, in order:

	1. Parse the command-line options and collect the four per-outcome costs
	   (true/false negative/positive) into a dict keyed 'tn', 'tp', 'fn', 'fp'.
	2. Resolve the S3 bucket/key of the prediction data: either by running a
	   new batch prediction, or by locating the output of an existing one,
	   depending on what batch_predictions_already_evaluated() reports.
	3. Read the predictions, sort them by their 'score' field, and search for
	   the threshold with the lowest cost.
	4. Plot the class histograms and the cost-per-threshold curve, then block
	   in plt.show().

	Progress messages go to stderr; the final result line goes to stdout.

	Returns:
		The threshold_costs value produced by find_optimal_threshold().
	"""
	parser, options = parse_options()
	output_uri_s3 = options.output_uri_s3
	costs = {
		'tn': options.true_neg,
		'tp': options.true_pos,
		'fn': options.false_neg,
		'fp': options.false_pos,
	}

	# sys.stderr.write()/print(single_arg) are used instead of the Python 2
	# print statement so this function parses on both Python 2 and Python 3.
	# The extra "\n" reproduces the newline the print statement appended, so
	# every message is still followed by a blank line.
	if not batch_predictions_already_evaluated(parser, options):
		ml_model_id = options.ml_model_id
		test_datasource_id = options.test_datasource_id
		sys.stderr.write("Generating batch predictions with model {} and datasource {} => {}\n".format(ml_model_id, test_datasource_id, output_uri_s3) + "\n")
		sys.stderr.write("This may take a few minutes, please, wait ...\n" + "\n")
		bucket, key = complete_batch_prediction(ml_model_id, test_datasource_id, output_uri_s3)
	else:
		batch_prediction_id = options.batch_prediction_id
		bucket, key = batch_prediction_data_bucket_key(output_uri_s3, batch_prediction_id)

	sys.stderr.write("Reading prediction data from s3://{}/{}\n".format(bucket, key) + "\n")

	test_predictions = read_test_predictions(bucket, key)
	test_predictions = np.sort(test_predictions, order='score')

	# Each record is unpacked as two fields and re-emitted with the fields
	# swapped and the first one cast to int.
	# NOTE(review): presumably records are (true label, score), giving
	# (score, label) pairs here -- confirm against read_test_predictions().
	score_n_true_label = np.array([(e2, int(e1)) for e1, e2 in test_predictions])

	plot_class_histograms(score_n_true_label)

	best_threshold, lowest_cost, threshold_costs = find_optimal_threshold(score_n_true_label, costs)
	print("best_threshold = {}, lowest cost = {}\n".format(best_threshold, lowest_cost))
	plot_threshold_costs(threshold_costs, best_threshold, lowest_cost)

	plt.show()
	return threshold_costs