cli/jobs/pipelines-with-components/nyc_taxi_data_regression/predict_src/predict.py

import argparse
import os
from pathlib import Path

import pandas as pd
import mlflow
from sklearn.linear_model import LinearRegression  # imported in the original script; not used directly

# Enable MLflow autologging for scikit-learn models.
mlflow.sklearn.autolog()

# Parse the component's input and output paths.
parser = argparse.ArgumentParser("predict")
parser.add_argument("--model_input", type=str, help="Path of input model")
parser.add_argument("--test_data", type=str, help="Path to test data")
parser.add_argument("--predictions", type=str, help="Path of predictions")

args = parser.parse_args()

print("hello scoring world...")

lines = [
    f"Model path: {args.model_input}",
    f"Test data path: {args.test_data}",
    f"Predictions path: {args.predictions}",
]
for line in lines:
    print(line)

# Load the test data from the mounted input folder
print("mounted_path files: ")
arr = os.listdir(args.test_data)
print(arr)

test_data = pd.read_csv(Path(args.test_data) / "test_data.csv")

# Split the test data into the target column (cost) and the feature columns
testy = test_data["cost"]
testX = test_data[
    [
        "distance",
        "dropoff_latitude",
        "dropoff_longitude",
        "passengers",
        "pickup_latitude",
        "pickup_longitude",
        "store_forward",
        "vendor",
        "pickup_weekday",
        "pickup_month",
        "pickup_monthday",
        "pickup_hour",
        "pickup_minute",
        "pickup_second",
        "dropoff_weekday",
        "dropoff_month",
        "dropoff_monthday",
        "dropoff_hour",
        "dropoff_minute",
        "dropoff_second",
    ]
].copy()  # copy so predicted_cost can be added without a SettingWithCopyWarning
print(testX.shape)
print(testX.columns)

# Load the model from the input port
model = mlflow.sklearn.load_model(args.model_input)

# Make predictions on testX data and record them in a column named predicted_cost
predictions = model.predict(testX)
testX["predicted_cost"] = predictions
print(testX.shape)

# Compare predictions to actuals (testy)
output_data = pd.DataFrame(testX)
output_data["actual_cost"] = testy

# Save the output data with feature columns, predicted cost, and actual cost in a csv file
output_data.to_csv(Path(args.predictions) / "predictions.csv")
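
# A hypothetical local invocation of this scoring step, for illustration only
# (the directory names below are placeholders, not paths defined by the pipeline):
#
#   python predict.py \
#       --model_input ./model_output \
#       --test_data ./test_data_dir \
#       --predictions ./predictions_dir
#
# --test_data is expected to contain test_data.csv, and --predictions should be an
# existing directory where predictions.csv will be written.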