cli/jobs/pipelines-with-components/nyc_taxi_data_regression/train_src/train.py (72 lines of code) (raw):
import argparse
from pathlib import Path
from uuid import uuid4
from datetime import datetime
import os
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import mlflow
# Turn on MLflow autologging for scikit-learn before any estimator is fitted.
mlflow.sklearn.autolog()

# Command-line interface of the training step.
# The three path options are required so that a missing one is reported by
# argparse at start-up instead of reaching the file-system calls as None.
parser = argparse.ArgumentParser("train")
parser.add_argument(
    "--training_data", type=str, required=True, help="Path to training data"
)
parser.add_argument(
    "--test_data", type=str, required=True, help="Path to test data"
)
parser.add_argument(
    "--model_output", type=str, required=True, help="Path of output model"
)
# Left optional on purpose: when omitted the value is None, which is handed
# unchanged to train_test_split as test_size (scikit-learn then applies its
# own default split).
parser.add_argument(
    "--test_split_ratio", type=float, help="ratio of train test split"
)
args = parser.parse_args()
# Greet, then echo each parsed command-line argument on its own line so the
# values used by this run are visible in the job output.
print("hello training world...")
print(f"Training data path: {args.training_data}")
print(f"Test data path: {args.test_data}")
print(f"Model output path: {args.model_output}")
print(f"Test split ratio:{args.test_split_ratio}")
# Show what is inside the training-data folder.
# os.listdir returns entries in arbitrary order; sorting makes the choice of
# "first file" below reproducible across runs and platforms.
print("mounted_path files: ")
arr = sorted(os.listdir(args.training_data))
print(arr)

# Fail with an explicit message rather than an IndexError further down.
if not arr:
    raise FileNotFoundError(
        f"No training files found in {args.training_data!r}"
    )

# Parse every entry of the folder as CSV. pandas opens and closes each file
# itself, so no separate file handle is needed.
df_list = []
for filename in arr:
    print(f"reading file: {filename} ...")
    input_df = pd.read_csv(Path(args.training_data) / filename)
    df_list.append(input_df)

# NOTE(review): only the first parsed file is used for training; the other
# frames are read but ignored. Confirm whether they should be concatenated.
train_data = df_list[0]
print(train_data.columns)
# Separate the regression target (y) from the model inputs (X).
y = train_data["cost"]

# The input columns are named explicitly instead of taking "everything
# except cost", so any extra column in the CSV is left out of the features.
feature_columns = [
    "distance",
    "dropoff_latitude",
    "dropoff_longitude",
    "passengers",
    "pickup_latitude",
    "pickup_longitude",
    "store_forward",
    "vendor",
    "pickup_weekday",
    "pickup_month",
    "pickup_monthday",
    "pickup_hour",
    "pickup_minute",
    "pickup_second",
    "dropoff_weekday",
    "dropoff_month",
    "dropoff_monthday",
    "dropoff_hour",
    "dropoff_minute",
    "dropoff_second",
]
X = train_data[feature_columns]
# Hold out part of the rows for testing. The fixed random_state makes the
# shuffle, and therefore the split, repeatable.
trainX, testX, trainy, testy = train_test_split(
    X,
    y,
    random_state=42,
    test_size=args.test_split_ratio,
)
print(trainX.shape)
print(trainX.columns)

# Fit a linear regression on the training portion, then print the score the
# fitted model obtains on that same training portion.
model = LinearRegression()
model.fit(trainX, trainy)
train_score = model.score(trainX, trainy)
print(train_score)
# Persist the fitted model with MLflow's scikit-learn flavor at the output path.
mlflow.sklearn.save_model(model, args.model_output)

# Attach the target to the held-out features. assign() builds a new frame
# rather than writing a column into the object returned by train_test_split,
# which avoids pandas' chained-assignment warning on that object.
testX = testX.assign(cost=testy)
print(testX.shape)

# Make sure the output folder exists, then write the held-out rows as CSV.
# to_csv returns None when given a path, so its result is not kept.
test_data_dir = Path(args.test_data)
test_data_dir.mkdir(parents=True, exist_ok=True)
testX.to_csv(test_data_dir / "test_data.csv")