# notebooks/official/pipelines/Train_tabular_models_with_many_frameworks_and_import_to_Vertex_AI_using_Pipelines/Train_tabular_regression_model_using_all_frameworks_and_import_to_Vertex_AI/pipeline.py
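# Note: the "*_op" component factories used below are assumed to be loaded
# earlier in the notebook, e.g. via kfp.components.load_component_from_url.
# A minimal sketch (the URL is a placeholder, not the real component location):
#
# import kfp
#
# download_from_gcs_op = kfp.components.load_component_from_url(
#     "https://raw.githubusercontent.com/<org>/<repo>/master/components/storage/download/component.yaml"
# )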
def train_tabular_regression_model_using_all_frameworks_pipeline():
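    """Trains tabular regression models on the Chicago taxi trips dataset with
    TensorFlow, PyTorch, XGBoost, and Scikit-learn, then imports each trained
    model into Vertex AI (with optional endpoint deployment).
    """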
    dataset_gcs_uri = "gs://ml-pipeline-dataset/Chicago_taxi_trips/chicago_taxi_trips_2019-01-01_-_2019-02-01_limit=10000.csv"
    # "trip_total" is excluded because it already includes the tip amount (the label).
    feature_columns = ["trip_seconds", "trip_miles", "pickup_community_area", "dropoff_community_area", "fare", "tolls", "extras"]
    label_column = "tips"
    training_set_fraction = 0.8
    # Deploying the models might incur additional costs over time.
    deploy_model = False
    all_columns = [label_column] + feature_columns
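    # Data preparation: download the CSV, keep only the needed columns, impute
    # missing values, then split the rows into training and testing subsets.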
    dataset = download_from_gcs_op(
        gcs_path=dataset_gcs_uri,
    ).outputs["Data"]
    dataset = select_columns_using_Pandas_on_CSV_data_op(
        table=dataset,
        column_names=all_columns,
    ).outputs["transformed_table"]
    dataset = fill_all_missing_values_using_Pandas_on_CSV_data_op(
        table=dataset,
        replacement_value="0",
        # Optional:
        # column_names=None,  # =[...]
    ).outputs["transformed_table"]
    split_task = split_rows_into_subsets_op(
        table=dataset,
        fraction_1=training_set_fraction,
    )
    training_data = split_task.outputs["split_1"]
    testing_data = split_task.outputs["split_2"]
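    # The four framework branches below (TensorFlow, PyTorch, XGBoost,
    # Scikit-learn) depend only on the shared data-preparation steps, so the
    # pipeline runner is free to execute them in parallel.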
    # TensorFlow
    tensorflow_network = create_fully_connected_tensorflow_network_op(
        input_size=len(feature_columns),
        # Optional:
        hidden_layer_sizes=[10],
        activation_name="elu",
        # output_activation_name=None,
        # output_size=1,
    ).outputs["model"]
    tensorflow_model = train_model_using_Keras_on_CSV_op(
        training_data=training_data,
        model=tensorflow_network,
        label_column_name=label_column,
        # Optional:
        # loss_function_name="mean_squared_error",
        number_of_epochs=10,
        # learning_rate=0.1,
        # optimizer_name="Adadelta",
        # optimizer_parameters={},
        # batch_size=32,
        metric_names=["mean_absolute_error"],
        # random_seed=0,
    ).outputs["trained_model"]
    tensorflow_predictions = predict_with_TensorFlow_model_on_CSV_data_op(
        dataset=testing_data,
        model=tensorflow_model,
        # label_column_name needs to be set when doing prediction on a dataset that has labels
        label_column_name=label_column,
        # Optional:
        # batch_size=1000,
    ).outputs["predictions"]
    tensorflow_vertex_model_name = upload_Tensorflow_model_to_Google_Cloud_Vertex_AI_op(
        model=tensorflow_model,
    ).outputs["model_name"]
    # Deploying the model might incur additional costs over time
    if deploy_model:
        tensorflow_vertex_endpoint_name = deploy_model_to_endpoint_op(
            model_name=tensorflow_vertex_model_name,
        ).outputs["endpoint_name"]
    # PyTorch
    pytorch_network = create_fully_connected_pytorch_network_op(
        input_size=len(feature_columns),
        # Optional:
        hidden_layer_sizes=[10],
        activation_name="elu",
        # output_activation_name=None,
        # output_size=1,
    ).outputs["model"]
    pytorch_model = train_pytorch_model_from_csv_op(
        model=pytorch_network,
        training_data=training_data,
        label_column_name=label_column,
        # Optional:
        # loss_function_name="mse_loss",
        # number_of_epochs=1,
        # learning_rate=0.1,
        # optimizer_name="Adadelta",
        # optimizer_parameters={},
        # batch_size=32,
        # batch_log_interval=100,
        # random_seed=0,
    ).outputs["trained_model"]
    pytorch_model_archive = create_pytorch_model_archive_with_base_handler_op(
        model=pytorch_model,
        # Optional:
        # model_name="model",
        # model_version="1.0",
    ).outputs["Model archive"]
    pytorch_vertex_model_name = upload_PyTorch_model_archive_to_Google_Cloud_Vertex_AI_op(
        model_archive=pytorch_model_archive,
    ).outputs["model_name"]
    # Deploying the model might incur additional costs over time
    if deploy_model:
        pytorch_vertex_endpoint_name = deploy_model_to_endpoint_op(
            model_name=pytorch_vertex_model_name,
        ).outputs["endpoint_name"]
    # XGBoost
    xgboost_model = train_XGBoost_model_on_CSV_op(
        training_data=training_data,
        label_column_name=label_column,
        # Optional:
        # starting_model=None,
        # num_iterations=10,
        # booster_params={},
        # objective="reg:squarederror",
        # booster="gbtree",
        # learning_rate=0.3,
        # min_split_loss=0,
        # max_depth=6,
    ).outputs["model"]
    # Predicting on the testing data
    xgboost_predictions = xgboost_predict_on_CSV_op(
        data=testing_data,
        model=xgboost_model,
        # label_column_name needs to be set when doing prediction on a dataset that has labels
        label_column_name=label_column,
    ).outputs["predictions"]
    xgboost_vertex_model_name = upload_XGBoost_model_to_Google_Cloud_Vertex_AI_op(
        model=xgboost_model,
    ).outputs["model_name"]
    # Deploying the model might incur additional costs over time
    if deploy_model:
        xgboost_vertex_endpoint_name = deploy_model_to_endpoint_op(
            model_name=xgboost_vertex_model_name,
        ).outputs["endpoint_name"]
    # Scikit-learn
    sklearn_model = train_linear_regression_model_using_scikit_learn_from_CSV_op(
        dataset=training_data,
        label_column_name=label_column,
    ).outputs["model"]
    sklearn_vertex_model_name = upload_Scikit_learn_pickle_model_to_Google_Cloud_Vertex_AI_op(
        model=sklearn_model,
    ).outputs["model_name"]
    # Deploying the model might incur additional costs over time
    if deploy_model:
        sklearn_vertex_endpoint_name = deploy_model_to_endpoint_op(
            model_name=sklearn_vertex_model_name,
        ).outputs["endpoint_name"]