colab-enterprise/BigQuery-Create-TensorFlow-Model.ipynb (604 lines of code) (raw):
{
"cells": [
{
"cell_type": "markdown",
"id": "55adf2ae-62d8-43c2-a059-56dfdc3655d4",
"metadata": {},
"source": [
"### IMPORTANT NOTE: \n",
"- You need to restart the kernel under Kernel | Restart Kernel after installing TensorFlow\n",
"- Run the first cell ```!pip install -q tensorflow --user``` and then restart the kernel\n",
"- You can then run the entire notebook under Run | Run All Cells\n",
" "
]
},
{
"cell_type": "markdown",
"id": "f2583eee-63fb-494c-a7f8-c7c715e8cf72",
"metadata": {},
"source": [
"### Use Cases:\n",
" - Do you have the need to create custom models in Tensorflow and score your data in BigQuery\n",
" - BigQuery has BQML, but this might not cover all of your ML needs\n",
" - You can create a custom model and then import the model into BigQuery and use it for scoring\n",
"\n",
"### Note:\n",
" - These models are samples and are not trained for accuracy or precission\n",
" \n",
"### Description: \n",
" - Query data in BigQuery\n",
" - Create a Linear Regression model\n",
" - Export the model to storage\n",
" - Create a Deep Neural Network model\n",
" - Export the model to storage\n",
" - Use the BigQuery stored procedure \"sp_demo_machine_learning_import_tensorflow\" to import the models and score data\n",
"\n",
"### Reference:\n",
" - https://www.tensorflow.org/tutorials/keras/regression\n",
" - https://www.tensorflow.org/hub/exporting_tf2_saved_model\n",
" - https://cloud.google.com/bigquery-ml/docs/making-predictions-with-imported-tensorflow-models\n",
"\n",
"### Clean up / Reset script:\n",
" n/a \n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "93235f14-525e-4b34-bda9-bbb5b2439606",
"metadata": {},
"outputs": [],
"source": [
"# Remove prior saved models\n",
"!gsutil rm -r gs://${bucket_name}/tensorflow"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4e3fd39d-a018-4e96-b3fd-6e17043c750f",
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n",
"import seaborn as sns\n",
"\n",
"# Make NumPy printouts easier to read.\n",
"np.set_printoptions(precision=3, suppress=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7f8eafac-5b03-4cd5-a041-33b3d097c6b9",
"metadata": {},
"outputs": [],
"source": [
"import tensorflow as tf\n",
"\n",
"from tensorflow import keras\n",
"from tensorflow.keras import layers\n",
"\n",
"print(tf.__version__)"
]
},
{
"cell_type": "markdown",
"id": "db052821-f5ff-4dbe-9554-b097a854414d",
"metadata": {},
"source": [
"### Predict the fare amount based upon time (minutes) and distance"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d68b4d4b-c61f-46d7-a5b4-7586e53b8a6c",
"metadata": {},
"outputs": [],
"source": [
"%%bigquery df_taxi_trips\n",
"SELECT Fare_Amount,\n",
" Trip_Distance,\n",
" CAST(DATETIME_DIFF(Dropoff_DateTime, Pickup_DateTime, MINUTE) AS FLOAT64) AS Minutes\n",
" FROM `${project_id}.taxi_dataset.taxi_trips`\n",
" WHERE Pickup_DateTime BETWEEN '2020-01-01' AND '2020-01-31' -- Small dataset\n",
" AND DATETIME_DIFF(Dropoff_DateTime, Pickup_DateTime, MINUTE) BETWEEN 5 AND 60 -- Somewhat of a normal time\n",
" AND Fare_Amount > 0\n",
" AND Trip_Distance > 0\n",
"LIMIT 10000; -- not too many so we are not here all day"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cbfcc099-ed99-4a70-a6a7-1b6fa5b1f956",
"metadata": {},
"outputs": [],
"source": [
"df_taxi_trips.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d4d5d83f-4505-42d9-99e5-a0697cc3c6e4",
"metadata": {},
"outputs": [],
"source": [
"train_dataset = df_taxi_trips.sample(frac=0.8, random_state=0)\n",
"test_dataset = df_taxi_trips.drop(train_dataset.index)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7f862b32-1035-4831-9b98-405c7934877f",
"metadata": {},
"outputs": [],
"source": [
"sns.pairplot(train_dataset[['Fare_Amount', 'Trip_Distance', 'Minutes']], diag_kind='kde')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b14cedd8-bf44-47c4-953f-1349ddfe27b7",
"metadata": {},
"outputs": [],
"source": [
"train_dataset.describe().transpose()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "45728d03-9148-40c2-8580-c31faa754158",
"metadata": {},
"outputs": [],
"source": [
"train_features = train_dataset.copy()\n",
"test_features = test_dataset.copy()\n",
"\n",
"train_labels = train_features.pop('Fare_Amount')\n",
"test_labels = test_features.pop('Fare_Amount')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "139a5576-bc97-42f4-971f-bae15842bcf1",
"metadata": {},
"outputs": [],
"source": [
"train_dataset.describe().transpose()[['mean', 'std']]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "10d70c55-ce64-413e-a4ee-7d1175f9b9ec",
"metadata": {},
"outputs": [],
"source": [
"normalizer = tf.keras.layers.Normalization(axis=-1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bc780196-937b-42d1-901c-4097de0b560d",
"metadata": {},
"outputs": [],
"source": [
"normalizer.adapt(np.array(train_features))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "48bbbbb1",
"metadata": {},
"outputs": [],
"source": [
"tf.keras.layers.Normalization"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "197aa607-3fcb-40f4-8815-2bdeaa27be45",
"metadata": {},
"outputs": [],
"source": [
"print(normalizer.mean.numpy())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "de38f1b5-85b9-41ef-82ae-e549fd12850a",
"metadata": {},
"outputs": [],
"source": [
"taxi_fare_model = tf.keras.Sequential([\n",
" normalizer,\n",
" layers.Dense(units=1)\n",
"])\n",
"\n",
"taxi_fare_model.summary()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8e7b1cf4-5aac-46f6-906d-cd06c6557b0b",
"metadata": {},
"outputs": [],
"source": [
"taxi_fare_model.predict(train_features[:10])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ad801c10-4d01-4aa2-8353-471c8a404ddd",
"metadata": {},
"outputs": [],
"source": [
"taxi_fare_model.layers[1].kernel"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8aca79dc-cf76-4263-b513-e6732afea6d6",
"metadata": {},
"outputs": [],
"source": [
"taxi_fare_model.compile(\n",
" optimizer=tf.optimizers.Adam(learning_rate=0.1),\n",
" loss='mean_absolute_error')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c94b2871-770d-4dd2-a6eb-b1fb03b14005",
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"history = taxi_fare_model.fit(\n",
" train_features,\n",
" train_labels,\n",
" epochs=100,\n",
" # Suppress logging.\n",
" verbose=0,\n",
" # Calculate validation results on 20% of the training data.\n",
" validation_split = 0.2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a5c2dd17-eb89-4455-ae5e-e9fdde61bf47",
"metadata": {},
"outputs": [],
"source": [
"hist = pd.DataFrame(history.history)\n",
"hist['epoch'] = history.epoch\n",
"hist.tail()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9e5f1b04-39f2-4f6d-8464-55bdcc7faf9e",
"metadata": {},
"outputs": [],
"source": [
"def plot_loss(history):\n",
" plt.plot(history.history['loss'], label='loss')\n",
" plt.plot(history.history['val_loss'], label='val_loss')\n",
" plt.ylim([0, 10])\n",
" plt.xlabel('Epoch')\n",
" plt.ylabel('Error [Fare_Amount]')\n",
" plt.legend()\n",
" plt.grid(True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4e3a86d3-b98a-41c3-800b-74ccc329ba86",
"metadata": {},
"outputs": [],
"source": [
"plot_loss(history)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5b10a79f-a663-41b1-b780-f5d15d555dec",
"metadata": {},
"outputs": [],
"source": [
"test_results = {}\n",
"\n",
"test_results['taxi_fare_model'] = taxi_fare_model.evaluate(\n",
" test_features,\n",
" test_labels, verbose=0)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "56038799-16c6-445c-869f-8eb07d503ab5",
"metadata": {},
"outputs": [],
"source": [
"# BigQuery: Run your model in BigQuery and here to see that they are the same\n",
"\n",
"# Create a TensorFlow tensor with the correct shape\n",
"input_data = tf.constant([[10, 20]], dtype=tf.float32) # Adjust dtype if needed\n",
"\n",
"prediction = taxi_fare_model.predict(input_data)\n",
"print(prediction)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f3e48ea6-96e5-4dc7-a719-3162df79bc66",
"metadata": {},
"outputs": [],
"source": [
"# Generated test data\n",
"x = 10 * np.random.random_sample((100, 2)) \n",
"y = taxi_fare_model.predict(x,)\n",
"#print(x)\n",
"#print(y)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "86a742e0-07fa-4f5e-9210-c053bc9ede2b",
"metadata": {},
"outputs": [],
"source": [
"from mpl_toolkits import mplot3d"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "85b0a814-fbe8-43b5-b5e8-35dfa2fc9164",
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "28a07885-b511-453d-80b4-33e028bf52c9",
"metadata": {},
"outputs": [],
"source": [
"# Set size, this works in notebooks\n",
"plt.rcParams['figure.figsize'] = [12, 12]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "87015c2d-b014-4555-925b-4e9f22606762",
"metadata": {},
"outputs": [],
"source": [
"ax = plt.axes(projection='3d')\n",
"ax.set_xlabel('Distance')\n",
"ax.set_ylabel('Time (minutes)')\n",
"ax.set_zlabel('Fare Amount');\n",
"ax.set_title('Fare Predictions');\n",
"\n",
"# Data for three-dimensional scattered points\n",
"zdata = y[:,0]\n",
"xdata = x[:,0]\n",
"ydata = x[:,1]\n",
"ax.scatter3D(xdata, ydata, zdata, c=zdata, s=50, cmap='Blues')\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "927c3f4f-156c-4e69-9218-b1a240721644",
"metadata": {},
"outputs": [],
"source": [
"taxi_fare_model.save('gs://${bucket_name}/tensorflow/taxi_fare_model/linear_regression')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "66cffa67-ffae-4c82-b806-10f871826e85",
"metadata": {},
"outputs": [],
"source": [
"def build_and_compile_model(norm):\n",
" model = keras.Sequential([\n",
" norm,\n",
" layers.Dense(64, activation='relu'),\n",
" layers.Dense(64, activation='relu'),\n",
" layers.Dense(1)\n",
" ])\n",
"\n",
" model.compile(loss='mean_absolute_error',\n",
" optimizer=tf.keras.optimizers.Adam(0.001))\n",
" return model"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "87708bb2-0cc1-4ecf-ac1d-1e0bae9d64e9",
"metadata": {},
"outputs": [],
"source": [
"dnn_model = build_and_compile_model(normalizer)\n",
"dnn_model.summary()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c8865de8-0358-4f6c-bc41-e9f49484b28b",
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"history = dnn_model.fit(\n",
" train_features,\n",
" train_labels,\n",
" validation_split=0.2,\n",
" verbose=0, epochs=100)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f27305a2-81c0-4a04-8841-2f3e5048c762",
"metadata": {},
"outputs": [],
"source": [
"# Set size, this works in notebooks\n",
"plt.rcParams['figure.figsize'] = [6, 6]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ef701308-ed90-4bab-a953-c8bda1628c9e",
"metadata": {},
"outputs": [],
"source": [
"plot_loss(history)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b9c68117-8971-4dfa-a3eb-2b7e4218535b",
"metadata": {},
"outputs": [],
"source": [
"test_results['dnn_model'] = dnn_model.evaluate(test_features, test_labels, verbose=0)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7c967b19-581a-4232-a260-9d587c5b69cb",
"metadata": {},
"outputs": [],
"source": [
"test_predictions = dnn_model.predict(test_features).flatten()\n",
"\n",
"a = plt.axes(aspect='equal')\n",
"plt.scatter(test_labels, test_predictions)\n",
"plt.xlabel('True Values [MPG]')\n",
"plt.ylabel('Predictions [MPG]')\n",
"lims = [0, 50]\n",
"plt.xlim(lims)\n",
"plt.ylim(lims)\n",
"_ = plt.plot(lims, lims)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "865f03d9-a0fa-42d9-9a1b-7f86a3e89c48",
"metadata": {},
"outputs": [],
"source": [
"error = test_predictions - test_labels\n",
"plt.hist(error, bins=25)\n",
"plt.xlabel('Prediction Error [Fare_Amount]')\n",
"_ = plt.ylabel('Count')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0b481882-254e-426c-95e7-b233f75c05d2",
"metadata": {},
"outputs": [],
"source": [
"# BigQuery: Run your model in BigQuery and here to see that they are the same\n",
"input_data = tf.constant([[10.0,20.0]], dtype=tf.float32) # Adjust dtype if needed\n",
"\n",
"prediction = taxi_fare_model.predict(input_data)\n",
"print(prediction)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "abfe90fa-9a0e-49e6-88b5-6e115de0a845",
"metadata": {},
"outputs": [],
"source": [
"input_data = tf.constant([[10.0,20.0], [5,6] ], dtype=tf.float32) # Adjust dtype if needed\n",
"\n",
"prediction = taxi_fare_model.predict(input_data)\n",
"print(prediction)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "82645f58-d112-40bc-bc23-bf6e8ea00a82",
"metadata": {},
"outputs": [],
"source": [
"dnn_model.save('gs://${bucket_name}/tensorflow/taxi_fare_model/dnn')"
]
}
],
"metadata": {
"environment": {
"kernel": "python3",
"name": "managed-notebooks.m87",
"type": "gcloud",
"uri": "gcr.io/deeplearning-platform/release.spark-cpu:spark-kernel"
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}