colab-enterprise/BigQuery-Create-TensorFlow-Model.ipynb (604 lines of code) (raw):

{ "cells": [ { "cell_type": "markdown", "id": "55adf2ae-62d8-43c2-a059-56dfdc3655d4", "metadata": {}, "source": [ "### IMPORTANT NOTE: \n", "- You need to restart the kernel under Kernel | Restart Kernel after installing TensorFlow\n", "- Run the first cell ```!pip install -q tensorflow --user``` and then restart the kernel\n", "- You can then run the entire notebook under Run | Run All Cells\n", " " ] }, { "cell_type": "markdown", "id": "f2583eee-63fb-494c-a7f8-c7c715e8cf72", "metadata": {}, "source": [ "### Use Cases:\n", " - Do you have the need to create custom models in Tensorflow and score your data in BigQuery\n", " - BigQuery has BQML, but this might not cover all of your ML needs\n", " - You can create a custom model and then import the model into BigQuery and use it for scoring\n", "\n", "### Note:\n", " - These models are samples and are not trained for accuracy or precission\n", " \n", "### Description: \n", " - Query data in BigQuery\n", " - Create a Linear Regression model\n", " - Export the model to storage\n", " - Create a Deep Neural Network model\n", " - Export the model to storage\n", " - Use the BigQuery stored procedure \"sp_demo_machine_learning_import_tensorflow\" to import the models and score data\n", "\n", "### Reference:\n", " - https://www.tensorflow.org/tutorials/keras/regression\n", " - https://www.tensorflow.org/hub/exporting_tf2_saved_model\n", " - https://cloud.google.com/bigquery-ml/docs/making-predictions-with-imported-tensorflow-models\n", "\n", "### Clean up / Reset script:\n", " n/a \n" ] }, { "cell_type": "code", "execution_count": null, "id": "93235f14-525e-4b34-bda9-bbb5b2439606", "metadata": {}, "outputs": [], "source": [ "# Remove prior saved models\n", "!gsutil rm -r gs://${bucket_name}/tensorflow" ] }, { "cell_type": "code", "execution_count": null, "id": "4e3fd39d-a018-4e96-b3fd-6e17043c750f", "metadata": {}, "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import pandas as pd\n", "import seaborn as sns\n", "\n", "# Make NumPy printouts easier to read.\n", "np.set_printoptions(precision=3, suppress=True)" ] }, { "cell_type": "code", "execution_count": null, "id": "7f8eafac-5b03-4cd5-a041-33b3d097c6b9", "metadata": {}, "outputs": [], "source": [ "import tensorflow as tf\n", "\n", "from tensorflow import keras\n", "from tensorflow.keras import layers\n", "\n", "print(tf.__version__)" ] }, { "cell_type": "markdown", "id": "db052821-f5ff-4dbe-9554-b097a854414d", "metadata": {}, "source": [ "### Predict the fare amount based upon time (minutes) and distance" ] }, { "cell_type": "code", "execution_count": null, "id": "d68b4d4b-c61f-46d7-a5b4-7586e53b8a6c", "metadata": {}, "outputs": [], "source": [ "%%bigquery df_taxi_trips\n", "SELECT Fare_Amount,\n", " Trip_Distance,\n", " CAST(DATETIME_DIFF(Dropoff_DateTime, Pickup_DateTime, MINUTE) AS FLOAT64) AS Minutes\n", " FROM `${project_id}.taxi_dataset.taxi_trips`\n", " WHERE Pickup_DateTime BETWEEN '2020-01-01' AND '2020-01-31' -- Small dataset\n", " AND DATETIME_DIFF(Dropoff_DateTime, Pickup_DateTime, MINUTE) BETWEEN 5 AND 60 -- Somewhat of a normal time\n", " AND Fare_Amount > 0\n", " AND Trip_Distance > 0\n", "LIMIT 10000; -- not too many so we are not here all day" ] }, { "cell_type": "code", "execution_count": null, "id": "cbfcc099-ed99-4a70-a6a7-1b6fa5b1f956", "metadata": {}, "outputs": [], "source": [ "df_taxi_trips.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "d4d5d83f-4505-42d9-99e5-a0697cc3c6e4", "metadata": {}, "outputs": [], "source": [ "train_dataset = df_taxi_trips.sample(frac=0.8, random_state=0)\n", "test_dataset = df_taxi_trips.drop(train_dataset.index)" ] }, { "cell_type": "code", "execution_count": null, "id": "7f862b32-1035-4831-9b98-405c7934877f", "metadata": {}, "outputs": [], "source": [ "sns.pairplot(train_dataset[['Fare_Amount', 'Trip_Distance', 'Minutes']], diag_kind='kde')" ] }, { "cell_type": "code", "execution_count": null, "id": "b14cedd8-bf44-47c4-953f-1349ddfe27b7", "metadata": {}, "outputs": [], "source": [ "train_dataset.describe().transpose()" ] }, { "cell_type": "code", "execution_count": null, "id": "45728d03-9148-40c2-8580-c31faa754158", "metadata": {}, "outputs": [], "source": [ "train_features = train_dataset.copy()\n", "test_features = test_dataset.copy()\n", "\n", "train_labels = train_features.pop('Fare_Amount')\n", "test_labels = test_features.pop('Fare_Amount')" ] }, { "cell_type": "code", "execution_count": null, "id": "139a5576-bc97-42f4-971f-bae15842bcf1", "metadata": {}, "outputs": [], "source": [ "train_dataset.describe().transpose()[['mean', 'std']]" ] }, { "cell_type": "code", "execution_count": null, "id": "10d70c55-ce64-413e-a4ee-7d1175f9b9ec", "metadata": {}, "outputs": [], "source": [ "normalizer = tf.keras.layers.Normalization(axis=-1)" ] }, { "cell_type": "code", "execution_count": null, "id": "bc780196-937b-42d1-901c-4097de0b560d", "metadata": {}, "outputs": [], "source": [ "normalizer.adapt(np.array(train_features))" ] }, { "cell_type": "code", "execution_count": null, "id": "48bbbbb1", "metadata": {}, "outputs": [], "source": [ "tf.keras.layers.Normalization" ] }, { "cell_type": "code", "execution_count": null, "id": "197aa607-3fcb-40f4-8815-2bdeaa27be45", "metadata": {}, "outputs": [], "source": [ "print(normalizer.mean.numpy())" ] }, { "cell_type": "code", "execution_count": null, "id": "de38f1b5-85b9-41ef-82ae-e549fd12850a", "metadata": {}, "outputs": [], "source": [ "taxi_fare_model = tf.keras.Sequential([\n", " normalizer,\n", " layers.Dense(units=1)\n", "])\n", "\n", "taxi_fare_model.summary()" ] }, { "cell_type": "code", "execution_count": null, "id": "8e7b1cf4-5aac-46f6-906d-cd06c6557b0b", "metadata": {}, "outputs": [], "source": [ "taxi_fare_model.predict(train_features[:10])" ] }, { "cell_type": "code", "execution_count": null, "id": "ad801c10-4d01-4aa2-8353-471c8a404ddd", "metadata": {}, "outputs": [], "source": [ "taxi_fare_model.layers[1].kernel" ] }, { "cell_type": "code", "execution_count": null, "id": "8aca79dc-cf76-4263-b513-e6732afea6d6", "metadata": {}, "outputs": [], "source": [ "taxi_fare_model.compile(\n", " optimizer=tf.optimizers.Adam(learning_rate=0.1),\n", " loss='mean_absolute_error')" ] }, { "cell_type": "code", "execution_count": null, "id": "c94b2871-770d-4dd2-a6eb-b1fb03b14005", "metadata": {}, "outputs": [], "source": [ "%%time\n", "history = taxi_fare_model.fit(\n", " train_features,\n", " train_labels,\n", " epochs=100,\n", " # Suppress logging.\n", " verbose=0,\n", " # Calculate validation results on 20% of the training data.\n", " validation_split = 0.2)" ] }, { "cell_type": "code", "execution_count": null, "id": "a5c2dd17-eb89-4455-ae5e-e9fdde61bf47", "metadata": {}, "outputs": [], "source": [ "hist = pd.DataFrame(history.history)\n", "hist['epoch'] = history.epoch\n", "hist.tail()" ] }, { "cell_type": "code", "execution_count": null, "id": "9e5f1b04-39f2-4f6d-8464-55bdcc7faf9e", "metadata": {}, "outputs": [], "source": [ "def plot_loss(history):\n", " plt.plot(history.history['loss'], label='loss')\n", " plt.plot(history.history['val_loss'], label='val_loss')\n", " plt.ylim([0, 10])\n", " plt.xlabel('Epoch')\n", " plt.ylabel('Error [Fare_Amount]')\n", " plt.legend()\n", " plt.grid(True)" ] }, { "cell_type": "code", "execution_count": null, "id": "4e3a86d3-b98a-41c3-800b-74ccc329ba86", "metadata": {}, "outputs": [], "source": [ "plot_loss(history)" ] }, { "cell_type": "code", "execution_count": null, "id": "5b10a79f-a663-41b1-b780-f5d15d555dec", "metadata": {}, "outputs": [], "source": [ "test_results = {}\n", "\n", "test_results['taxi_fare_model'] = taxi_fare_model.evaluate(\n", " test_features,\n", " test_labels, verbose=0)" ] }, { "cell_type": "code", "execution_count": null, "id": "56038799-16c6-445c-869f-8eb07d503ab5", "metadata": {}, "outputs": [], "source": [ "# BigQuery: Run your model in BigQuery and here to see that they are the same\n", "\n", "# Create a TensorFlow tensor with the correct shape\n", "input_data = tf.constant([[10, 20]], dtype=tf.float32) # Adjust dtype if needed\n", "\n", "prediction = taxi_fare_model.predict(input_data)\n", "print(prediction)" ] }, { "cell_type": "code", "execution_count": null, "id": "f3e48ea6-96e5-4dc7-a719-3162df79bc66", "metadata": {}, "outputs": [], "source": [ "# Generated test data\n", "x = 10 * np.random.random_sample((100, 2)) \n", "y = taxi_fare_model.predict(x,)\n", "#print(x)\n", "#print(y)" ] }, { "cell_type": "code", "execution_count": null, "id": "86a742e0-07fa-4f5e-9210-c053bc9ede2b", "metadata": {}, "outputs": [], "source": [ "from mpl_toolkits import mplot3d" ] }, { "cell_type": "code", "execution_count": null, "id": "85b0a814-fbe8-43b5-b5e8-35dfa2fc9164", "metadata": {}, "outputs": [], "source": [ "%matplotlib inline" ] }, { "cell_type": "code", "execution_count": null, "id": "28a07885-b511-453d-80b4-33e028bf52c9", "metadata": {}, "outputs": [], "source": [ "# Set size, this works in notebooks\n", "plt.rcParams['figure.figsize'] = [12, 12]" ] }, { "cell_type": "code", "execution_count": null, "id": "87015c2d-b014-4555-925b-4e9f22606762", "metadata": {}, "outputs": [], "source": [ "ax = plt.axes(projection='3d')\n", "ax.set_xlabel('Distance')\n", "ax.set_ylabel('Time (minutes)')\n", "ax.set_zlabel('Fare Amount');\n", "ax.set_title('Fare Predictions');\n", "\n", "# Data for three-dimensional scattered points\n", "zdata = y[:,0]\n", "xdata = x[:,0]\n", "ydata = x[:,1]\n", "ax.scatter3D(xdata, ydata, zdata, c=zdata, s=50, cmap='Blues')\n", "\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "id": "927c3f4f-156c-4e69-9218-b1a240721644", "metadata": {}, "outputs": [], "source": [ "taxi_fare_model.save('gs://${bucket_name}/tensorflow/taxi_fare_model/linear_regression')" ] }, { "cell_type": "code", "execution_count": null, "id": "66cffa67-ffae-4c82-b806-10f871826e85", "metadata": {}, "outputs": [], "source": [ "def build_and_compile_model(norm):\n", " model = keras.Sequential([\n", " norm,\n", " layers.Dense(64, activation='relu'),\n", " layers.Dense(64, activation='relu'),\n", " layers.Dense(1)\n", " ])\n", "\n", " model.compile(loss='mean_absolute_error',\n", " optimizer=tf.keras.optimizers.Adam(0.001))\n", " return model" ] }, { "cell_type": "code", "execution_count": null, "id": "87708bb2-0cc1-4ecf-ac1d-1e0bae9d64e9", "metadata": {}, "outputs": [], "source": [ "dnn_model = build_and_compile_model(normalizer)\n", "dnn_model.summary()" ] }, { "cell_type": "code", "execution_count": null, "id": "c8865de8-0358-4f6c-bc41-e9f49484b28b", "metadata": {}, "outputs": [], "source": [ "%%time\n", "history = dnn_model.fit(\n", " train_features,\n", " train_labels,\n", " validation_split=0.2,\n", " verbose=0, epochs=100)" ] }, { "cell_type": "code", "execution_count": null, "id": "f27305a2-81c0-4a04-8841-2f3e5048c762", "metadata": {}, "outputs": [], "source": [ "# Set size, this works in notebooks\n", "plt.rcParams['figure.figsize'] = [6, 6]" ] }, { "cell_type": "code", "execution_count": null, "id": "ef701308-ed90-4bab-a953-c8bda1628c9e", "metadata": {}, "outputs": [], "source": [ "plot_loss(history)" ] }, { "cell_type": "code", "execution_count": null, "id": "b9c68117-8971-4dfa-a3eb-2b7e4218535b", "metadata": {}, "outputs": [], "source": [ "test_results['dnn_model'] = dnn_model.evaluate(test_features, test_labels, verbose=0)" ] }, { "cell_type": "code", "execution_count": null, "id": "7c967b19-581a-4232-a260-9d587c5b69cb", "metadata": {}, "outputs": [], "source": [ "test_predictions = dnn_model.predict(test_features).flatten()\n", "\n", "a = plt.axes(aspect='equal')\n", "plt.scatter(test_labels, test_predictions)\n", "plt.xlabel('True Values [MPG]')\n", "plt.ylabel('Predictions [MPG]')\n", "lims = [0, 50]\n", "plt.xlim(lims)\n", "plt.ylim(lims)\n", "_ = plt.plot(lims, lims)" ] }, { "cell_type": "code", "execution_count": null, "id": "865f03d9-a0fa-42d9-9a1b-7f86a3e89c48", "metadata": {}, "outputs": [], "source": [ "error = test_predictions - test_labels\n", "plt.hist(error, bins=25)\n", "plt.xlabel('Prediction Error [Fare_Amount]')\n", "_ = plt.ylabel('Count')" ] }, { "cell_type": "code", "execution_count": null, "id": "0b481882-254e-426c-95e7-b233f75c05d2", "metadata": {}, "outputs": [], "source": [ "# BigQuery: Run your model in BigQuery and here to see that they are the same\n", "input_data = tf.constant([[10.0,20.0]], dtype=tf.float32) # Adjust dtype if needed\n", "\n", "prediction = taxi_fare_model.predict(input_data)\n", "print(prediction)" ] }, { "cell_type": "code", "execution_count": null, "id": "abfe90fa-9a0e-49e6-88b5-6e115de0a845", "metadata": {}, "outputs": [], "source": [ "input_data = tf.constant([[10.0,20.0], [5,6] ], dtype=tf.float32) # Adjust dtype if needed\n", "\n", "prediction = taxi_fare_model.predict(input_data)\n", "print(prediction)" ] }, { "cell_type": "code", "execution_count": null, "id": "82645f58-d112-40bc-bc23-bf6e8ea00a82", "metadata": {}, "outputs": [], "source": [ "dnn_model.save('gs://${bucket_name}/tensorflow/taxi_fare_model/dnn')" ] } ], "metadata": { "environment": { "kernel": "python3", "name": "managed-notebooks.m87", "type": "gcloud", "uri": "gcr.io/deeplearning-platform/release.spark-cpu:spark-kernel" }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.10" } }, "nbformat": 4, "nbformat_minor": 5 }