colab-enterprise/BigQuery-Demo-Notebook.ipynb (336 lines of code) (raw):

{ "cells": [ { "cell_type": "markdown", "id": "0d103b57-5ebe-4d1c-a9c6-336883236262", "metadata": {}, "source": [ "### Use Cases:\n", " - Data Scientists need access to data in your data warehouse. BigQuery makes this extremely easy.\n", "\n", "### Description: \n", " - BigQuery provides a magic keyword ```%%bigquery``` used for executing queries to view the results or to populate a dataframe.\n", " - You can easily save dataframes back to BigQuery.\n", "\n", "### Reference:\n", " - https://cloud.google.com/bigquery/docs/visualize-jupyter\n", "\n", "### Clean up / Reset script:\n", " n/a \n" ] }, { "cell_type": "code", "execution_count": null, "id": "3c71dd8f-215b-4113-a28c-29eef0c4c304", "metadata": {}, "outputs": [], "source": [ "from google.cloud import bigquery\n", "import pandas\n", "client = bigquery.Client()" ] }, { "cell_type": "markdown", "id": "8659aecf-1843-48aa-9829-9bfc3ed82b24", "metadata": {}, "source": [ "### Write any SQL and execute in BigQuery" ] }, { "cell_type": "code", "execution_count": null, "id": "fcb657be-7f49-4bab-9ef5-8a0628b0191b", "metadata": {}, "outputs": [], "source": [ "%%bigquery\n", "WITH MonthlyData AS\n", "(\n", "SELECT FORMAT_DATE(\"%B\", taxi_trips.Pickup_DateTime) AS MonthName,\n", " FORMAT_DATE(\"%m\", taxi_trips.Pickup_DateTime) AS MonthNumber,\n", " CASE WHEN taxi_trips.Payment_Type_Id = 1 THEN 'Credit'\n", " WHEN taxi_trips.Payment_Type_Id = 2 THEN 'Cash'\n", " WHEN taxi_trips.Payment_Type_Id = 3 THEN 'NoCharge'\n", " WHEN taxi_trips.Payment_Type_Id = 4 THEN 'Dispute'\n", " END AS PaymentDescription,\n", " SUM(taxi_trips.Total_Amount) AS Total_Amount\n", " FROM `taxi_dataset.taxi_trips` AS taxi_trips\n", " WHERE taxi_trips.Pickup_DateTime BETWEEN '2020-01-01' AND '2020-12-31' \n", " AND Passenger_Count IS NOT NULL\n", " AND Payment_Type_Id IN (1,2,3,4)\n", " GROUP BY 1, 2, 3 \n", ")\n", "SELECT MonthName,\n", " FORMAT(\"%'d\", CAST(Credit AS INTEGER)) AS Credit,\n", " FORMAT(\"%'d\", CAST(Cash AS INTEGER)) AS Cash,\n", " FORMAT(\"%'d\", CAST(NoCharge AS INTEGER)) AS NoCharge,\n", " FORMAT(\"%'d\", CAST(Dispute AS INTEGER)) AS Dispute\n", " FROM MonthlyData\n", " PIVOT(SUM(Total_Amount) FOR PaymentDescription IN ('Credit', 'Cash', 'NoCharge', 'Dispute'))\n", "ORDER BY MonthNumber;" ] }, { "cell_type": "markdown", "id": "576d7699-8915-4cfd-a9fa-97f68f5bfd73", "metadata": {}, "source": [ "### Place your SQL Results directly into a Dataframe" ] }, { "cell_type": "code", "execution_count": null, "id": "5ab26870-b67f-49f8-bf4b-fc18bfbd96e6", "metadata": {}, "outputs": [], "source": [ "%%bigquery myDataframe\n", "WITH MonthlyData AS\n", "(\n", "SELECT FORMAT_DATE(\"%B\", taxi_trips.Pickup_DateTime) AS MonthName,\n", " FORMAT_DATE(\"%m\", taxi_trips.Pickup_DateTime) AS MonthNumber,\n", " CASE WHEN taxi_trips.Payment_Type_Id = 1 THEN 'Credit'\n", " WHEN taxi_trips.Payment_Type_Id = 2 THEN 'Cash'\n", " WHEN taxi_trips.Payment_Type_Id = 3 THEN 'NoCharge'\n", " WHEN taxi_trips.Payment_Type_Id = 4 THEN 'Dispute'\n", " END AS PaymentDescription,\n", " SUM(taxi_trips.Total_Amount) AS Total_Amount\n", " FROM `taxi_dataset.taxi_trips` AS taxi_trips\n", " WHERE taxi_trips.Pickup_DateTime BETWEEN '2020-01-01' AND '2020-12-31' \n", " AND Passenger_Count IS NOT NULL\n", " AND Payment_Type_Id IN (1,2,3,4)\n", " GROUP BY 1, 2, 3 \n", ")\n", "SELECT MonthName,\n", " CAST(Credit AS INTEGER) AS Credit,\n", " CAST(Cash AS INTEGER) AS Cash,\n", " CAST(NoCharge AS INTEGER) AS NoCharge,\n", " CAST(Dispute AS INTEGER) AS Dispute\n", " FROM MonthlyData\n", " PIVOT(SUM(Total_Amount) FOR PaymentDescription IN ('Credit', 'Cash', 'NoCharge', 'Dispute'))\n", "ORDER BY MonthNumber;" ] }, { "cell_type": "code", "execution_count": null, "id": "26ce7922-629f-4874-9eed-50cbefe2c517", "metadata": {}, "outputs": [], "source": [ "# Show the results\n", "display(myDataframe)" ] }, { "cell_type": "code", "execution_count": null, "id": "9fe6f08c-6bfe-4536-b733-5bdd9589cb00", "metadata": {}, "outputs": [], "source": [ "# Loop through the dataframe\n", "# You can now use BigQuery data just like any other datasource\n", "for index, row in myDataframe.iterrows():\n", " print((\"MonthName: {MonthName} | Credit: {Credit} | Cash {Cash}\").format(MonthName=row['MonthName'].ljust(10),Credit=str(row['Credit']).ljust(10),Cash=str(row['Cash']).ljust(10)))" ] }, { "cell_type": "markdown", "id": "c176607b-6aea-4ace-8e33-9399cecb4059", "metadata": {}, "source": [ "### Easy to query BigQuery with Adhoc SQL and place the results into a Dataframe" ] }, { "cell_type": "code", "execution_count": null, "id": "ba777f4d-d09a-4da2-b934-7f8ef5c8d48f", "metadata": {}, "outputs": [], "source": [ "sql=\"SELECT Payment_Type_Id, Payment_Type_Description FROM `taxi_dataset.payment_type`\";\n", "\n", "df_payment_type = client.query(sql).to_dataframe()\n", "\n", "display(df_payment_type);\n" ] }, { "cell_type": "markdown", "id": "80fdb550-de0e-4de3-8fcf-16b886bfc53e", "metadata": {}, "source": [ "### Load data in your notebook into a BigQuery table" ] }, { "cell_type": "code", "execution_count": null, "id": "39da97b2-09af-4b9b-ae4d-5ab6dc6eae9c", "metadata": {}, "outputs": [], "source": [ "# Load a table in BQ with results from your notebook\n", "\n", "table_id = \"taxi_dataset.notebook_monthly_data\"\n", "\n", "dataframe = pandas.DataFrame(\n", " myDataframe, # Your source data\n", " columns=[\n", " \"MonthName\",\n", " \"Credit\",\n", " \"Cash\",\n", " \"NoCharge\",\n", " \"Dispute\"\n", " ],\n", ")\n", "\n", "job_config = bigquery.LoadJobConfig(\n", " schema=[\n", " bigquery.SchemaField(\"MonthName\", bigquery.enums.SqlTypeNames.STRING),\n", " bigquery.SchemaField(\"Credit\", bigquery.enums.SqlTypeNames.FLOAT64),\n", " bigquery.SchemaField(\"Cash\", bigquery.enums.SqlTypeNames.FLOAT64),\n", " bigquery.SchemaField(\"NoCharge\", bigquery.enums.SqlTypeNames.FLOAT64),\n", " bigquery.SchemaField(\"Dispute\", bigquery.enums.SqlTypeNames.FLOAT64)\n", " ],\n", " write_disposition=\"WRITE_TRUNCATE\",\n", ")\n", "\n", "job = client.load_table_from_dataframe(dataframe, table_id, job_config=job_config) \n", "job.result() # Wait for the job to complete.\n", "\n", "table = client.get_table(table_id) # Make an API request.\n", "print(\"Loaded {} rows and {} columns to {}\".format(table.num_rows, len(table.schema), table_id))" ] }, { "cell_type": "code", "execution_count": null, "id": "b5590c59-49ea-4bcc-a531-008201842b37", "metadata": {}, "outputs": [], "source": [ "%%bigquery\n", "SELECT * FROM `taxi_dataset.notebook_monthly_data`;" ] }, { "cell_type": "code", "execution_count": null, "id": "4469b111-ba52-4f15-9f97-91c774534d22", "metadata": {}, "outputs": [], "source": [ "%%bigquery\n", "DROP TABLE `taxi_dataset.notebook_monthly_data`;" ] }, { "cell_type": "markdown", "id": "55fa993d-36d1-413a-b34b-58a66958d672", "metadata": {}, "source": [ "### Show BigQuery results in a chart" ] }, { "cell_type": "code", "execution_count": null, "id": "6ebb7386-6e27-465e-a6b6-f8f5dacf3392", "metadata": {}, "outputs": [], "source": [ "%matplotlib inline" ] }, { "cell_type": "code", "execution_count": null, "id": "776d7da0-7199-49ca-bfd0-bf717bf119ea", "metadata": {}, "outputs": [], "source": [ "%%bigquery myDataframe\n", "WITH MonthlyData AS\n", "(\n", "SELECT FORMAT_DATE(\"%B\", taxi_trips.Pickup_DateTime) AS MonthName,\n", " FORMAT_DATE(\"%m\", taxi_trips.Pickup_DateTime) AS MonthNumber,\n", " CASE WHEN taxi_trips.Payment_Type_Id = 1 THEN 'Credit'\n", " WHEN taxi_trips.Payment_Type_Id = 2 THEN 'Cash'\n", " ELSE 'Other'\n", " END AS PaymentDescription,\n", " SUM(taxi_trips.Total_Amount) AS Total_Amount\n", " FROM `taxi_dataset.taxi_trips` AS taxi_trips\n", " WHERE taxi_trips.Pickup_DateTime BETWEEN '2020-01-01' AND '2020-12-31' \n", " AND Passenger_Count IS NOT NULL\n", " AND Payment_Type_Id IN (1,2,3,4)\n", " GROUP BY 1, 2, 3 \n", ")\n", "SELECT MonthName,\n", " CAST(Credit AS INTEGER) / 1000 AS Credit,\n", " CAST(Cash AS INTEGER) / 1000 AS Cash,\n", " CAST(Other AS INTEGER) / 1000 AS Other\n", " FROM MonthlyData\n", " PIVOT(SUM(Total_Amount) FOR PaymentDescription IN ('Credit', 'Cash', 'Other'))\n", "ORDER BY MonthNumber;" ] }, { "cell_type": "code", "execution_count": null, "id": "606af83d-1d33-468e-9e3f-1a3b080970c1", "metadata": {}, "outputs": [], "source": [ "display(myDataframe)" ] }, { "cell_type": "code", "execution_count": null, "id": "2abf6a3a-be6c-4932-b9ed-6561c5a9bdf3", "metadata": {}, "outputs": [], "source": [ "ax = myDataframe.plot(kind=\"bar\", x=\"MonthName\", y=[\"Cash\",\"Credit\",\"Other\"], figsize=(15, 7))\n", "ax.set_title(\"Monthly Amounts by Payment Type\")\n", "ax.set_xlabel(\"Month\")\n", "ax.set_ylabel(\"Amount (thousands)\")\n", "ax.set_ylim([0, 100000])" ] } ], "metadata": { "environment": { "kernel": "python3", "name": "managed-notebooks.m87", "type": "gcloud", "uri": "gcr.io/deeplearning-platform/release.spark-cpu:spark-kernel" }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.10" } }, "nbformat": 4, "nbformat_minor": 5 }