colab-enterprise/BigQuery-Demo-Notebook.ipynb (336 lines of code) (raw):
{
"cells": [
{
"cell_type": "markdown",
"id": "0d103b57-5ebe-4d1c-a9c6-336883236262",
"metadata": {},
"source": [
"### Use Cases:\n",
" - Data Scientists need access to data in your data warehouse. BigQuery makes this extremely easy.\n",
"\n",
"### Description: \n",
" - BigQuery provides a magic keyword ```%%bigquery``` used for executing queries to view the results or to populate a dataframe.\n",
" - You can easily save dataframes back to BigQuery.\n",
"\n",
"### Reference:\n",
" - https://cloud.google.com/bigquery/docs/visualize-jupyter\n",
"\n",
"### Clean up / Reset script:\n",
" n/a \n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3c71dd8f-215b-4113-a28c-29eef0c4c304",
"metadata": {},
"outputs": [],
"source": [
"from google.cloud import bigquery\n",
"import pandas\n",
"client = bigquery.Client()"
]
},
{
"cell_type": "markdown",
"id": "8659aecf-1843-48aa-9829-9bfc3ed82b24",
"metadata": {},
"source": [
"### Write any SQL and execute in BigQuery"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fcb657be-7f49-4bab-9ef5-8a0628b0191b",
"metadata": {},
"outputs": [],
"source": [
"%%bigquery\n",
"WITH MonthlyData AS\n",
"(\n",
"SELECT FORMAT_DATE(\"%B\", taxi_trips.Pickup_DateTime) AS MonthName,\n",
" FORMAT_DATE(\"%m\", taxi_trips.Pickup_DateTime) AS MonthNumber,\n",
" CASE WHEN taxi_trips.Payment_Type_Id = 1 THEN 'Credit'\n",
" WHEN taxi_trips.Payment_Type_Id = 2 THEN 'Cash'\n",
" WHEN taxi_trips.Payment_Type_Id = 3 THEN 'NoCharge'\n",
" WHEN taxi_trips.Payment_Type_Id = 4 THEN 'Dispute'\n",
" END AS PaymentDescription,\n",
" SUM(taxi_trips.Total_Amount) AS Total_Amount\n",
" FROM `taxi_dataset.taxi_trips` AS taxi_trips\n",
" WHERE taxi_trips.Pickup_DateTime BETWEEN '2020-01-01' AND '2020-12-31' \n",
" AND Passenger_Count IS NOT NULL\n",
" AND Payment_Type_Id IN (1,2,3,4)\n",
" GROUP BY 1, 2, 3 \n",
")\n",
"SELECT MonthName,\n",
" FORMAT(\"%'d\", CAST(Credit AS INTEGER)) AS Credit,\n",
" FORMAT(\"%'d\", CAST(Cash AS INTEGER)) AS Cash,\n",
" FORMAT(\"%'d\", CAST(NoCharge AS INTEGER)) AS NoCharge,\n",
" FORMAT(\"%'d\", CAST(Dispute AS INTEGER)) AS Dispute\n",
" FROM MonthlyData\n",
" PIVOT(SUM(Total_Amount) FOR PaymentDescription IN ('Credit', 'Cash', 'NoCharge', 'Dispute'))\n",
"ORDER BY MonthNumber;"
]
},
{
"cell_type": "markdown",
"id": "576d7699-8915-4cfd-a9fa-97f68f5bfd73",
"metadata": {},
"source": [
"### Place your SQL Results directly into a Dataframe"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5ab26870-b67f-49f8-bf4b-fc18bfbd96e6",
"metadata": {},
"outputs": [],
"source": [
"%%bigquery myDataframe\n",
"WITH MonthlyData AS\n",
"(\n",
"SELECT FORMAT_DATE(\"%B\", taxi_trips.Pickup_DateTime) AS MonthName,\n",
" FORMAT_DATE(\"%m\", taxi_trips.Pickup_DateTime) AS MonthNumber,\n",
" CASE WHEN taxi_trips.Payment_Type_Id = 1 THEN 'Credit'\n",
" WHEN taxi_trips.Payment_Type_Id = 2 THEN 'Cash'\n",
" WHEN taxi_trips.Payment_Type_Id = 3 THEN 'NoCharge'\n",
" WHEN taxi_trips.Payment_Type_Id = 4 THEN 'Dispute'\n",
" END AS PaymentDescription,\n",
" SUM(taxi_trips.Total_Amount) AS Total_Amount\n",
" FROM `taxi_dataset.taxi_trips` AS taxi_trips\n",
" WHERE taxi_trips.Pickup_DateTime BETWEEN '2020-01-01' AND '2020-12-31' \n",
" AND Passenger_Count IS NOT NULL\n",
" AND Payment_Type_Id IN (1,2,3,4)\n",
" GROUP BY 1, 2, 3 \n",
")\n",
"SELECT MonthName,\n",
" CAST(Credit AS INTEGER) AS Credit,\n",
" CAST(Cash AS INTEGER) AS Cash,\n",
" CAST(NoCharge AS INTEGER) AS NoCharge,\n",
" CAST(Dispute AS INTEGER) AS Dispute\n",
" FROM MonthlyData\n",
" PIVOT(SUM(Total_Amount) FOR PaymentDescription IN ('Credit', 'Cash', 'NoCharge', 'Dispute'))\n",
"ORDER BY MonthNumber;"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "26ce7922-629f-4874-9eed-50cbefe2c517",
"metadata": {},
"outputs": [],
"source": [
"# Show the results\n",
"display(myDataframe)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9fe6f08c-6bfe-4536-b733-5bdd9589cb00",
"metadata": {},
"outputs": [],
"source": [
"# Loop through the dataframe\n",
"# You can now use BigQuery data just like any other datasource\n",
"for index, row in myDataframe.iterrows():\n",
" print((\"MonthName: {MonthName} | Credit: {Credit} | Cash {Cash}\").format(MonthName=row['MonthName'].ljust(10),Credit=str(row['Credit']).ljust(10),Cash=str(row['Cash']).ljust(10)))"
]
},
{
"cell_type": "markdown",
"id": "c176607b-6aea-4ace-8e33-9399cecb4059",
"metadata": {},
"source": [
"### Easy to query BigQuery with Adhoc SQL and place the results into a Dataframe"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ba777f4d-d09a-4da2-b934-7f8ef5c8d48f",
"metadata": {},
"outputs": [],
"source": [
"sql=\"SELECT Payment_Type_Id, Payment_Type_Description FROM `taxi_dataset.payment_type`\";\n",
"\n",
"df_payment_type = client.query(sql).to_dataframe()\n",
"\n",
"display(df_payment_type);\n"
]
},
{
"cell_type": "markdown",
"id": "80fdb550-de0e-4de3-8fcf-16b886bfc53e",
"metadata": {},
"source": [
"### Load data in your notebook into a BigQuery table"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "39da97b2-09af-4b9b-ae4d-5ab6dc6eae9c",
"metadata": {},
"outputs": [],
"source": [
"# Load a table in BQ with results from your notebook\n",
"\n",
"table_id = \"taxi_dataset.notebook_monthly_data\"\n",
"\n",
"dataframe = pandas.DataFrame(\n",
" myDataframe, # Your source data\n",
" columns=[\n",
" \"MonthName\",\n",
" \"Credit\",\n",
" \"Cash\",\n",
" \"NoCharge\",\n",
" \"Dispute\"\n",
" ],\n",
")\n",
"\n",
"job_config = bigquery.LoadJobConfig(\n",
" schema=[\n",
" bigquery.SchemaField(\"MonthName\", bigquery.enums.SqlTypeNames.STRING),\n",
" bigquery.SchemaField(\"Credit\", bigquery.enums.SqlTypeNames.FLOAT64),\n",
" bigquery.SchemaField(\"Cash\", bigquery.enums.SqlTypeNames.FLOAT64),\n",
" bigquery.SchemaField(\"NoCharge\", bigquery.enums.SqlTypeNames.FLOAT64),\n",
" bigquery.SchemaField(\"Dispute\", bigquery.enums.SqlTypeNames.FLOAT64)\n",
" ],\n",
" write_disposition=\"WRITE_TRUNCATE\",\n",
")\n",
"\n",
"job = client.load_table_from_dataframe(dataframe, table_id, job_config=job_config) \n",
"job.result() # Wait for the job to complete.\n",
"\n",
"table = client.get_table(table_id) # Make an API request.\n",
"print(\"Loaded {} rows and {} columns to {}\".format(table.num_rows, len(table.schema), table_id))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b5590c59-49ea-4bcc-a531-008201842b37",
"metadata": {},
"outputs": [],
"source": [
"%%bigquery\n",
"SELECT * FROM `taxi_dataset.notebook_monthly_data`;"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4469b111-ba52-4f15-9f97-91c774534d22",
"metadata": {},
"outputs": [],
"source": [
"%%bigquery\n",
"DROP TABLE `taxi_dataset.notebook_monthly_data`;"
]
},
{
"cell_type": "markdown",
"id": "55fa993d-36d1-413a-b34b-58a66958d672",
"metadata": {},
"source": [
"### Show BigQuery results in a chart"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6ebb7386-6e27-465e-a6b6-f8f5dacf3392",
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "776d7da0-7199-49ca-bfd0-bf717bf119ea",
"metadata": {},
"outputs": [],
"source": [
"%%bigquery myDataframe\n",
"WITH MonthlyData AS\n",
"(\n",
"SELECT FORMAT_DATE(\"%B\", taxi_trips.Pickup_DateTime) AS MonthName,\n",
" FORMAT_DATE(\"%m\", taxi_trips.Pickup_DateTime) AS MonthNumber,\n",
" CASE WHEN taxi_trips.Payment_Type_Id = 1 THEN 'Credit'\n",
" WHEN taxi_trips.Payment_Type_Id = 2 THEN 'Cash'\n",
" ELSE 'Other'\n",
" END AS PaymentDescription,\n",
" SUM(taxi_trips.Total_Amount) AS Total_Amount\n",
" FROM `taxi_dataset.taxi_trips` AS taxi_trips\n",
" WHERE taxi_trips.Pickup_DateTime BETWEEN '2020-01-01' AND '2020-12-31' \n",
" AND Passenger_Count IS NOT NULL\n",
" AND Payment_Type_Id IN (1,2,3,4)\n",
" GROUP BY 1, 2, 3 \n",
")\n",
"SELECT MonthName,\n",
" CAST(Credit AS INTEGER) / 1000 AS Credit,\n",
" CAST(Cash AS INTEGER) / 1000 AS Cash,\n",
" CAST(Other AS INTEGER) / 1000 AS Other\n",
" FROM MonthlyData\n",
" PIVOT(SUM(Total_Amount) FOR PaymentDescription IN ('Credit', 'Cash', 'Other'))\n",
"ORDER BY MonthNumber;"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "606af83d-1d33-468e-9e3f-1a3b080970c1",
"metadata": {},
"outputs": [],
"source": [
"display(myDataframe)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2abf6a3a-be6c-4932-b9ed-6561c5a9bdf3",
"metadata": {},
"outputs": [],
"source": [
"ax = myDataframe.plot(kind=\"bar\", x=\"MonthName\", y=[\"Cash\",\"Credit\",\"Other\"], figsize=(15, 7))\n",
"ax.set_title(\"Monthly Amounts by Payment Type\")\n",
"ax.set_xlabel(\"Month\")\n",
"ax.set_ylabel(\"Amount (thousands)\")\n",
"ax.set_ylim([0, 100000])"
]
}
],
"metadata": {
"environment": {
"kernel": "python3",
"name": "managed-notebooks.m87",
"type": "gcloud",
"uri": "gcr.io/deeplearning-platform/release.spark-cpu:spark-kernel"
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}