# scripts/copy_select_table_column_bigquery.py
import pandas as pd
from google.cloud import bigquery
def copy_tables(project_id, source_dataset, destination_dataset, df):
    """Copy selected tables into a destination dataset, keeping only listed columns.

    For each unique table named in ``df`` the table is copied wholesale from
    ``source_dataset`` to ``destination_dataset``, every column NOT listed in
    ``df`` is dropped, and the descriptions from ``df`` are applied to the
    surviving columns.

    Args:
        project_id: GCP project containing both datasets.
        source_dataset: Dataset to copy tables from.
        destination_dataset: Dataset to copy tables into.
        df: DataFrame with columns 'TableName', 'ColumnName', 'Description'.
    """
    client = bigquery.Client()
    for table_name in df['TableName'].unique():
        print(f"Processing table: {table_name}")
        orig_table_id = f"{project_id}.{source_dataset}.{table_name}"
        dest_table_id = f"{project_id}.{destination_dataset}.{table_name}"

        # Copy the table (all columns initially); result() blocks until done.
        client.copy_table(orig_table_id, dest_table_id).result()

        # Columns to keep for this table.
        columns_to_preserve = df[df['TableName'] == table_name]['ColumnName'].tolist()

        # Drop every column that is not listed for preservation.
        dest_table = client.get_table(dest_table_id)
        for column in (c.name for c in dest_table.schema):
            print(f'Checking if column {column} should be preserved')
            if column not in columns_to_preserve:
                # Backticks guard table IDs containing special characters.
                query = f"ALTER TABLE `{dest_table_id}` DROP COLUMN IF EXISTS {column}"
                client.query(query).result()

        # Apply the description from df to each preserved column.
        for column_name in columns_to_preserve:
            print(f'Updating description for column {column_name}')
            description = df[
                (df['TableName'] == table_name) & (df['ColumnName'] == column_name)
            ]['Description'].iloc[0]
            # Escape single quotes so the description cannot break the SQL literal.
            escaped = str(description).replace("'", "\\'")
            query = f"""
            ALTER TABLE `{dest_table_id}`
            ALTER COLUMN {column_name} SET OPTIONS(description='{escaped}')
            """
            try:
                client.query(query).result()
            except Exception as e:
                print(f"An error occurred while adding description to the column {column_name} in table {dest_table_id}: {e}")
def add_table_description(project_id, dataset, df):
    """Set the table-level description for every table listed in ``df``.

    Args:
        project_id: GCP project containing the dataset.
        dataset: Dataset that holds the tables.
        df: DataFrame with columns 'TableName' and 'TableDescription'.
    """
    client = bigquery.Client()
    for _, row in df.iterrows():
        table_name = row['TableName']
        table_id = f"{project_id}.{dataset}.{table_name}"
        print(f'Updating description for Table - {table_name}')
        # Escape single quotes so the description cannot break the SQL literal;
        # backticks guard table IDs containing special characters.
        escaped = str(row['TableDescription']).replace("'", "\\'")
        query = f"""
        ALTER TABLE `{table_id}` SET OPTIONS(description='{escaped}')
        """
        try:
            client.query(query).result()
        except Exception as e:
            # Bug fix: the original handler referenced an undefined
            # `column_name`, raising NameError and masking the real error.
            print(f"An error occurred while adding description to table {table_id}: {e}")
def add_column_description(project_id, dataset, df):
    """Set the column-level description for every table/column pair in ``df``.

    Args:
        project_id: GCP project containing the dataset.
        dataset: Dataset that holds the tables.
        df: DataFrame with columns 'TableName', 'ColumnName', 'ColumnDescription'.
    """
    client = bigquery.Client()
    for _, row in df.iterrows():
        table_name = row['TableName']
        column_name = row['ColumnName']
        table_id = f"{project_id}.{dataset}.{table_name}"
        print(f'Updating description for Column - {table_name}.{column_name}')
        # Escape single quotes so the description cannot break the SQL literal;
        # backticks guard table IDs containing special characters.
        escaped = str(row['ColumnDescription']).replace("'", "\\'")
        query = f"""
        ALTER TABLE `{table_id}`
        ALTER COLUMN {column_name} SET OPTIONS(description='{escaped}')
        """
        try:
            client.query(query).result()
        except Exception as e:
            print(f"An error occurred while adding description to the column {column_name} in table {table_id}: {e}")
if __name__ == "__main__":
    import os

    # --- Read the table/column description sheet (CSV export) ---
    # os.path.join avoids the bad "/file.csv" path the old f-string produced
    # when dirname(__file__) is empty (script run from its own directory).
    current_dir = os.path.dirname(__file__)
    file_path = os.path.join(current_dir, "tables_columns_descriptions.csv")
    df = pd.read_csv(file_path)

    # --- Drop rows missing the key identifiers ---
    df.dropna(subset=['TableName', 'ColumnName'], inplace=True)

    # --- BigQuery setup: fill these in before running ---
    # (The unused module-level Client() the old script created here is gone;
    # each helper constructs its own client.)
    project_id = ""
    source_dataset = ""
    destination_dataset = ""

    # # Copy Tables
    # copy_tables(project_id, source_dataset,
    #             destination_dataset,
    #             df=df)

    # Add table descriptions: one row per table, description required.
    df_table_desc = (
        df[['TableName', 'TableDescription']]
        .dropna()
        .drop_duplicates(subset=['TableName', 'TableDescription'])
    )
    add_table_description(project_id, destination_dataset, df_table_desc)

    # Add column descriptions: one row per table/column pair.
    df_col_desc = (
        df[['TableName', 'ColumnName', 'ColumnDescription']]
        .dropna()
        .drop_duplicates(subset=['TableName', 'ColumnName'])
    )
    add_column_description(project_id, destination_dataset, df_col_desc)