scripts/copy_select_table_column_bigquery.py (70 lines of code) (raw):

import pandas as pd from google.cloud import bigquery def copy_tables(project_id, source_dataset, destination_dataset, df): client = bigquery.Client() unique_table_names = df['TableName'].unique() for table_name in unique_table_names: print(f"Processing table: {table_name}") dest_table_id = f"{project_id}.{destination_dataset}.{table_name}" orig_table_id = f"{project_id}.{source_dataset}.{table_name}" # Copy the table (all columns initially) job = client.copy_table(orig_table_id, dest_table_id) job.result() # Get columns to preserve columns_to_preserve = df[df['TableName'] == table_name]['ColumnName'].tolist() # Drop unwanted columns dest_table = client.get_table(dest_table_id) all_columns = [c.name for c in dest_table.schema] for column in all_columns: print(f'Cheking if column {column} should be preserved') if column not in columns_to_preserve: query = f"ALTER TABLE {dest_table_id} DROP COLUMN IF EXISTS {column}" client.query(query).result() # Update descriptions for column_name in columns_to_preserve: print(f'Updating description for column {column_name}') description = df[(df['TableName'] == table_name) & (df['ColumnName'] == column_name)]['Description'].iloc[0] query = f""" ALTER TABLE {dest_table_id} ALTER COLUMN {column_name} SET OPTIONS(description='{description}') """ try: client.query(query).result() except Exception as e: print(f"An error occurred while additon desciption to the column {column_name} in table {dest_table_id}: {e}") def add_table_description(project_id, dataset, df): client = bigquery.Client() for _, row in df.iterrows(): table_name = row['TableName'] table_description = row['TableDescription'] table_id = f"{project_id}.{dataset}.{table_name}" print(f'Updating description for Table - {table_name}') query = f""" ALTER TABLE {table_id} SET OPTIONS(description='{table_description}') """ try: client.query(query).result() except Exception as e: print(f"An error occurred while additon desciption to the column {column_name} in table {table_id}: {e}") def add_column_description(project_id, dataset, df): client = bigquery.Client() for _, row in df.iterrows(): table_name = row['TableName'] column_name = row['ColumnName'] column_description = row['ColumnDescription'] table_id = f"{project_id}.{dataset}.{table_name}" print(f'Updating description for Column - {table_name}.{column_name}') query = f""" ALTER TABLE {table_id} ALTER COLUMN {column_name} SET OPTIONS(description='{column_description}') """ try: client.query(query).result() except Exception as e: print(f"An error occurred while additon desciption to the column {column_name} in table {table_id}: {e}") if __name__ == "__main__": import os current_dir = os.path.dirname(__file__) # --- Read 'TablesAndColumns' sheet from Excel --- file_path = f"{current_dir}/tables_columns_descriptions.csv" df = pd.read_csv(file_path) # --- Drop rows with null values --- df.dropna(subset=['TableName', 'ColumnName'], inplace=True) # --- BigQuery setup --- client = bigquery.Client() project_id = "" source_dataset = "" destination_dataset = "" # # Copy Tables # copy_tables(project_id, source_dataset, # destination_dataset, # df=df) # Add table descriptions df_table_desc = df[['TableName', 'TableDescription']] # Select required columns df_table_desc = df_table_desc.dropna() # Handle missing ColumnDescriptions (optional) df_table_desc = df_table_desc.drop_duplicates(subset=['TableName', 'TableDescription']) # Drop duplicate table-column pairs add_table_description(project_id, destination_dataset, df_table_desc) # Add column descriptions df_col_desc = df[['TableName', 'ColumnName', 'ColumnDescription']] # Select required columns df_col_desc = df_col_desc.dropna() # Handle missing ColumnDescriptions (optional) df_col_desc = df_col_desc.drop_duplicates(subset=['TableName', 'ColumnName']) # Drop duplicate table-column pairs add_column_description(project_id, destination_dataset, df_col_desc)