# tooling/enrichment/main.py
#
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
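
"""Batch product-enrichment pipeline.

Reads product rows from BigQuery in batches, generates an image for each
product with Imagen, asks Gemini for an image description, uploads the
image to Cloud Storage, tracks per-product progress in Firestore
(including retries of previously failed products), and finally exports
the enriched rows to a CSV that is also uploaded to the bucket.
"""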

import os
import time

import pandas as pd
from dotenv import load_dotenv
from google.cloud import bigquery, storage

from firestore_client import FirestoreClient
from gemini_client import get_image_description
from imagen_client import generate_image

# Load environment variables from a local .env file
load_dotenv()

# Configuration read from the environment
PROJECT_ID = os.getenv('project_id')
DATASET = os.getenv('bq_dataset')
TABLE = os.getenv('bq_table')
FIRESTORE_COLLECTION = os.getenv('firestore_collection')
IMAGE_BUCKET = os.getenv('psearch_img_bucket')

BATCH_SIZE = 50  # Rows fetched from BigQuery per batch
MAX_PRODUCTS = 30000  # Upper bound on products processed in a single run

# Firestore client used to track per-product processing state
firestore_client = FirestoreClient(PROJECT_ID, FIRESTORE_COLLECTION)
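
# Example .env for local runs (keys match the os.getenv calls above;
# the values shown here are purely illustrative):
#   project_id=my-gcp-project
#   bq_dataset=my_dataset
#   bq_table=products
#   firestore_collection=enrichment_status
#   psearch_img_bucket=my-image-bucket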


def fetch_bigquery_data(last_processed_id):
    """Fetch the next BATCH_SIZE unprocessed rows from BigQuery, ordered by id."""
    client = bigquery.Client(project=PROJECT_ID)
    # Pass the id cutoff as a query parameter rather than interpolating it
    # into the SQL (assumes id is an integer column, as the original
    # unquoted comparison implied)
    query = f"""
        SELECT *
        FROM `{PROJECT_ID}.{DATASET}.{TABLE}`
        WHERE id > @last_id
        ORDER BY id
        LIMIT {BATCH_SIZE}
    """
    job_config = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ScalarQueryParameter("last_id", "INT64", int(last_processed_id))
        ]
    )
    try:
        df = client.query(query, job_config=job_config).to_dataframe()
        print(f"\nProcessing {len(df)} rows starting from ID: {last_processed_id}")
        return df
    except Exception as e:
        print(f"Error occurred: {str(e)}")
        return None


def upload_to_gcs(image, filename):
    """Upload a generated image to the GCS image bucket and return its gs:// URI."""
    storage_client = storage.Client()
    bucket = storage_client.bucket(IMAGE_BUCKET)
    blob = bucket.blob(filename)
    # The Imagen SDK exposes the raw bytes via the private _image_bytes attribute
    blob.upload_from_string(image._image_bytes, content_type="image/png")
    return f"gs://{IMAGE_BUCKET}/{filename}"


def process_single_product(row):
    """Process one product; return (row_data, None) on success or (None, error_message) on failure."""
product_id = row['id']
row_data = row.to_dict()
try:
# Start processing and mark in Firestore
firestore_client.start_product_processing(product_id, row_data)
# Generate safe filename using product ID
safe_filename = f"product_{product_id}.png"
# Generate image using Imagen client
image = generate_image(row, PROJECT_ID)
if not image:
raise Exception("Failed to generate image")
# Get image description using Gemini client - Updated to pass product_data
description = get_image_description(image._image_bytes, PROJECT_ID, row_data)
if not description:
raise Exception("Failed to generate description")
# Upload to GCS and get URI
image_uri = upload_to_gcs(image, safe_filename)
# Mark as completed in Firestore
firestore_client.complete_product_processing(product_id, image_uri, description)
# Add to processed rows for CSV
row_data['image_uri'] = image_uri
row_data['description'] = description
print(f"Generated and uploaded image for product ID {product_id}: {image_uri}")
print(f"Generated description: {description[:100]}...")
return row_data, None
except Exception as e:
error_message = f"Error processing product: {str(e)}"
print(error_message)
firestore_client.mark_product_failed(product_id, error_message)
return None, error_message


def process_products():
    """Retry previously failed products, process new BigQuery rows, and export the results."""
    processed_rows = []
    total_processed = 0
    last_id = firestore_client.get_last_processed_id()
# First, try to process any previously failed items
print("Checking for failed products to retry...")
failed_products = firestore_client.get_failed_products()
for failed_product in failed_products:
product_data = failed_product['product_data']
row = pd.Series(product_data)
        result, _ = process_single_product(row)
if result:
processed_rows.append(result)
total_processed += 1
time.sleep(1) # Rate limiting
    # Now process new items, stopping once MAX_PRODUCTS rows have been
    # enriched (the bound is checked per batch, so the final batch may
    # push slightly past it)
    while total_processed < MAX_PRODUCTS:
df = fetch_bigquery_data(last_id)
if df is None or df.empty:
print("No more rows to process")
break
        for _, row in df.iterrows():
product_id = row['id']
# Skip if product already processed
if firestore_client.is_product_processed(product_id):
print(f"Product {product_id} already processed, skipping...")
last_id = product_id
continue
            result, _ = process_single_product(row)
if result:
processed_rows.append(result)
total_processed += 1
# Update last processed ID
last_id = product_id
firestore_client.update_last_processed_id(last_id)
# Add some delay to avoid rate limiting
time.sleep(1)
print(f"Processed {total_processed} products so far")
# Create final DataFrame and export to CSV
if processed_rows:
final_df = pd.DataFrame(processed_rows)
output_filename = f"processed_products_{int(time.time())}.csv"
final_df.to_csv(output_filename, index=False)
print(f"Exported results to {output_filename}")
        # Upload CSV to GCS
        storage_client = storage.Client()
        bucket = storage_client.bucket(IMAGE_BUCKET)
        blob = bucket.blob(f"exports/{output_filename}")
        blob.upload_from_filename(output_filename)
        print(f"Uploaded CSV to gs://{IMAGE_BUCKET}/exports/{output_filename}")


if __name__ == "__main__":
    process_products()
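
# Usage (assumes a .env like the example above and application-default
# credentials with access to BigQuery, Firestore, Vertex AI, and GCS):
#   python main.py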