projects/conversational-commerce-agent/data-ingestion/food_to_retail_search.py (212 lines of code) (raw):

# Copyright 2024 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Converting Flipkart dataset to Google Cloud Search for Retail data format. """ import argparse import csv import json import logging import os import uuid logging.basicConfig(level=logging.INFO) def update_attributes(source_obj) -> dict: """ Update site level attribute controls Args: source_obj: source product attribute controls Returns: dict: updated attribute controls """ target_obj = { "attributes" : {} } item_name = source_obj["menuItemName"].lower() item_desc = source_obj["menuItemDescription"].lower() if ("vegan" in item_name or "vegan" in item_desc): target_obj["attributes"]["Vegan"] = { "text": ["yes"], "searchable": True, "indexable": True } else: target_obj["attributes"]["Vegan"] = { "text": ["no"], "searchable": True, "indexable": True } grill_items = ["chicken", "beef", "pork", "seafood", "fish", "shrimp", "duck", "steak"] # any(item in string for item in array) if (any(item in item_name for item in grill_items) or any(item in item_desc for item in grill_items)): target_obj["attributes"]["Tags"] = { "text": ["grill"], "searchable": True, "indexable": True } if ("gluten free" in item_name.replace("-", " ") or "gluten free" in item_desc.replace("-", " ")): if target_obj["attributes"].get("Tags", None) is not None: target_obj["attributes"]["Tags"]["text"].append("gluten free") else: target_obj["attributes"]["Tags"] = { "text": ["gluten free"], "searchable": True, "indexable": True } return target_obj["attributes"] def convert_csv_to_jsonl(input_file:str, output_file:str) -> str: """ This function does a one-to-one mapping from CSV to Json Args: input_file: Path to the input CSV file. output_file: Path to the output JSONL file. Returns: Path to the output JSONL file. """ with open(input_file, "r", encoding="utf-8") as csvfile, \ open(output_file, "w", encoding="utf-8") as jsonlfile: input_data = csvfile.read().replace("\\\"", "").replace("\\", "") reader = csv.reader(input_data.split(os.linesep)) header = next(reader) for row in reader: # Escape special characters in each field escaped_row = [ value.replace("\\\"", "").replace("\\", "") for value in row ] row_dict = dict(zip(header, escaped_row)) jsonlfile.write(json.dumps(row_dict) + os.linesep) return output_file def convert_food_to_retail_search_product( input_file:str, output_file:str, project_number:str, branch:str="0") -> str: """ Transforms a Flipkart JSONL file to Google Cloud Retail Search Product Schema. Args: input_file: Path to the input Flipkart JSONL file. output_file: Path to the output JSONL file. project_number: Google Cloud Project number. branch: Retail Search Branch Id. defaults to 0 Returns: Path to the output JSONL file. """ processed_products = "" with open(input_file, "r", encoding="utf-8") as infile: with open(output_file, "w", encoding="utf-8") as outfile: for line in infile: try: source_obj = json.loads(line) target_obj = {} # Required fields target_obj["title"] = source_obj.get( "menuItemName", "Unknown Product" ) if "menuItemImageUrl" not in source_obj: logging.warning( ( "[Warning]Product doed not" "have a product url:%s" ), target_obj["title"] ) continue if target_obj["title"] in processed_products: continue else: processed_products += f"""|{target_obj["title"]}""" target_obj["categories"] = [source_obj["menuItemCategory"]] target_obj["id"] = f"{uuid.uuid4()}" target_obj["name"] = ( f"projects/{project_number}/locations/global/catalogs/" f"""default_catalog/branches/{branch}""" f"""/products/{target_obj["id"]}""" ) target_obj["primaryProductId"] = target_obj["id"] target_obj["type"] = "PRIMARY" # Assuming all are primary target_obj["description"] = source_obj.get( "menuItemDescription", target_obj["title"] ) if target_obj["description"] == target_obj["title"]: print(f"""{target_obj["title"]} has no description.""") target_desc = target_obj["description"] if len(target_desc) >= 5000: # Max description target_obj["description"] = target_desc[:5000] target_obj["languageCode"] = "en-us" # Default language source_images = source_obj["menuItemImageUrl"] if source_images: target_obj["images"] = [ {"uri": source_images} ] else: logging.error( "[Error]product does not have images:%s", target_obj["title"],) continue target_obj["uri"] = source_obj.get("menuItemImageUrl", None) # Price Information source_obj["discounted_price"]= 0 source_obj["retail_price"]= 0 try: item_price = float( source_obj.get( "menuItemCurrentPrice", "0").replace("$", "") ) target_obj["priceInfo"] = { "currencyCode": "INR", "price": item_price, "originalPrice": item_price, "priceRange": {}, } except ValueError as e: print(source_obj.get( "menuItemCurrentPrice", "0").replace("$", "") ) logging.error(e) logging.error("Unable to parse price for %s", target_obj["title"]) continue target_obj["attributes"] = update_attributes( source_obj=source_obj ) # Availability target_obj["availability"] = "IN_STOCK" target_obj["availableQuantity"] = 0 target_obj["fulfillmentInfo"] = [ { "type": "custom-type-1", "placeIds": ["mobile", "www"] } ] target_obj["retrievableFields"] = ( "name,title,brands,uri,categories," "priceInfo,description" ) outfile.write(json.dumps(target_obj) + "\n") except json.JSONDecodeError as e: logging.error(""" ====== * Error decoding JSON object in line: Exception:%s Line:%s} ====== """, e, line.strip()) logging.info( "Successfully transformed %s to %s", input_file, output_file ) return output_file def prepare_arguments() -> dict: """ Configure and parse commandline arguments. Returns: A Dict holds commandline arguments. """ parser = argparse.ArgumentParser( description=("Converting Flipkart dataset " "to Search for Retail data format.") ) parser.add_argument("-i", "--input", help="Flipkart CSV file path.", required=True) parser.add_argument("-o", "--output", help="Search for Retail Jsonl file path.", required=True) parser.add_argument("-p", "--project-number", help="Search for Retail Jsonl file path.", required=True) parser.add_argument("-b", "--branch", help="Search for Retail Jsonl file path.", required=True) args = vars(parser.parse_args()) return { "input_file": args["input"], "output_file": args["output"], "project_number": args["project_number"], "branch": args["branch"] } if __name__ == "__main__": params = prepare_arguments() FLIPKART_CSV_FILE = params["input_file"] FLIPKART_JSON_FILE = params["input_file"] + ".jsonl" RETAIL_SEARCH_JSON_FILE = params["output_file"] PROJECT_NUMBER = params["project_number"] BRANCH = params["branch"] convert_csv_to_jsonl( input_file=FLIPKART_CSV_FILE, output_file=FLIPKART_JSON_FILE ) convert_food_to_retail_search_product( input_file=FLIPKART_JSON_FILE, output_file=RETAIL_SEARCH_JSON_FILE, project_number=PROJECT_NUMBER, branch=BRANCH)