backend-apis/deployment_scripts/media_event_generation.py (75 lines of code) (raw):

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Media event generation"""

import json
import random
from datetime import datetime, timedelta, timezone

# Shape of the synthetic history produced by generate_event_lines().
HISTORY_DAYS = 60  # number of past days to generate events for
EVENTS_PER_DOCUMENT_PER_DAY = 10  # event pairs per catalog entry per day
USERS_PER_DOCUMENT = 20  # size of each user bucket tied to one catalog entry
MINUTES_PER_DAY = 1440


def generate_view_homepage(user_pseudo_id: str, event_time: str) -> dict:
    """Build a "view-home-page" event payload.

    Example of the returned structure::

        {
            "eventType": "view-home-page",
            "userPseudoId": "user-pseudo-id",
            "eventTime": "2020-01-01T03:33:33.000001Z",
        }

    Args:
        user_pseudo_id: Identifier stored under "userPseudoId".
        event_time: Timestamp string stored verbatim under "eventTime".

    Returns:
        The event as a plain dict.
    """
    return {
        "eventType": "view-home-page",
        "userPseudoId": user_pseudo_id,
        "eventTime": event_time,
    }


def generate_view_item(
    user_pseudo_id: str, event_time: str, document_id: str
) -> dict:
    """Build a "view-item" event payload referencing a single document.

    Example of the returned structure::

        {
            "eventType": "view-item",
            "userPseudoId": "user-pseudo-id",
            "eventTime": "2020-01-01T03:33:33.000001Z",
            "documents": [{"id": "document-id"}],
        }

    Args:
        user_pseudo_id: Identifier stored under "userPseudoId".
        event_time: Timestamp string stored verbatim under "eventTime".
        document_id: Identifier of the viewed document.

    Returns:
        The event as a plain dict.
    """
    return {
        "eventType": "view-item",
        "userPseudoId": user_pseudo_id,
        "eventTime": event_time,
        "documents": [{"id": document_id}],
    }


def _format_event_time(moment: datetime) -> str:
    """Render *moment* in ISO 8601, spelling a "+00:00" offset as "Z"."""
    return moment.isoformat().replace("+00:00", "Z")


def generate_event_lines(today: datetime, catalog: list[dict]) -> list[str]:
    """Generate synthetic home-page/item view events as JSON strings.

    For each of the HISTORY_DAYS days before *today*, emits
    ``EVENTS_PER_DOCUMENT_PER_DAY * len(catalog)`` pairs of events spread
    evenly over the day: a "view-home-page" event followed 30 seconds later
    by a "view-item" event for the same randomly chosen user.

    Args:
        today: Reference instant; events start the day before it. The
            "Z" suffix is only produced when its ISO form ends in "+00:00"
            (i.e. a UTC-aware datetime).
        catalog: Product dicts. Each must provide "categories" (used as a
            dict key, so it must be hashable) and "id".

    Returns:
        A list of JSON-encoded events, two per generated pair, or an empty
        list when *catalog* is empty.
    """
    documents_len = len(catalog)
    if documents_len == 0:
        # Nothing to view; also avoids dividing by zero for the rate below.
        return []

    # Group products by category so a document can be drawn per category.
    categories_catalog: dict = {}
    for product in catalog:
        categories_catalog.setdefault(product["categories"], []).append(
            product
        )

    users_len = (documents_len - 1) * USERS_PER_DOCUMENT
    events_per_day = EVENTS_PER_DOCUMENT_PER_DAY * documents_len
    # Minutes between consecutive event pairs within one day.
    rate = MINUTES_PER_DAY / events_per_day

    json_lines = []
    for days_back in range(1, HISTORY_DAYS + 1):
        day = today - timedelta(days=days_back)
        for slot in range(events_per_day):
            home_page_at = day + timedelta(minutes=int(slot * rate))
            view_item_at = home_page_at + timedelta(seconds=30)

            # randint is inclusive on both ends: user is in [0, users_len].
            user = random.randint(0, users_len)
            # NOTE: the bucket index ranges over [-1, documents_len - 2];
            # index -1 wraps to the last catalog entry, so every entry is
            # reachable. Kept as-is to preserve the generated distribution.
            category = catalog[user // USERS_PER_DOCUMENT - 1]["categories"]
            document = random.choice(categories_catalog[category])

            user_pseudo_id = f"user-{user}"
            json_lines.append(
                json.dumps(
                    generate_view_homepage(
                        user_pseudo_id=user_pseudo_id,
                        event_time=_format_event_time(home_page_at),
                    )
                )
            )
            json_lines.append(
                json.dumps(
                    generate_view_item(
                        user_pseudo_id=user_pseudo_id,
                        event_time=_format_event_time(view_item_at),
                        document_id=str(document["id"]),
                    )
                )
            )
    return json_lines


def main():
    """Read the product catalog and write the generated events file.

    Reads ``./dataset/recommendation_products.jsonl`` (one JSON object per
    line) and writes the events, newline-separated, to
    ``full_media_events.jsonl`` in the current working directory.
    """
    # Midnight (UTC) of the current day.
    today_datetime = datetime.now(timezone.utc).replace(
        hour=0, minute=0, second=0, microsecond=0
    )

    # Read as UTF-8 to match the encoding used for the output file, and
    # ignore blank lines, which are not valid JSON documents.
    with open(
        "./dataset/recommendation_products.jsonl", encoding="utf-8"
    ) as f:
        products_json_lines = [json.loads(line) for line in f if line.strip()]

    events_json_lines = generate_event_lines(
        today_datetime, products_json_lines
    )

    with open("full_media_events.jsonl", "w", encoding="utf-8") as jsonl_file:
        jsonl_file.write("\n".join(events_json_lines))


if __name__ == "__main__":
    main()