# ccai-insights-sample-data/synthetic-convo-insights.py

# generates synthetic conversation data utilizing Gemini 1.5 Flash
# includes fields for metadata
# formatted for successful import into CCAI Insights
import argparse
import ast
import datetime
import json
import os
import random
import re
import time
from collections import deque

import vertexai
from fuzzywuzzy import fuzz
from google.api_core import retry
from google.api_core.exceptions import ServiceUnavailable
from google.cloud import aiplatform, storage
from google.oauth2 import service_account
from vertexai.generative_models import GenerativeModel, GenerationConfig, Part, FinishReason
from vertexai.preview.language_models import TextGenerationModel

parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument(
    "num_call_logs",
    nargs="?",
    default=10000,  # UPDATE WITH NUMBER OF OUTPUT FILES YOU NEED
    type=int,
    help="Number of call log files to generate",
)
args = parser.parse_args()
config = vars(args)
NUM_CALL_LOG_FILES = config["num_call_logs"]

# Authentication
SERVICE_ACCOUNT_KEY_FILE = 'XXX.json'  # REPLACE WITH YOUR ACTUAL SERVICE ACCOUNT KEY FILE PATH
credentials = service_account.Credentials.from_service_account_file(SERVICE_ACCOUNT_KEY_FILE)

# Project and Location Setup (Ensure they match your GCP project)
PROJECT_ID = '[Project ID]'  # REPLACE WITH YOUR PROJECT ID
LOCATION = 'us-central1'     # REPLACE AS NEEDED WITH YOUR CHOSEN REGION

# Vertex AI Initialization
vertexai.init(project=PROJECT_ID, location=LOCATION, credentials=credentials)

# Create the Storage Client
storage_client = storage.Client(project=PROJECT_ID, credentials=credentials)

# Use Gemini 1.5 Flash model, change config variables as needed
model = GenerativeModel(model_name="gemini-1.5-flash-001")
generation_config = GenerationConfig(
    temperature=0.5,
    top_p=1.0,
    top_k=32,
    candidate_count=1,
    max_output_tokens=8192,
)


def _parse_list_response(text):
    """Parse a Gemini response that is supposed to be a list of strings.

    Tries ast.literal_eval first (the prompts ask for a Python/JSON-style
    list literal); falls back to the naive strip/split heuristic when the
    model's output is not a valid literal.
    """
    cleaned = text.strip()
    try:
        parsed = ast.literal_eval(cleaned)
        if isinstance(parsed, list):
            return [str(item).strip() for item in parsed]
    except (ValueError, SyntaxError):
        pass
    # Fallback: drop the surrounding brackets/quotes and split on commas.
    return cleaned[1:-1].replace('"', '').split(",")


def generate_lists():
    """Ask Gemini for the reusable ingredients of a synthetic call.

    Returns six lists: services, problems, agent greetings, closing
    remarks, customer closing responses, and agent names.
    """
    # Prompts for Gemini to generate lists (CHANGE PROMPTS AS NEEDED BASED ON YOUR CUSTOMER)
    service_prompt = """I work for Ulta Beauty. Can you list out 5 common Ulta Beauty related issues customers may call customer support about? Feel free to use the https://www.ulta.com/ website for inspiration on common products or services or to learn more about the company. Make sure to be very specific about the exact product or service a customer is calling Ulta Beauty about. Your output should mirror this structure. Example output: ["Service1","Service2","Service3","Service4","Service5"].Do not generate anything else other than the list"""
    problem_prompt = """List 5 common problems Ulta Beauty customers might have with their experience at Ulta Beauty. Be as specific and detailed as possible with customer responses, product types, issues, etc. Make sure to be very specific about the exact product, service, or issue a customer is calling about. Here is a website that give examples on common complaints Ulta Beauty customers may have: https://www.reddit.com/r/Ulta/. Here is another website that give examples on common complaints Ulta Beauty customers may have: https://www.bbb.org/us/il/bolingbrook/profile/retail-stores/ulta-beauty-0654-27005363/customer-reviews. Your output should mirror this structure. Example output: ["Problem1","Problem2","Problem3","Problem4","Problem5"].Do not generate anything else other than the list"""
    greeting_prompt = """List 5 different realistic ways an Ulta Beauty customer service representative might greet a customer on the phone, ensuring each greeting uses a UNIQUE and DISTINCT agent name. Include both a greeting and an offer to assist, mirroring Ulta Beauty's professional style. Replace '[Agent Name]' with a diverse range of realistic first names (both female and male). The agent names should be diverse, representing various genders and cultural backgrounds. Example output: ["Hello, thank you for calling Ulta Beauty. My name is Jose. How may I help you today?", "Welcome to Ulta Beauty! This is Ahmad. How can I assist you?", "Good morning/afternoon/evening! Thank you for contacting Ulta Beauty. This is David. What can I do for you today?","Hello, this is Ava with Ulta Beauty. How can I assist help you beautify your day?","It's a pleasure to assist you at Ulta Beauty. My name is Priya. How may I help you today?"] Do not generate anything else other than the list."""
    agent_name_prompt = """Generate a list of 50 DISTINCT and culturally diverse names that could be used for Ulta Beauty customer service representatives. Include names that are uncommon or less frequent, representing a wide range of ethnicities and genders. Ensure no repetition or similarity. Example output: ["Jessica", "Priya", "Zaid", "Diego", "Luis"]."""
    closing_prompt = """List 5 different ways an Ulta Beauty customer service representative might end a call with a customer.Your output should mirror this structure. Example output: ["Closing greeting1","Closing greeting2","Closing greeting3","Closing greeting4","Closing greeting5"].Do not generate anything else other than the list"""
    closing_response_prompt = """List 5 different ways a customer might respond to an Ulta Beauty customer service representative ending a call based off the last closing prompt message from the agent. Example output: ["Thank you for your help. Have a great day!", "You're welcome. Goodbye.", "No problem. Thanks for your time."] Do not generate anything else other than the list."""

    # Get responses from Gemini and clean up the output for CCAI Insights formatting
    services_text = _parse_list_response(model.generate_content(service_prompt).text)
    problems_text = [p.replace("\n", "") for p in
                     _parse_list_response(model.generate_content(problem_prompt).text)]
    greetings_text = _parse_list_response(model.generate_content(greeting_prompt).text)
    # Filter out unprofessional greetings and ensure Ulta Beauty is mentioned
    # in the initial agent greeting
    greetings_text = [
        g for g in greetings_text
        if "ulta beauty" in g.lower()
        and any(keyword in g.lower() for keyword in ["help", "assist", "welcome"])
    ]
    agent_names_text = _parse_list_response(model.generate_content(agent_name_prompt).text)
    closing_remarks_text = _parse_list_response(model.generate_content(closing_prompt).text)
    closing_responses_text = [c.replace('-', '') for c in
                              _parse_list_response(model.generate_content(closing_response_prompt).text)]

    return (services_text, problems_text, greetings_text,
            closing_remarks_text, closing_responses_text, agent_names_text)


def _split_speaker_line(line, label_length):
    """Strip a 'Customer:' / 'Agent:' label from a transcript line.

    Splits on the first colon when present (robust to extra spaces);
    otherwise falls back to slicing off the label. The original code
    sliced at the label length and left the colon in the text.
    """
    if ":" in line:
        return line.split(":", 1)[1].strip()
    return line[label_length:].strip()


def generate_log(services, problems, greetings, closing_remarks, closing_responses,
                 agent_names_buffer, max_retries=3, max_regeneration_retries=3,
                 _retry_depth=0):
    """Generate one synthetic call log and return it as a JSON string.

    Returns None when all retries are exhausted. `_retry_depth` is internal:
    it bounds the recursive "bad transcript, try again" calls so a stubborn
    model cannot recurse without limit.
    """
    if _retry_depth > 5:
        print("Max transcript-quality retries reached. Skipping this call log.")
        return None

    service = random.choice(services)
    problem_description = random.choice(problems)

    # Generate timestamps
    # Specify the desired start and end years, along with month/day ranges
    start_year = 2024   # Change to desired start year
    start_month = 1     # Change to desired start month
    start_day = 1       # Change to desired start day
    end_year = 2024     # Change to desired end year
    end_month = 9       # Change to desired end month
    end_day = 3         # Change to desired end day

    # Generate random timestamps within the specified range
    def generate_random_timestamp():
        # Random instant inside the window, as integer microseconds since epoch.
        start_date = datetime.datetime(year=start_year, month=start_month, day=start_day)
        end_date = datetime.datetime(year=end_year, month=end_month, day=end_day,
                                     hour=23, minute=59, second=59)
        random_datetime = start_date + random.random() * (end_date - start_date)
        return int(random_datetime.timestamp() * 1000000)

    # Generate timestamp and response delay
    timestamp = generate_random_timestamp()
    response_delay = random.randint(5000000, 10000000)  # 5 to 10 seconds for all roles

    # Top up the agent-name pool when it runs low.
    if len(agent_names_buffer) < 2:
        for _ in range(max_regeneration_retries):
            print("Regenerating agent names and greetings for more variety...")
            # Regenerate greetings as well; closing responses stay as passed in.
            services, problems, greetings, closing_remarks, _, new_agent_names = generate_lists()
            agent_names_buffer.extend(new_agent_names)
            if len(agent_names_buffer) >= 2:
                break
        else:
            print("Max regeneration retries reached. Skipping this call log.")
            return None

    customer_behavior = random.choice(
        ["polite and patient", "frustrated and impatient",
         "angry and demanding", "confused and unsure"])

    # Generate a natural problem statement
    problem_statement_prompt = f""" Rewrite this issue into a natural statement a customer would say to describe their problem with their {service}: "{problem_description}" at the BEGINNING of their call. Make sure the customer provides context to their issue. """

    for retry_count in range(max_retries):
        try:
            problem_statement_response = model.generate_content(problem_statement_prompt)
            # Check for safety filter blocks in any candidate.
            # NOTE: finish_reason is a FinishReason enum, not a string — the
            # original string comparison could never match.
            for candidate in problem_statement_response.candidates:
                if candidate.finish_reason == FinishReason.SAFETY:
                    raise Exception("Safety filter triggered. Retrying...")
            customer_statement = problem_statement_response.text.strip()

            # --- Generate Metadata (but don't extract agent name yet) ---
            call_id = random.randint(1000, 999999)
            #language_code = "en-US"
            #call_type = random.choice(["inbound", "outbound"])
            channel = random.choice(["phone", "chat"])  # Randomly chooses phone or chat
            #agent_group = random.choice(["Tier 1 Support", "Billing"])
            agent_experience = random.choice(["junior", "senior", "manager", "supervisor", "trainee"])
            agent_location = random.choice(["US", "India", "EMEA"])
            customer_id = random.randint(100, 9999)
            # Example inference: only the "polite and patient" persona maps to positive.
            customer_sentiment = "positive" if customer_behavior == "polite and patient" else "negative"
            customer_region = random.choice(["East Coast", "West Coast", "Central"])

            # Pick the greeting/closing once so the skeleton and the
            # "Use ..." instructions below cannot contradict each other.
            chosen_greeting = random.choice(greetings)
            chosen_closing = random.choice(closing_remarks)

            prompt_template = f"""
Create a customer support transcript where an Ulta Beauty agent helps a customer with their {service}. The conversation starts with the agent's greeting. Adhere strictly to this format:

Agent: {chosen_greeting}
Customer: {customer_statement}
Agent: [Agent's response acknowledging the problem and starting troubleshooting]
Customer: [Customer's response to the troubleshooting steps]
Agent: [Further troubleshooting or resolution steps]
... (continue the back-and-forth as needed)
Agent: [Resolution of the issue or escalation]
Agent: {chosen_closing}
Customer: [Customer's natural response acknowledging resolution and ending the call]

Additional instructions:
* Use "{chosen_greeting}" for the agent's greeting.
* Use "{chosen_closing}" for the agent's closing remark.
* The conversation MUST include troubleshooting steps and a resolution.
* Focus on a single core issue the customer is experiencing
* The customer is "{customer_behavior}"
"""
            print(prompt_template)
            response = model.generate_content(prompt_template, generation_config=generation_config)
            transcript = response.text

            # Check for safety filter blocks in any candidate (not just the first one)
            for candidate in response.candidates:
                if candidate.finish_reason == FinishReason.SAFETY:
                    raise Exception("Safety filter triggered. Retrying...")

            # Enhanced Transcript Parsing with Logic to Prevent Unnatural Endings
            entries = []
            customer_said_no = False
            short_customer_response = False
            agent_asked_anything_else = False
            last_agent_line = ""

            for line in transcript.splitlines():
                line = line.strip()
                lowered = line.lower()
                if lowered.startswith("customer"):
                    text = _split_speaker_line(line, len("customer"))
                    entries.append({"role": "CUSTOMER", "text": text, "user_id": 1})
                    # These checks must run on the extracted text, not the raw
                    # line (which still carries the "Customer:" label).
                    if text.lower() in ["no", "no thanks", "that's all", "that's it",
                                        "i'm good", "nothing else", "okay"]:
                        customer_said_no = True
                    if len(text) <= 3:
                        short_customer_response = True
                elif lowered.startswith("agent"):
                    last_agent_line = _split_speaker_line(line, len("agent"))
                    if "anything else" in last_agent_line.lower():
                        agent_asked_anything_else = True
                    # Skip the "anything else" follow-up after the customer said no.
                    if not (customer_said_no and "anything else" in last_agent_line.lower()):
                        entries.append({"role": "AGENT", "text": last_agent_line, "user_id": 2})

            # Check if the first agent entry has a proper greeting (with fuzzy matching)
            if entries and entries[0]["role"] == "AGENT":
                agent_greeting = entries[0]["text"].lower()
                # Define a list of acceptable greeting keywords/phrases
                greeting_keywords = ["help", "assist", "welcome", "hello", "hi",
                                     "good morning", "good afternoon", "good evening"]
                # Any keyword with a high partial-ratio similarity counts as a greeting.
                has_valid_greeting = any(
                    fuzz.partial_ratio(keyword, agent_greeting) > 80
                    for keyword in greeting_keywords
                )
                if not has_valid_greeting:
                    print("Agent's greeting is missing or incomplete (fuzzy matching). Retrying...")
                    return generate_log(services, problems, greetings, closing_remarks,
                                        closing_responses, agent_names_buffer,
                                        _retry_depth=_retry_depth + 1)

            # Additional check for the first few agent turns
            for entry in entries[:3]:  # Check the first 3 turns
                if entry["role"] == "AGENT":
                    if any(product.lower() in entry["text"].lower() for product in services):
                        print("Agent assumed the product too early. Retrying...")
                        return generate_log(services, problems, greetings, closing_remarks,
                                            closing_responses, agent_names_buffer,
                                            _retry_depth=_retry_depth + 1)

            # Retry Conditions (consolidated for readability)
            if any((
                agent_asked_anything_else and last_agent_line == entries[-1]['text'],
                customer_said_no and "anything else" in last_agent_line.lower(),
                short_customer_response,
                not entries,  # Check for blank output
            )):
                retry_reason = (
                    "Customer didn't answer 'anything else?'"
                    if agent_asked_anything_else and last_agent_line == entries[-1]["text"]
                    else "Agent asked again after customer said no"
                    if customer_said_no and "anything else" in last_agent_line.lower()
                    else "Customer response too short"
                    if short_customer_response
                    else "Blank output"
                )
                print(f"{retry_reason}. Retrying...")
                return generate_log(services, problems, greetings, closing_remarks,
                                    closing_responses, agent_names_buffer,
                                    _retry_depth=_retry_depth + 1)

            # Add timestamps
            for i, entry in enumerate(entries):
                entry["start_timestamp_usec"] = timestamp + response_delay * i

            # *** Extract agent name AFTER populating entries ***
            agent_name = None
            for entry in entries:
                if entry["role"] == "AGENT":
                    match = re.search(
                        r"(?:my name is|this is|i'm)\s+([\w\s]+)",
                        entry["text"],
                        re.IGNORECASE,
                    )
                    if match:
                        agent_name = match.group(1).strip()
                        break  # Stop after finding the agent name

            if agent_name is None:
                # Handle the case where no agent name is found in the transcript
                print("No agent greeting found in the transcript. Retrying...")
                return generate_log(services, problems, greetings, closing_remarks,
                                    closing_responses, agent_names_buffer,
                                    _retry_depth=_retry_depth + 1)

            # Generate a unique Agent ID (you can customize this logic)
            agent_id = random.randint(1000, 9999)

            # Add agent metadata to each agent entry
            for entry in entries:
                if entry["role"] == "AGENT":
                    entry["agent_name"] = agent_name  # Use the extracted agent_name
                    entry["agent_id"] = agent_id

            # --- Create metadata dictionary ---
            metadata = {
                "call_id": call_id,
                #"language_code": language_code,
                #"call_type": call_type,
                "channel": channel,
                #"agent_group": agent_group,
                "agent_experience": agent_experience,
                "agent_location": agent_location,
                "customer_id": customer_id,
                "customer_sentiment": customer_sentiment,
                "customer_region": customer_region,
                "agent_id": agent_id,
                # ... add more metadata as needed
            }

            call_log = {
                "entries": entries,
                "metadata": metadata,  # Include metadata in the call_log
            }
            return json.dumps(call_log, indent=4)  # Return the generated JSON if successful

        except Exception as e:
            print(f"Error generating log (attempt {retry_count + 1}/{max_retries}): {e}")
            if retry_count < max_retries - 1:
                time.sleep(2 ** retry_count)  # Exponential backoff between attempts
            else:
                print("Max retries reached. Skipping this call log.")
                return None


# Main execution loop
services, problems, greetings, closing_remarks, closing, agent_names = generate_lists()
random.shuffle(agent_names)  # Shuffle the entire list once at the beginning
# Create a circular buffer (deque) from the shuffled agent names
agent_names_buffer = deque(agent_names)

for i in range(NUM_CALL_LOG_FILES):
    json_object = generate_log(services, problems, greetings, closing_remarks,
                               closing, agent_names_buffer)
    if json_object is not None:  # Only save if generation was successful
        # Upload to GCS with overwrite prevention
        bucket = storage_client.bucket("ulta-enriched4")  # UPDATE WITH YOUR GCS BUCKET
        base_filename = f"gem_chat_{i}_np.json"  # CHANGE FILE NAME AS NEEDED
        filename = base_filename
        counter = 1
        blob = bucket.blob(filename)
        while blob.exists():  # Check for existing blob
            filename = f"{os.path.splitext(base_filename)[0]}_{counter}.json"
            blob = bucket.blob(filename)  # Update the blob reference
            counter += 1

        # Retry mechanism for blob upload
        @retry.Retry(predicate=retry.if_exception_type(ServiceUnavailable), deadline=60)
        def upload_blob():
            blob.upload_from_string(json_object)

        try:
            upload_blob()
            print(f"Uploaded {filename} to GCS bucket ulta-enriched4")  # UPDATE WITH YOUR GCS BUCKET
        except Exception as e:  # Catch any remaining errors
            print(f"Upload failed after retries: {e}")
    else:
        print(f"Skipping call log {i} due to an error.")