# generates synthetic conversation data utilizing Gemini 1.5 Flash
# includes fields for metadata
# formatted for successful import into CCAI Insights

import argparse
import datetime
import json
import random
import vertexai
import ast
import os
import time
import re
from google.oauth2 import service_account
from google.cloud import storage
from vertexai.preview.language_models import TextGenerationModel
from google.cloud import aiplatform
from google.api_core import retry
from google.api_core.exceptions import ServiceUnavailable
from vertexai.generative_models import GenerativeModel, GenerationConfig, Part, FinishReason
from collections import deque
from fuzzywuzzy import fuzz

# --- Command line -----------------------------------------------------------
# A single optional positional argument sets how many call logs are produced.
parser = argparse.ArgumentParser(
    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
parser.add_argument(
    "num_call_logs",
    type=int,
    nargs="?",
    default=10000,  # UPDATE WITH NUMBER OF OUTPUT FILES YOU NEED
    help="Number of call log files to generate",
)
args = parser.parse_args()
config = vars(args)
NUM_CALL_LOG_FILES = config["num_call_logs"]

# --- Settings to edit before running ----------------------------------------
SERVICE_ACCOUNT_KEY_FILE = 'XXX.json'  # REPLACE WITH YOUR ACTUAL SERVICE ACCOUNT KEY FILE PATH
PROJECT_ID = '[Project ID]'  # REPLACE WITH YOUR PROJECT ID (must match your GCP project)
LOCATION = 'us-central1'  # REPLACE AS NEEDED WITH YOUR CHOSEN REGION

# --- Authentication and clients ---------------------------------------------
# The same service-account credentials are shared by Vertex AI and Cloud Storage.
credentials = service_account.Credentials.from_service_account_file(
    SERVICE_ACCOUNT_KEY_FILE
)
vertexai.init(
    project=PROJECT_ID,
    location=LOCATION,
    credentials=credentials,
)
storage_client = storage.Client(
    project=PROJECT_ID,
    credentials=credentials,
)

# --- Model ------------------------------------------------------------------
# Gemini 1.5 Flash; adjust the sampling settings below as needed.
model = GenerativeModel(model_name="gemini-1.5-flash-001")
generation_config = GenerationConfig(
    candidate_count=1,
    max_output_tokens=8192,
    temperature=0.5,
    top_k=32,
    top_p=1.0,
)

def _parse_list_response(raw_text):
    """Turn a model reply that should contain a list of strings into a Python list.

    The outermost ``[...]`` span of the reply is parsed as a real list literal
    (JSON first, then a Python literal), so commas *inside* an item -- e.g.
    "Hello, thank you for calling ..." -- no longer split that item apart, and
    any text around the list (Markdown fences, a trailing period) is ignored.
    If the reply cannot be parsed that way, fall back to the original
    quote-stripping / comma-splitting behavior.

    Args:
        raw_text: The text of the model response.

    Returns:
        A list of non-empty strings with surrounding whitespace removed.
    """
    text = raw_text.strip()
    start = text.find("[")
    end = text.rfind("]")
    if start != -1 and end > start:
        literal = text[start:end + 1]
        for parse in (json.loads, ast.literal_eval):
            try:
                parsed = parse(literal)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                continue
            if isinstance(parsed, (list, tuple)):
                items = [str(item).strip() for item in parsed]
                return [item for item in items if item]

    # Legacy fallback: best effort for replies that are not a valid list literal.
    legacy_items = text[1:-1].replace('"', '').replace("\n", "").split(",")
    return [item.strip() for item in legacy_items if item.strip()]


def generate_lists():
    """Ask the model for the seed lists used to build synthetic call transcripts.

    Six prompts are sent to the module-level ``model`` (in this order: services,
    problems, greetings, agent names, closing remarks, closing responses) and
    each reply is parsed into a list of strings. Greetings that do not mention
    "Ulta Beauty" together with one of "help", "assist" or "welcome" are
    discarded.

    Returns:
        A 6-tuple of lists of strings:
        ``(services, problems, greetings, closing_remarks, closing_responses,
        agent_names)``.
    """
    # Prompts for Gemini to generate lists (CHANGE PROMPTS AS NEEDED BASED ON YOUR CUSTOMER)
    service_prompt = """I work for Ulta Beauty. Can you list out 5 common Ulta Beauty related issues customers may call customer support about? Feel free to use the https://www.ulta.com/ website for inspiration on common products or services or to learn more about the company. Make sure to be very specific about the exact product or service a customer is calling Ulta Beauty about. Your output should mirror this structure. 
    Example output: ["Service1","Service2","Service3","Service4","Service5"].Do not generate anything else other than the list"""
    problem_prompt = """List 5 common problems Ulta Beauty customers might have with their experience at Ulta Beauty. Be as specific and detailed as possible with customer responses, product types, issues, etc. Make sure to be very specific about the exact product, service, or issue a customer is calling about. Here is a website that give examples on common complaints Ulta Beauty customers may have: https://www.reddit.com/r/Ulta/. Here is another website that give examples on common complaints Ulta Beauty customers may have: https://www.bbb.org/us/il/bolingbrook/profile/retail-stores/ulta-beauty-0654-27005363/customer-reviews.  Your output should mirror this structure. 
    Example output: ["Problem1","Problem2","Problem3","Problem4","Problem5"].Do not generate anything else other than the list"""
    greeting_prompt = """List 5 different realistic ways an Ulta Beauty customer service representative might greet a customer on the phone, ensuring each greeting uses a UNIQUE and DISTINCT agent name. Include both a greeting and an offer to assist, mirroring Ulta Beauty's professional style. Replace '[Agent Name]' with a diverse range of realistic first names (both female and male). The agent names should be diverse, representing various genders and cultural backgrounds.
    Example output: ["Hello, thank you for calling Ulta Beauty. My name is Jose. How may I help you today?", "Welcome to Ulta Beauty! This is Ahmad. How can I assist you?", "Good morning/afternoon/evening! Thank you for contacting Ulta Beauty. This is David. What can I do for you today?","Hello, this is Ava with Ulta Beauty. How can I assist help you beautify your day?","It's a pleasure to assist you at Ulta Beauty. My name is Priya. How may I help you today?"]
    Do not generate anything else other than the list."""
    agent_name_prompt = """Generate a list of 50 DISTINCT and culturally diverse names that could be used for Ulta Beauty customer service representatives. Include names that are uncommon or less frequent, representing a wide range of ethnicities and genders. Ensure no repetition or similarity. 
    Example output: ["Jessica", "Priya", "Zaid", "Diego", "Luis"]."""
    closing_prompt = """List 5 different ways an Ulta Beauty customer service representative might end a call with a customer.Your output should mirror this structure. 
    Example output: ["Closing greeting1","Closing greeting2","Closing greeting3","Closing greeting4","Closing greeting5"].Do not generate anything else other than the list"""
    closing_response_prompt = """List 5 different ways a customer might respond to an Ulta Beauty customer service representative ending a call based off the last closing prompt message from the agent.
    Example output: ["Thank you for your help. Have a great day!", "You're welcome. Goodbye.", "No problem. Thanks for your time."]
    Do not generate anything else other than the list."""

    def ask_for_list(prompt):
        # One round trip to the model, parsed into a clean list of strings.
        return _parse_list_response(model.generate_content(prompt).text)

    services_text = ask_for_list(service_prompt)
    problems_text = ask_for_list(problem_prompt)

    # Filter out unprofessional greetings and ensure Ulta Beauty is mentioned
    # in the initial agent greeting.
    greetings_text = [
        greeting
        for greeting in ask_for_list(greeting_prompt)
        if "ulta beauty" in greeting.lower()
        and any(keyword in greeting.lower() for keyword in ("help", "assist", "welcome"))
    ]

    agent_names_text = ask_for_list(agent_name_prompt)
    closing_remarks_text = ask_for_list(closing_prompt)
    closing_responses_text = ask_for_list(closing_response_prompt)

    return services_text, problems_text, greetings_text, closing_remarks_text, closing_responses_text, agent_names_text

class _TranscriptRejected(Exception):
    """Raised when a generated transcript fails a quality check and must be regenerated."""


# Window from which each conversation's start time is drawn (change as needed).
_LOG_WINDOW_START = datetime.datetime(year=2024, month=1, day=1)
_LOG_WINDOW_END = datetime.datetime(year=2024, month=9, day=3, hour=23, minute=59, second=59)

# Customer replies that mean "nothing more is needed".
_NO_FURTHER_HELP_REPLIES = frozenset(
    ["no", "no thanks", "that's all", "that's it", "i'm good", "nothing else", "okay"]
)

# Keywords/phrases that mark an acceptable opening line from the agent.
_GREETING_KEYWORDS = ("help", "assist", "welcome", "hello", "hi", "good morning", "good afternoon", "good evening")

# Captures the first word after a self-introduction. Only one word is taken so
# that "this is Ava with Ulta Beauty" yields "Ava" rather than the whole phrase.
_AGENT_NAME_PATTERN = re.compile(
    r"\b(?:my name is|this is|i'm)\s+([^\W\d_][\w'-]*)",
    re.IGNORECASE,
)


def _random_timestamp_usec():
    """Return a random instant inside the configured window, in epoch microseconds."""
    random_datetime = _LOG_WINDOW_START + random.random() * (_LOG_WINDOW_END - _LOG_WINDOW_START)
    return int(random_datetime.timestamp() * 1000000)


def _raise_if_safety_blocked(response):
    """Raise if any candidate of a model response was stopped by the safety filter."""
    for candidate in response.candidates:
        # finish_reason is an enum, so it must be compared with the enum member
        # (comparing it with a string is never true).
        if candidate.finish_reason == FinishReason.SAFETY:
            raise RuntimeError("Safety filter triggered. Retrying...")


def _strip_speaker_label(line, label):
    """Return the spoken text of a transcript line, without its 'Label:' prefix."""
    return line[len(label):].lstrip(" \t:").strip()


def _parse_transcript(transcript):
    """Split a raw transcript into entries and collect the flags used for quality checks.

    Only lines starting with "customer" or "agent" (case-insensitive) are kept.
    An agent line containing "anything else" is dropped from the entries when
    the customer has already declined further help.

    Returns:
        ``(entries, customer_said_no, short_customer_response,
        agent_asked_anything_else, last_agent_line)``.
    """
    entries = []
    customer_said_no = False
    short_customer_response = False
    agent_asked_anything_else = False
    last_agent_line = ""

    for raw_line in transcript.splitlines():
        line = raw_line.strip()
        lowered = line.lower()
        if lowered.startswith("customer"):
            text = _strip_speaker_label(line, "customer")
            entries.append({"role": "CUSTOMER", "text": text, "user_id": 1})
            # Both checks look at what the customer said, not at the labelled line.
            if text.lower().rstrip(".!?") in _NO_FURTHER_HELP_REPLIES:
                customer_said_no = True
            if len(text) <= 3:
                short_customer_response = True
        elif lowered.startswith("agent"):
            text = _strip_speaker_label(line, "agent")
            last_agent_line = text
            asks_anything_else = "anything else" in text.lower()
            if asks_anything_else:
                agent_asked_anything_else = True
            # Skip the "anything else" question once the customer has said no.
            if not (customer_said_no and asks_anything_else):
                entries.append({"role": "AGENT", "text": text, "user_id": 2})

    return entries, customer_said_no, short_customer_response, agent_asked_anything_else, last_agent_line


def _validate_transcript(entries, services, customer_said_no, short_customer_response,
                         agent_asked_anything_else, last_agent_line):
    """Raise ``_TranscriptRejected`` (with the reason) if the transcript looks unnatural."""
    # The opening agent line must resemble a greeting (fuzzy keyword match).
    if entries and entries[0]["role"] == "AGENT":
        agent_greeting = entries[0]["text"].lower()
        if not any(fuzz.partial_ratio(keyword, agent_greeting) > 80 for keyword in _GREETING_KEYWORDS):
            raise _TranscriptRejected("Agent's greeting is missing or incomplete (fuzzy matching)")

    # Agent turns among the first three entries must not name a service.
    for entry in entries[:3]:
        if entry["role"] == "AGENT" and any(
            product.lower() in entry["text"].lower() for product in services
        ):
            raise _TranscriptRejected("Agent assumed the product too early")

    if not entries:
        raise _TranscriptRejected("Blank output")
    if agent_asked_anything_else and last_agent_line == entries[-1]["text"]:
        raise _TranscriptRejected("Customer didn't answer 'anything else?'")
    if customer_said_no and "anything else" in last_agent_line.lower():
        raise _TranscriptRejected("Agent asked again after customer said no")
    if short_customer_response:
        raise _TranscriptRejected("Customer response too short")


def _extract_agent_name(entries):
    """Return the name from the first agent self-introduction, or None if there is none."""
    for entry in entries:
        if entry["role"] == "AGENT":
            match = _AGENT_NAME_PATTERN.search(entry["text"])
            if match:
                return match.group(1).rstrip("'-")
    return None


def _build_call_log_json(service, problem_description, customer_behavior, services,
                         greetings, closing_remarks, timestamp, response_delay):
    """Generate one transcript with the model and return it as a JSON string.

    Raises:
        _TranscriptRejected: the transcript was produced but failed a quality check.
        Exception: anything raised by the model call or response handling.
    """
    # Generate a natural problem statement
    problem_statement_prompt = f"""
    Rewrite this issue into a natural statement a customer would say to describe their problem with their {service}: "{problem_description}" at the BEGINNING of their call. Make sure the customer provides context to their issue.
    """
    problem_statement_response = model.generate_content(problem_statement_prompt)
    _raise_if_safety_blocked(problem_statement_response)
    customer_statement = problem_statement_response.text.strip()

    # --- Generate metadata (the agent name is extracted later, from the transcript) ---
    call_id = random.randint(1000, 999999)
    channel = random.choice(["phone", "chat"])  # Randomly chooses phone or chat
    agent_experience = random.choice(["junior", "senior", "manager", "supervisor", "trainee"])
    agent_location = random.choice(["US", "India", "EMEA"])
    customer_id = random.randint(100, 9999)
    customer_sentiment = "positive" if customer_behavior == "polite and patient" else "negative"  # Example inference
    customer_region = random.choice(["East Coast", "West Coast", "Central"])

    # Pick the greeting and closing once so the format example and the
    # additional instructions in the prompt agree with each other.
    greeting = random.choice(greetings)
    closing_remark = random.choice(closing_remarks)

    prompt_template = f"""
            Create a customer support transcript where an Ulta Beauty agent helps a customer with their {service}.
            The conversation starts with the agent's greeting.
            Adhere strictly to this format:
            Agent: {greeting}
            Customer: {customer_statement}
            Agent: [Agent's response acknowledging the problem and starting troubleshooting]
            Customer: [Customer's response to the troubleshooting steps]
            Agent: [Further troubleshooting or resolution steps]
            ... (continue the back-and-forth as needed)
            Agent: [Resolution of the issue or escalation]
            Agent: {closing_remark}
            Customer: [Customer's natural response acknowledging resolution and ending the call]

            Additional instructions:

            *   Use "{greeting}" for the agent's greeting.
            *   Use "{closing_remark}" for the agent's closing remark.
            *   The conversation MUST include troubleshooting steps and a resolution.
            *   Focus on a single core issue the customer is experiencing
            *   The customer is "{customer_behavior}"
            """

    print(prompt_template)

    response = model.generate_content(prompt_template, generation_config=generation_config)
    # Check every candidate for a safety block before reading the text.
    _raise_if_safety_blocked(response)
    transcript = response.text

    (entries, customer_said_no, short_customer_response,
     agent_asked_anything_else, last_agent_line) = _parse_transcript(transcript)
    _validate_transcript(entries, services, customer_said_no, short_customer_response,
                         agent_asked_anything_else, last_agent_line)

    # Space the turns evenly, starting from the conversation's start time.
    for i, entry in enumerate(entries):
        entry["start_timestamp_usec"] = timestamp + response_delay * i

    agent_name = _extract_agent_name(entries)
    if agent_name is None:
        raise _TranscriptRejected("No agent greeting found in the transcript")

    # Generate an Agent ID (you can customize this logic)
    agent_id = random.randint(1000, 9999)

    # Add agent metadata to each agent entry
    for entry in entries:
        if entry["role"] == "AGENT":
            entry["agent_name"] = agent_name
            entry["agent_id"] = agent_id

    metadata = {
        "call_id": call_id,
        "channel": channel,
        "agent_experience": agent_experience,
        "agent_location": agent_location,
        "customer_id": customer_id,
        "customer_sentiment": customer_sentiment,
        "customer_region": customer_region,
        "agent_id": agent_id,
        # ... add more metadata as needed
    }

    call_log = {
        "entries": entries,
        "metadata": metadata,
    }
    return json.dumps(call_log, indent=4)


def generate_log(services, problems, greetings, closing_remarks, closing_responses, agent_names_buffer, max_retries=3, max_regeneration_retries=3, max_quality_retries=10):
    """Generate one synthetic call log as a JSON string.

    A service, a problem, a customer mood and a start time are picked at random,
    the model writes a transcript for them, and the transcript is parsed into
    timestamped entries plus call metadata. Transcripts that fail a quality
    check are thrown away and a new one is generated from fresh random choices.

    Args:
        services: Service/product descriptions to choose the call topic from.
        problems: Problem descriptions to choose the customer's issue from.
        greetings: Agent greetings to choose the opening line from.
        closing_remarks: Agent closing lines to choose from.
        closing_responses: Customer closing responses (not used when building
            the transcript; kept for interface compatibility).
        agent_names_buffer: Collection of agent names; when it holds fewer than
            two names the seed lists are regenerated and it is extended in place.
        max_retries: Attempts allowed per transcript when the model call or
            response handling raises; waits 2**attempt seconds between attempts.
        max_regeneration_retries: Attempts allowed to refill ``agent_names_buffer``.
        max_quality_retries: How many times a transcript rejected by a quality
            check may be regenerated before the call log is skipped.

    Returns:
        The call log as a JSON string (indent=4), or None if it could not be
        generated within the retry limits.
    """
    if len(agent_names_buffer) < 2:
        for _ in range(max_regeneration_retries):
            print("Regenerating agent names and greetings for more variety...")
            services, problems, greetings, closing_remarks, closing_responses, new_agent_names = generate_lists()  # Regenerate greetings as well
            agent_names_buffer.extend(new_agent_names)
            if len(agent_names_buffer) >= 2:
                break
        else:
            print("Max regeneration retries reached. Skipping this call log.")
            return None

    # Bounded loop instead of recursion: each pass starts over with new choices.
    for _ in range(max_quality_retries + 1):
        service = random.choice(services)
        problem_description = random.choice(problems)
        timestamp = _random_timestamp_usec()
        response_delay = random.randint(5000000, 10000000)  # 5 to 10 seconds for all roles
        customer_behavior = random.choice(["polite and patient", "frustrated and impatient", "angry and demanding", "confused and unsure"])

        for retry_count in range(max_retries):
            try:
                return _build_call_log_json(
                    service, problem_description, customer_behavior, services,
                    greetings, closing_remarks, timestamp, response_delay,
                )
            except _TranscriptRejected as rejection:
                print(f"{rejection}. Retrying...")
                break  # start over with a fresh service/problem/mood
            except Exception as e:
                print(f"Error generating log (attempt {retry_count + 1}/{max_retries}): {e}")
                if retry_count < max_retries - 1:
                    time.sleep(2 ** retry_count)
                else:
                    print("Max retries reached. Skipping this call log.")
                    return None
        else:
            # Only reached when max_retries <= 0: no attempt was allowed at all.
            return None

    print("Too many rejected transcripts. Skipping this call log.")
    return None


# Main execution loop
#
# Builds the seed lists once, then generates NUM_CALL_LOG_FILES call logs and
# uploads each one to the GCS bucket below under a name that is not yet taken.

GCS_BUCKET_NAME = "ulta-enriched4"  # UPDATE WITH YOUR GCS BUCKET

services, problems, greetings, closing_remarks, closing, agent_names = generate_lists()
random.shuffle(agent_names)  # Shuffle the entire list once at the beginning

# Create a deque from the shuffled agent names
agent_names_buffer = deque(agent_names)

# The bucket handle does not change between uploads, so create it once.
bucket = storage_client.bucket(GCS_BUCKET_NAME)


# Retry mechanism for blob upload: retries on ServiceUnavailable for up to 60 seconds.
@retry.Retry(predicate=retry.if_exception_type(ServiceUnavailable), deadline=60)
def upload_blob(blob, payload):
    """Upload ``payload`` (a string) as the content of ``blob``."""
    blob.upload_from_string(payload)


def _first_unused_blob(base_filename):
    """Return ``(filename, blob)`` for the first name in the bucket that is not taken.

    Tries ``base_filename`` first, then ``<stem>_1.json``, ``<stem>_2.json``, ...
    """
    stem = os.path.splitext(base_filename)[0]
    filename = base_filename
    blob = bucket.blob(filename)
    counter = 1
    while blob.exists():  # Check for existing blob
        filename = f"{stem}_{counter}.json"
        blob = bucket.blob(filename)  # Update the blob reference
        counter += 1
    return filename, blob


for i in range(NUM_CALL_LOG_FILES):
    json_object = generate_log(services, problems, greetings, closing_remarks, closing, agent_names_buffer)

    if json_object is None:  # Only save if generation was successful
        print(f"Skipping call log {i} due to an error.")
        continue

    base_filename = f"gem_chat_{i}_np.json"  # CHANGE FILE NAME AS NEEDED
    try:
        # The existence check is inside the try as well, so a storage error on
        # one file is reported and the run moves on instead of aborting.
        filename, blob = _first_unused_blob(base_filename)
        upload_blob(blob, json_object)
        print(f"Uploaded {filename} to GCS bucket {GCS_BUCKET_NAME}")
    except Exception as e:  # Catch any remaining errors
        print(f"Upload failed after retries: {e}")