def generate_interactions()

in generators/generate_interactions_personalize.py [0:0]
187 lines of code
25 McCabe index (conditional complexity)

def generate_interactions(out_interactions_filename, users_df, products_df):
    """Simulate shopping behaviour and write it to an interactions CSV file.

    Repeatedly picks a random user, picks a product category from the user's
    persona, samples a product and writes a ``ProductViewed`` row, optionally
    followed by ``ProductAdded``, ``CartViewed``, ``CheckoutStarted`` and
    ``OrderCompleted`` rows so that each of those event types tracks its
    configured fraction of product views. Generation stops once at least
    ``min_interactions`` rows have been written.

    Besides its arguments the function reads these module-level settings:
    ``RANDOM_SEED``, ``FIRST_TIMESTAMP``, ``LAST_TIMESTAMP``,
    ``min_interactions``, ``PROGRESS_MONITOR_SECONDS_UPDATE``,
    ``CATEGORY_AFFINITY_PROBS``, ``PRODUCT_AFFINITY_N``,
    ``NORMALISE_PER_PRODUCT_WEIGHT``, ``DISCOUNT_PROBABILITY``,
    ``DISCOUNT_PROBABILITY_WITH_PREFERENCE`` and the funnel ratios
    ``product_added_percent``, ``cart_viewed_percent``,
    ``checkout_started_percent`` and ``order_completed_percent``.

    Args:
        out_interactions_filename: Path of the CSV file to write. Its parent
            directory is created if it does not exist.
        users_df: DataFrame with the columns ``id``, ``persona``,
            ``discount_persona`` and ``gender``. Rows are looked up with
            ``.loc`` on the integers ``0 .. len(users_df) - 1``, so the index
            must contain those labels. A persona is a ``_``-separated list of
            categories, each optionally written as ``category:style``.
        products_df: DataFrame with the columns ``id``, ``category``,
            ``price`` and ``gender_affinity`` (plus ``style`` when personas
            carry styles).

    Raises:
        AssertionError: If the computed per-interaction time step is not
            positive.
        ValueError: If a user has an unknown ``discount_persona``.

    Side effects:
        Seeds ``random`` and ``numpy.random``, prints progress to stdout and
        copies every local variable into the module globals when finished.
    """

    # Count of interactions generated for each event type
    product_viewed_count = 0
    discounted_product_viewed_count = 0
    product_added_count = 0
    discounted_product_added_count = 0
    cart_viewed_count = 0
    discounted_cart_viewed_count = 0
    checkout_started_count = 0
    discounted_checkout_started_count = 0
    order_completed_count = 0
    discounted_order_completed_count = 0

    Path(out_interactions_filename).parents[0].mkdir(parents=True, exist_ok=True)

    # ensure determinism
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)

    start_time_progress = int(time.time())
    next_timestamp = FIRST_TIMESTAMP
    seconds_increment = int((LAST_TIMESTAMP - FIRST_TIMESTAMP) / min_interactions)
    next_update_progress = start_time_progress + PROGRESS_MONITOR_SECONDS_UPDATE/2

    # NOTE: the mean is truncated to a whole number; it is both printed and
    # used as the threshold for the 'lower_priced_products' persona.
    average_product_price = int(products_df.price.mean())
    print('Average product price: ${:.2f}'.format(average_product_price))

    if seconds_increment <= 0:
        raise AssertionError(f"Should never happen: {seconds_increment} <= 0")

    print('Minimum interactions to generate: {}'.format(min_interactions))
    print('Starting timestamp: {} ({})'.format(next_timestamp,
                                               time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(next_timestamp))))
    print('Seconds increment: {}'.format(seconds_increment))

    print("Generating interactions... (this may take a few minutes)")
    interactions = 0

    # Cache of filtered product frames, keyed by the filter that produced them
    subsets_cache = {}

    user_to_product = defaultdict(set)

    category_affinity_probs = np.array(CATEGORY_AFFINITY_PROBS)

    print("Writing interactions to: {}".format(out_interactions_filename))

    # newline='' is what the csv module requires of the file object; without
    # it every row is followed by a blank line on Windows.
    with open(out_interactions_filename, 'w', newline='') as outfile:
        f = csv.writer(outfile)
        f.writerow(["ITEM_ID", "USER_ID", "EVENT_TYPE", "TIMESTAMP", "DISCOUNT"])

        # Share of the catalogue that falls in each category
        category_frequencies = products_df.category.value_counts()
        category_frequencies /= sum(category_frequencies.values)

        interaction_product_counts = defaultdict(int)

        # Here we build up a list for each category/gender, of product
        # affinities. The product affinity is keyed by one product,
        # so we do not end up with exactly PRODUCT_AFFINITY_N sized
        # cliques. They overlap a little over multiple users
        # - that is why PRODUCT_AFFINITY_N
        # can be a little bit lower than a desired clique size.
        all_categories = products_df.category.unique()
        product_affinities_bycatgender = {}
        for category in all_categories:
            for gender in ['M', 'F']:
                products_cat = products_df.loc[products_df.category==category]
                products_cat = products_cat.loc[
                    products_cat.gender_affinity.isnull()|(products_cat.gender_affinity==gender)].id.values
                # We ensure that all products have PRODUCT_AFFINITY_N products that lead into it
                # and PRODUCT_AFFINITY_N products it leads to
                affinity_matrix = sum([np.roll(np.identity(len(products_cat)), [0, i], [0, 1])
                                       for i in range(PRODUCT_AFFINITY_N)])
                np.random.shuffle(affinity_matrix)
                affinity_matrix = affinity_matrix.T
                np.random.shuffle(affinity_matrix)
                affinity_matrix = affinity_matrix.astype(bool)  # use as boolean index
                # Every product is always related to itself
                affinity_matrix = affinity_matrix | np.identity(len(products_cat), dtype=bool)

                # NOTE: the name (sic, "affinities") is left untouched because
                # all locals are exported to the module globals at the end.
                product_infinities = [products_cat[row] for row in affinity_matrix]
                product_affinities_bycatgender[(category, gender)] = {
                    products_cat[i]: products_df.loc[products_df.id.isin(product_infinities[i])]
                    for i in range(len(products_cat))}

        user_category_to_first_prod = {}

        while interactions < min_interactions:
            if time.time() > next_update_progress:
                # Building the affinity tables can outlast the first progress
                # deadline; with nothing generated yet the rate would be zero
                # and the estimate would divide by zero, so skip the message.
                if interactions > 0:
                    rate = interactions / (time.time() - start_time_progress)
                    to_go = (min_interactions - interactions) / rate
                    print('Generated {} interactions so far (about {} seconds to go)'.format(interactions, int(to_go)))
                next_update_progress += PROGRESS_MONITOR_SECONDS_UPDATE

            # Pick a random user
            user = users_df.loc[random.randint(0, users_df.shape[0] - 1)]

            # Determine category affinity from user's persona
            persona = user['persona']
            # If user persona has sub-categories, we will use those sub-categories to find products for users to partake
            # in interactions with. Otherwise, we will use the high-level categories.
            has_subcategories = ':' in user['persona']
            preferred_categories_and_subcats = persona.split('_')
            preferred_highlevel_categories = [catstring.split(':')[0] for catstring in preferred_categories_and_subcats]

            # Blend the persona's category preference order with how many
            # products each of those categories actually contains.
            p_normalised = (category_affinity_probs * category_frequencies[preferred_highlevel_categories].values)
            p_normalised /= p_normalised.sum()
            p = NORMALISE_PER_PRODUCT_WEIGHT * p_normalised + (1-NORMALISE_PER_PRODUCT_WEIGHT) * category_affinity_probs

            # Select category based on weighted preference of category order.
            chosen_category_ind = np.random.choice(list(range(len(preferred_categories_and_subcats))), 1, p=p)[0]
            category = preferred_highlevel_categories[chosen_category_ind]

            discount_persona = user['discount_persona']

            gender = user['gender']

            # Defined for every user so that it is never unbound or left over
            # from a previous iteration; only used for users without styles.
            usercat_key = (user['id'], category)

            if has_subcategories:
                # if there is a preferred style we choose from those products with this style and category
                # but we ignore gender.
                # We also do not attempt to keep balance across categories.
                style = preferred_categories_and_subcats[chosen_category_ind].split(':')[1]
                cachekey = ('category-style', category, style)
                prods_subset_df = subsets_cache.get(cachekey)

                if prods_subset_df is None:
                    # Select products of the chosen category that have the chosen style
                    prods_subset_df = products_df.loc[(products_df['category']==category) &
                                                      (products_df['style']==style)]
                    # Update cache for quicker lookup next time
                    subsets_cache[cachekey] = prods_subset_df
            else:
                # We are only going to use the machinery to keep things balanced
                # if there is no style appointed on the user preferences.
                # Here, in order to keep the number of products that are related to a product,
                # we restrict the size of the set of products that are recommended to an individual
                # user - in effect, the available subset for a particular category/gender
                # depends on the first product selected, which is selected as per previous logic
                # (looking at category affinities and gender)
                if usercat_key in user_category_to_first_prod:
                    # If a first product is already selected, we use the product affinities for that product
                    # To provide the list of products to select from
                    first_prod = user_category_to_first_prod[usercat_key]
                    prods_subset_df = product_affinities_bycatgender[(category, gender)][first_prod]
                else:
                    # If the user has not yet selected a first product for this category
                    # we do it by choosing between all products for gender.

                    # First, check if subset data frame is already cached for category & gender
                    cachekey = ('category-gender', category, gender)
                    prods_subset_df = subsets_cache.get(cachekey)
                    if prods_subset_df is None:
                        # Select products from selected category without gender affinity or that match user's gender
                        prods_subset_df = products_df.loc[(products_df['category'] == category) & (
                                    (products_df['gender_affinity'] == gender) | (products_df['gender_affinity'].isnull()))]
                        # Update cache
                        subsets_cache[cachekey] = prods_subset_df

            # Pick a random product from the selected subset
            product = prods_subset_df.sample().iloc[0]

            interaction_product_counts[product.id] += 1

            user_to_product[user['id']].add(product['id'])

            # Remember the first product per (user, category); only users
            # without styles ever read this mapping back.
            if not has_subcategories and usercat_key not in user_category_to_first_prod:
                user_category_to_first_prod[usercat_key] = product['id']

            # Decide if the product the user is interacting with is discounted
            if discount_persona == 'discount_indifferent':
                discounted = random.random() < DISCOUNT_PROBABILITY
            elif discount_persona == 'all_discounts':
                discounted = random.random() < DISCOUNT_PROBABILITY_WITH_PREFERENCE
            elif discount_persona == 'lower_priced_products':
                if product.price < average_product_price:
                    discounted = random.random() < DISCOUNT_PROBABILITY_WITH_PREFERENCE
                else:
                    discounted = random.random() < DISCOUNT_PROBABILITY
            else:
                raise ValueError(f'Unable to handle discount persona: {discount_persona}')

            this_timestamp = next_timestamp + random.randint(0, seconds_increment)

            # Products that lag behind in interaction count get extra
            # interaction sets so that coverage evens out across products.
            # The count for the current product was just incremented, so the
            # list below is never empty.
            num_interaction_sets_to_insert = 1
            prodcnts = list(interaction_product_counts.values())
            prodcnts_max = max(prodcnts)
            prodcnts_min = min(prodcnts)
            prodcnts_avg = sum(prodcnts) / len(prodcnts)
            if interaction_product_counts[product.id] * 2 < prodcnts_max:
                num_interaction_sets_to_insert += 1
            if interaction_product_counts[product.id] < prodcnts_avg:
                num_interaction_sets_to_insert += 1
            if interaction_product_counts[product.id] == prodcnts_min:
                num_interaction_sets_to_insert += 1

            discount_context = 'Yes' if discounted else 'No'

            for _ in range(num_interaction_sets_to_insert):

                f.writerow([product['id'],
                            user['id'],
                            'ProductViewed',
                            this_timestamp,
                            discount_context])
                next_timestamp += seconds_increment
                product_viewed_count += 1
                interactions += 1

                if discounted:
                    discounted_product_viewed_count += 1

                # Each follow-up event is emitted only while its running
                # total is below its target share of product views.
                if product_added_count < int(product_viewed_count * product_added_percent):
                    this_timestamp += random.randint(0, int(seconds_increment / 2))
                    f.writerow([product['id'],
                                user['id'],
                                'ProductAdded',
                                this_timestamp,
                                discount_context])
                    interactions += 1
                    product_added_count += 1

                    if discounted:
                        discounted_product_added_count += 1

                if cart_viewed_count < int(product_viewed_count * cart_viewed_percent):
                    this_timestamp += random.randint(0, int(seconds_increment / 2))
                    f.writerow([product['id'],
                                user['id'],
                                'CartViewed',
                                this_timestamp,
                                discount_context])
                    interactions += 1
                    cart_viewed_count += 1
                    if discounted:
                        discounted_cart_viewed_count += 1

                if checkout_started_count < int(product_viewed_count * checkout_started_percent):
                    this_timestamp += random.randint(0, int(seconds_increment / 2))
                    f.writerow([product['id'],
                                user['id'],
                                'CheckoutStarted',
                                this_timestamp,
                                discount_context])
                    interactions += 1
                    checkout_started_count += 1
                    if discounted:
                        discounted_checkout_started_count += 1

                if order_completed_count < int(product_viewed_count * order_completed_percent):
                    this_timestamp += random.randint(0, int(seconds_increment / 2))
                    f.writerow([product['id'],
                                user['id'],
                                'OrderCompleted',
                                this_timestamp,
                                discount_context])
                    interactions += 1
                    order_completed_count += 1
                    if discounted:
                        discounted_order_completed_count += 1

    print("Interactions generation done.")
    print(f"Total interactions: {interactions}")
    print(f"Total product viewed: {product_viewed_count} ({discounted_product_viewed_count})")
    print(f"Total product added: {product_added_count} ({discounted_product_added_count})")
    print(f"Total cart viewed: {cart_viewed_count} ({discounted_cart_viewed_count})")
    print(f"Total checkout started: {checkout_started_count} ({discounted_checkout_started_count})")
    print(f"Total order completed: {order_completed_count} ({discounted_order_completed_count})")

    # NOTE(review): exporting every local into the module namespace is kept
    # for backward compatibility (console / IPython inspection after a run),
    # but it can silently overwrite module-level names that share a name
    # with a local of this function.
    globals().update(locals())   # This can be used for inspecting in console after script ran or if run with ipython.
    print('Generation script finished')