def recommend_similar_audience()

in identity-resolution/notebooks/identity-graph/nepytune/usecase/similar_audience.py [0:0]


def recommend_similar_audience(g, website_url, categories_limit=3, search_time_limit_in_seconds=15):
    """Recommend an audience similar to the visitors of the given website.

    The website's own audience is profiled first: for every ``persistentId``
    that visited ``website_url``, visits are counted per ``categoryCode``
    (IAB category), and the counts are averaged per category across those
    identities. The ``categories_limit`` categories with the highest average
    form the reference profile. This profiling query is executed eagerly.

    Similar audience - users that visited pages in those categories, visited
    at least one page whose url differs from ``website_url``, and whose visit
    count exceeds the (integer-truncated) reference average in at least one
    of the categories.

    Args:
        g: Gremlin graph traversal source.
        website_url: Id of the website vertex; also compared against the
            ``url`` property of visited vertices.
        categories_limit: Number of most popular categories to use.
        search_time_limit_in_seconds: Time budget handed to ``timeLimit``
            (converted to milliseconds) for the candidate search.

    Returns:
        A traversal (not yet iterated) yielding ``uid`` values. When no
        category could be determined for the website, the traversal yields
        nothing.
    """
    average_guy = (
        g.V(website_url)
            .in_("visited")
            .in_("has_identity").dedup()
            .hasLabel("persistentId")
            # per persistent id: {categoryCode: number of visits}
            .group().by()
            .by(
                out("has_identity").out("visited").in_("links_to")
                .groupCount().by("categoryCode")
            )
            # flatten to single (categoryCode, count) entries across all users
            .select(Column.values).unfold().unfold()
            # average the counts per category and keep the top ones
            .group().by(Column.keys)
            .by(select(Column.values).mean()).unfold()
            .order().by(Column.values, Order.desc)
            .limit(categories_limit)
    )

    # Every result entry is expected to be a one-item {categoryCode: mean}
    # mapping; merge them, preserving the descending order of the query.
    most_popular_categories = {
        code: mean_visits
        for entry in average_guy.toList()
        for code, mean_visits in entry.items()
    }

    if not most_popular_categories:
        # Without categories the search below would be built with an empty
        # project() / or_(); nobody can match anyway, so return an empty
        # traversal of the same shape instead.
        return g.V().limit(0).values("uid")

    category_codes = list(most_popular_categories)

    # Per-user statistics: visit count for each reference category.
    guy_stats_subquery = (
        out("has_identity")
        .out("visited").in_("links_to")
        .groupCount().by("categoryCode")
        .project(*category_codes)
    )

    conditions_subqueries = []
    for code, mean_visits in most_popular_categories.items():
        # One by() modulator per projected key: the count, or 0 when the user
        # never visited that category.
        guy_stats_subquery = guy_stats_subquery.by(choose(select(code), select(code), constant(0)))
        # The user qualifies when the count exceeds the truncated average.
        conditions_subqueries.append(
            select(Column.values).unfold()
                .select(code)
                .is_(P.gt(int(mean_visits)))
        )

    return (
            g.V()
                .hasLabel("websiteGroup")
                .has("categoryCode", P.within(category_codes))
                .out("links_to").in_("visited").dedup().in_("has_identity").dedup()
                .hasLabel("persistentId")
                # keep identities that visited some page other than website_url
                .where(
                    out("has_identity").out("visited")
                           .has("url", P.neq(website_url))
                )
                .timeLimit(search_time_limit_in_seconds * 1000)
                .local(
                    group().by().by(guy_stats_subquery)
                    .where(or_(*conditions_subqueries))
                )
                .select(Column.keys).unfold()
                .out("has_identity")
                .values("uid")
    )