in identity-resolution/notebooks/identity-graph/nepytune/usecase/similar_audience.py [0:0]
def recommend_similar_audience(
    g, website_url, categories_limit=3, search_time_limit_in_seconds=15
):
    """Recommend users similar to the audience of the given website.

    The work is done in two stages:

    1. For every persistent identity that visited ``website_url``, count the
       ``categoryCode`` values of the vertices that link to the pages this
       identity visited. The counts are averaged per category across the
       whole audience and the ``categories_limit`` categories with the
       highest mean are kept. This stage is executed eagerly (``toList``).
    2. Starting from ``websiteGroup`` vertices tagged with one of those
       categories, collect persistent identities that have at least one
       visit whose ``url`` differs from ``website_url`` and whose own count
       is greater than the integer-truncated audience mean in at least one
       of the selected categories.

    Args:
        g: Gremlin graph traversal source used to build both queries.
        website_url: Identifier of the website vertex (passed to ``g.V``);
            it is also compared against the ``url`` property of visited
            vertices in the second stage.
        categories_limit: Number of most popular categories to consider.
        search_time_limit_in_seconds: Time budget for the candidate search;
            converted to milliseconds and passed to ``timeLimit``.

    Returns:
        A traversal that has not been iterated yet. When iterated it yields
        the ``uid`` values of the vertices reached through ``has_identity``
        from each matching persistent identity.
    """
    audience_profile = (
        g.V(website_url)
        .in_("visited")
        .in_("has_identity").dedup()
        .hasLabel("persistentId")
        .group().by()
        .by(
            out("has_identity").out("visited").in_("links_to")
            .groupCount().by("categoryCode")
        )
        .select(Column.values).unfold().unfold()
        .group().by(Column.keys)
        .by(select(Column.values).mean()).unfold()
        .order().by(Column.values, Order.desc)
        .limit(categories_limit)
    )

    # Every result is a mapping of category code -> mean count. Merging them
    # in arrival order keeps the "most popular first" ordering in the dict.
    most_popular_categories = {}
    for category in audience_profile.toList():
        most_popular_categories.update(category)

    # NOTE(review): when the website has no audience the dict is empty, so
    # project() and or_() below receive no arguments - confirm how the server
    # handles such a traversal before relying on this for unknown websites.

    # Per-user statistics: category counts projected onto the selected
    # categories. One by() modulator is attached per projected key below.
    guy_stats_subquery = (
        out("has_identity")
        .out("visited").in_("links_to")
        .groupCount().by("categoryCode")
        .project(*most_popular_categories)
    )

    conditions_subqueries = []
    for category_code, mean_count in most_popular_categories.items():
        # Fall back to 0 when the user has no visits in this category.
        guy_stats_subquery = guy_stats_subquery.by(
            choose(select(category_code), select(category_code), constant(0))
        )
        # The user qualifies for this category when their count exceeds the
        # audience mean (truncated to an integer).
        conditions_subqueries.append(
            select(Column.values).unfold()
            .select(category_code)
            .is_(P.gt(int(mean_count)))
        )

    return (
        g.V()
        .hasLabel("websiteGroup")
        .has("categoryCode", P.within(list(most_popular_categories)))
        .out("links_to").in_("visited").dedup().in_("has_identity").dedup()
        .hasLabel("persistentId")
        .where(
            out("has_identity").out("visited")
            .has("url", P.neq(website_url))
        )
        .timeLimit(search_time_limit_in_seconds * 1000)
        .local(
            group().by().by(guy_stats_subquery)
            # Keep the user if any of the per-category conditions holds.
            .where(or_(*conditions_subqueries))
        )
        .select(Column.keys).unfold()
        .out("has_identity")
        .values("uid")
    )