in mozetl/taar/taar_similarity.py [0:0]
def compute_clusters(addons_df, num_clusters, random_seed):
"""Performs user clustering by using add-on ids as features."""
# Build the stages of the pipeline. We need hashing to make the next
# steps work.
hashing_stage = HashingTF(inputCol="addon_ids", outputCol="hashed_features")
idf_stage = IDF(inputCol="hashed_features", outputCol="features", minDocFreq=1)
# As a future improvement, we may add a sane value for the minimum cluster size
# to BisectingKMeans (e.g. minDivisibleClusterSize). For now, just make sure
# to pass along the random seed if needed for tests.
kmeans_kwargs = {"seed": random_seed} if random_seed else {}
bkmeans_stage = BisectingKMeans(k=num_clusters, **kmeans_kwargs)
pipeline = Pipeline(stages=[hashing_stage, idf_stage, bkmeans_stage])
# Run the pipeline and compute the results.
model = pipeline.fit(addons_df)
return model.transform(addons_df).select(["client_id", "prediction"])