in data_measurements/embeddings/embeddings.py [0:0]
def finalize_node(node, nodes, min_cluster_size):
    """Recursively post-process a cluster-tree node in place.

    For `node` and every node reachable through its "children_ids":

    * the child ids are resolved through `nodes` and the resulting child
      nodes are stored under "children", sorted by descending "weight";
    * for nodes with "depth" > 0, "example_ids" is rebuilt as the
      concatenation of the children's "example_ids" (in that sorted order);
      nodes with "depth" == 0 must already carry their own "example_ids";
    * children whose "weight" is below `min_cluster_size` are then removed
      from "children". Their example ids stay in the parent's
      "example_ids", so a node whose children are all too small ends up
      with an empty "children" list.

    Args:
        node: the node dict to finalize; it is mutated in place. It must
            provide the keys "children_ids", "depth" and "weight".
        nodes: container indexed by child id that yields the node dicts
            (`nodes[cid]`).
        min_cluster_size: minimum "weight" a child needs in order to be
            kept as a direct link under "children".

    Returns:
        The same `node` object, after mutation.

    Raises:
        AssertionError: if the node's "weight" differs from the number of
            entries in its "example_ids".
        KeyError: if a required key is missing, e.g. a depth-0 node without
            "example_ids".
    """
    # Finalize the children first: their "example_ids" and "weight" must be
    # settled before this node can aggregate and sort them.
    node["children"] = sorted(
        [
            finalize_node(nodes[cid], nodes, min_cluster_size)
            for cid in node["children_ids"]
        ],
        key=lambda child: child["weight"],
        reverse=True,
    )
    if node["depth"] > 0:
        # Aggregate from *all* children, before the small ones are pruned,
        # so that no example id is lost.
        node["example_ids"] = [
            eid for child in node["children"] for eid in child["example_ids"]
        ]
    # Only keep direct links to children that are large enough.
    node["children"] = [
        child for child in node["children"] if child["weight"] >= min_cluster_size
    ]
    # Invariant check. The message is a plain string: using `print(...)` here
    # would evaluate to None and leave the AssertionError without a message.
    assert node["weight"] == len(node["example_ids"]), (
        f"node weight {node['weight']} does not match its "
        f"{len(node['example_ids'])} example ids (depth {node['depth']})"
    )
    return node