in identity-resolution/notebooks/identity-graph/nepytune/cli/add.py [0:0]
def generate_identity_groups(persistent_ids_file, distribution, dst, _seed=None):
    """Write facts about identity_group mapping.

    Reads persistent ids from ``persistent_ids_file``, shuffles them and
    partitions them into consecutive groups whose sizes are drawn at random
    from ``distribution``. One JSON object per group is written to ``dst``,
    one per line, with the keys ``igid``, ``type`` and ``persistentIds``.

    Args:
        persistent_ids_file: Path of the input file. It is parsed with
            ``json_lines_file`` and every yielded record must have a
            ``"pid"`` key.
        distribution: Mapping of group size -> sampling weight, passed to
            ``random.choices``.
        dst: Path of the output file; it is opened in ``"w"`` mode, so any
            existing content is replaced.
        _seed: Optional seed for the global ``random`` generator. When it is
            not ``None`` the generator is seeded with this value, making the
            shuffle and the sampled sizes reproducible.

    Note:
        When the sampled size is 0, a record with an empty ``persistentIds``
        list is still written and exactly one persistent id is skipped.
        NOTE(review): confirm that emitting these empty groups is intended.
    """
    if _seed is not None:
        # Use the caller-supplied seed (not the wall clock) so that passing
        # the same seed reproduces the same output.
        random.seed(_seed)

    with open(persistent_ids_file) as f_h:
        pids = [data["pid"] for data in json_lines_file(f_h)]
    random.shuffle(pids)

    # Split the mapping into parallel tuples as expected by random.choices.
    sizes, weights = zip(*distribution.items())

    total = len(pids)
    i = 0
    with open(dst, "w") as f_h:
        while i < total:
            size, *_ = random.choices(sizes, weights=weights)
            # Never read past the end of the list; the last group may be
            # smaller than the sampled size.
            size = min(size, total - i)
            persistent_ids = [pids[i + j] for j in range(size)]
            type_ = "household" if len(persistent_ids) < COMPANY_MIN_SIZE else "company"
            f_h.write(
                json.dumps(
                    {
                        "igid": hash_(persistent_ids),
                        "type": type_,
                        "persistentIds": persistent_ids,
                    }
                )
                + "\n"
            )
            # advance even if size was 0, meaning that persistent id
            # does not belong to any identity_group
            i += size or 1