def generate_identity_groups()

in identity-resolution/notebooks/identity-graph/nepytune/cli/add.py [0:0]


def generate_identity_groups(persistent_ids_file, distribution, dst, _seed=None):
    """Write facts about identity_group mapping.

    Reads persistent ids from a JSON-lines file, shuffles them, and walks the
    shuffled list cutting it into consecutive groups whose sizes are drawn at
    random from ``distribution``. One JSON object per group is written to
    ``dst`` (one object per line) with the keys ``igid``, ``type`` and
    ``persistentIds``.

    Args:
        persistent_ids_file: Path of the input file, parsed with
            ``json_lines_file``; every record must contain a ``"pid"`` key.
        distribution: Non-empty mapping of group size -> relative weight.
            Keys are the candidate sizes (non-negative integers expected) and
            values are the weights passed to ``random.choices``.
        dst: Path of the output file; opened in ``"w"`` mode, so any existing
            content is overwritten.
        _seed: Optional seed for the module-level ``random`` generator. When
            given, the shuffle and the sequence of drawn sizes are
            reproducible for identical inputs. When ``None`` the generator
            state is left untouched.

    Raises:
        ValueError: If ``distribution`` is empty (nothing to unpack into
            sizes and weights).
    """
    if _seed is not None:
        # Seed with the caller-supplied value rather than the wall clock;
        # otherwise passing a seed would not make the output reproducible.
        random.seed(_seed)

    with open(persistent_ids_file) as f_h:
        pids = [data["pid"] for data in json_lines_file(f_h)]

    random.shuffle(pids)

    sizes, weights = zip(*distribution.items())
    total = len(pids)
    i = 0
    with open(dst, "w") as f_h:
        while i < total:
            size, *_ = random.choices(sizes, weights=weights)
            # Clamp the final group to the ids that are still unassigned.
            size = min(size, total - i)
            persistent_ids = pids[i : i + size]
            type_ = "household" if len(persistent_ids) < COMPANY_MIN_SIZE else "company"
            f_h.write(
                json.dumps(
                    {
                        "igid": hash_(persistent_ids),
                        "type": type_,
                        "persistentIds": persistent_ids,
                    }
                )
                + "\n"
            )
            # Advance even if size was 0, so the loop always terminates; the
            # skipped persistent id does not belong to any identity_group.
            # NOTE(review): a size of 0 still writes a record with an empty
            # "persistentIds" list above -- confirm consumers expect that.
            i += size or 1