def parse_vectors_from_directory_memory_friendly()

in paq/paq_utils.py [0:0]


def parse_vectors_from_directory_memory_friendly(embeddings_dir, size=None):
    """Load every vector shard under ``embeddings_dir`` into one preallocated tensor.

    The shard paths come from ``get_vectors_file_paths_in_vector_directory``.
    Each shard is read with ``torch.load`` and copied into its row range of a
    single output tensor, so the shards are never all resident at once and no
    concatenation copy is needed.

    Args:
        embeddings_dir: Directory handed to
            ``get_vectors_file_paths_in_vector_directory``.
        size: Total number of rows across all shards. When ``None`` an extra
            pass loads every shard once just to sum the row counts.

    Returns:
        A tensor of shape ``(size, dim)`` where ``dim`` is ``shape[1]`` of the
        first shard. It is allocated with ``torch.zeros`` without an explicit
        dtype, so shard values are converted to torch's default dtype on
        assignment.

    Raises:
        ValueError: If no vector files are found, or a path's final
            ``.``-separated component is not an integer.
        AssertionError: If the ``j``-th path does not end in ``.{j}``, or if
            the number of rows copied differs from ``size``.
    """
    paths = get_vectors_file_paths_in_vector_directory(embeddings_dir)
    num_paths = len(paths)
    if num_paths == 0:
        # Without this guard the function would fall through to `out.shape`
        # with `out` still None and die with an opaque AttributeError.
        raise ValueError(f'No vector files found in {embeddings_dir}')

    # Check shard ordering before any (expensive) load so a bad file list fails
    # fast. AssertionError is kept as the exception type for backward
    # compatibility, but raised explicitly so the check survives `python -O`.
    for j, p in enumerate(paths):
        if int(p.split('.')[-1]) != j:
            raise AssertionError((p, j))

    if size is None:
        size = 0
        for j, p in enumerate(paths):
            logger.info(f'Loading vectors from {p} ({j+1} / {len(paths)}) to find total num vectors')
            # Do not bind the shard to a name: it can be freed as soon as its
            # row count is read, before the next shard is loaded.
            size += torch.load(p).shape[0]

    out = None
    offset = 0
    for j, p in enumerate(paths):
        logger.info(f'Loading vectors from {p} ({j+1} / {len(paths)})')
        m = torch.load(p)

        if out is None:
            # The first shard fixes the vector dimensionality.
            out = torch.zeros(size, m.shape[1])
        num_rows = m.shape[0]
        out[offset: offset + num_rows] = m
        offset += num_rows
        # Drop the shard now so it is not kept alive while the next one loads.
        del m

    if offset != size:
        # Explicit raise (not `assert`) so a wrong `size` is never silently
        # accepted under `python -O`, which would return zero-padded rows.
        raise AssertionError(f'loaded {offset} vectors but expected {size}')
    logger.info(f'loaded index of shape {out.shape}')

    return out