def get_vector_sample()

in paq/retrievers/build_index.py [0:0]


def get_vector_sample(cached_embeddings_path, sample_fraction):
    """Collect a random subsample of the cached embedding vectors.

    Iterates over the chunks produced by ``parse_vectors_from_directory``
    (called with ``as_chunks=True``). From every chunk it keeps
    ``int(len(chunk) * sample_fraction)`` rows chosen without replacement
    via ``random.sample``; when ``sample_fraction == 1.0`` the chunk is kept
    whole and no random draw is made.

    Args:
        cached_embeddings_path: Location handed to
            ``parse_vectors_from_directory``.
        sample_fraction: Fraction of each chunk's rows to keep.

    Returns:
        A tuple ``(vector_sample, max_phi, N)``:
        ``vector_sample`` is the ``torch.cat`` of the kept rows,
        ``max_phi`` is the largest row-wise sum of squares over every row
        seen (not only the sampled ones), and ``N`` is the total number of
        rows seen.
    """
    kept_chunks = []
    largest_sq_norm = -1  # starts below any sum of squares
    total_rows = 0

    # Iterating the call result directly means no extra reference to the
    # chunk source outlives the loop.
    for chunk in parse_vectors_from_directory(cached_embeddings_path, as_chunks=True):
        # Statistics are taken over the full chunk, before any subsampling.
        sq_norms = (chunk ** 2).sum(1)
        largest_sq_norm = max(largest_sq_norm, sq_norms.max())
        total_rows += chunk.shape[0]

        if sample_fraction != 1.0:
            n_rows = len(chunk)
            picked = random.sample(range(0, n_rows), int(n_rows * sample_fraction))
            chunk = chunk[picked]
        kept_chunks.append(chunk)

    return torch.cat(kept_chunks), largest_sq_norm, total_rows