in paq/retrievers/build_index.py [0:0]
def get_vector_sample(cached_embeddings_path, sample_fraction):
    """Collect a random subset of cached vectors plus two summary statistics.

    Chunks are streamed from ``parse_vectors_from_directory`` (called with
    ``as_chunks=True``). For each chunk the per-row sum of squares is taken
    over axis 1, the running maximum of those values is tracked, and the
    number of rows is added to a running total.

    NOTE(review): chunks are presumably 2-D torch tensors shaped
    (rows, dim) -- the code only requires ``**``, ``.sum(1)``, ``.shape``,
    ``len()``, list-of-int indexing and ``torch.cat`` compatibility.

    Args:
        cached_embeddings_path: Location handed straight through to
            ``parse_vectors_from_directory``.
        sample_fraction: Share of each chunk's rows to keep. Exactly ``1.0``
            keeps every chunk whole; otherwise ``int(len(chunk) *
            sample_fraction)`` row indices are drawn with ``random.sample``.

    Returns:
        A 3-tuple ``(vector_sample, max_phi, N)``: the concatenation of all
        per-chunk samples, the largest per-row sum of squares seen (``-1``
        is the starting value), and the total row count over all chunks.
    """
    total_rows = 0
    largest_sq_norm = -1
    kept_pieces = []
    keep_everything = sample_fraction == 1.0

    chunk_stream = parse_vectors_from_directory(cached_embeddings_path, as_chunks=True)
    for piece in chunk_stream:
        # Sum of squares along axis 1, i.e. one value per row of the chunk.
        row_sq_norms = (piece ** 2).sum(1)
        largest_sq_norm = max(largest_sq_norm, row_sq_norms.max())
        total_rows += piece.shape[0]

        if keep_everything:
            kept_pieces.append(piece)
            continue

        # Draw distinct row indices; the count is truncated toward zero.
        n_rows = len(piece)
        chosen_rows = random.sample(range(0, n_rows), int(n_rows * sample_fraction))
        kept_pieces.append(piece[chosen_rows])

    # Drop the reference to the chunk source before building the big tensor.
    del chunk_stream
    return torch.cat(kept_pieces), largest_sq_norm, total_rows