in dpr/indexer/faiss_indexers.py [0:0]
def index_data(self, data: List[Tuple[object, np.ndarray]]) -> None:
    """Index all (db_id, vector) pairs, mapping inner-product search to L2.

    Every vector is extended with one auxiliary dimension equal to
    ``sqrt(phi - ||v||^2)``, where ``phi`` is the largest squared norm found
    in ``data``. ``phi`` must be known before any vector is added, so the
    whole data set has to be supplied in a single call.

    Args:
        data: Sequence of items whose first element is the database id and
            whose second element is the document vector (a numpy array).
            Any further elements of an item are ignored.

    Raises:
        RuntimeError: If ``self.phi`` is already positive, i.e. data was
            indexed by an earlier call.
    """
    n = len(data)

    # max norm is required before putting all vectors in the index to
    # convert inner product similarity to L2
    if self.phi > 0:
        raise RuntimeError(
            "DPR HNSWF index needs to index all data at once, "
            "results will be unpredictable otherwise."
        )

    phi = 0
    for item in data:
        phi = max(phi, (item[1] ** 2).sum())
    logger.info("HNSWF DotProduct -> L2 space phi=%s", phi)
    self.phi = phi

    # indexing in batches is beneficial for many faiss index types
    bs = int(self.buffer_size)
    for start in range(0, n, bs):
        batch = data[start : start + bs]
        db_ids = [item[0] for item in batch]

        # One row per document: shape (len(batch), dim).
        vectors = np.concatenate(
            [np.reshape(item[1], (1, -1)) for item in batch], axis=0
        )
        sq_norms = (vectors ** 2).sum(axis=1)
        # Clamp at zero: rounding may make a recomputed norm exceed phi by
        # a few ulps, and sqrt of a negative number would put NaN in the index.
        aux_dims = np.sqrt(np.maximum(phi - sq_norms, 0))
        hnsw_vectors = np.hstack((vectors, aux_dims.reshape(-1, 1)))

        self.train(hnsw_vectors)
        self._update_id_mapping(db_ids)
        self.index.add(hnsw_vectors)
        logger.info("data indexed %d", len(self.index_id_to_db_id))

    indexed_cnt = len(self.index_id_to_db_id)
    logger.info("Total data indexed %d", indexed_cnt)