in text-semantic-search/index_builder/builder/index.py
import logging
import os
import pickle

import numpy as np
import tensorflow as tf
from annoy import AnnoyIndex

# VECTOR_LENGTH and METRIC are module-level constants referenced but not
# defined in this snippet; the values below are assumptions for illustration
# (a 512-dimensional embedding indexed with Annoy's angular distance).
VECTOR_LENGTH = 512
METRIC = 'angular'


def build_index(embedding_files_pattern, index_filename, num_trees=100):
    annoy_index = AnnoyIndex(VECTOR_LENGTH, metric=METRIC)
    # Maps each sequential Annoy item ID to the item's original string ID.
    mapping = {}

    # tf.gfile.Glob is the TF 1.x file API; the run is capped at 250 files.
    embed_files = tf.gfile.Glob(embedding_files_pattern)[:250]
    logging.info('{} embedding files are found.'.format(len(embed_files)))
    item_counter = 0
    for f, embed_file in enumerate(embed_files):
        # f is zero-based, so report f + 1 as the human-readable file number.
        logging.info('Loading embeddings in file {} of {}...'.format(
            f + 1, len(embed_files)))
        record_iterator = tf.python_io.tf_record_iterator(path=embed_file)
        for string_record in record_iterator:
            # Each record is a serialized tf.Example with an 'id' feature
            # (the item's string identifier) and an 'embedding' feature.
            example = tf.train.Example()
            example.ParseFromString(string_record)
            string_identifier = example.features.feature['id'].bytes_list.value[0]
            mapping[item_counter] = string_identifier
            embedding = np.array(
                example.features.feature['embedding'].float_list.value)
            annoy_index.add_item(item_counter, embedding)
            item_counter += 1
        logging.info('Loaded {} items to the index'.format(item_counter))
    logging.info('Start building the index with {} trees...'.format(num_trees))
    annoy_index.build(n_trees=num_trees)
    logging.info('Index is successfully built.')

    logging.info('Saving index to disk...')
    annoy_index.save(index_filename)
    logging.info('Index is saved to disk.')
    logging.info('Index file size: {} GB'.format(
        round(os.path.getsize(index_filename) / float(1024 ** 3), 2)))
    annoy_index.unload()
    logging.info('Saving mapping to disk...')
    with open(index_filename + '.mapping', 'wb') as handle:
        pickle.dump(mapping, handle, protocol=pickle.HIGHEST_PROTOCOL)
    logging.info('Mapping is saved to disk.')
    logging.info('Mapping file size: {} MB'.format(
        round(os.path.getsize(index_filename + '.mapping') / float(1024 ** 2), 2)))