in evaluation_pipeline/retrieval.py [0:0]
def create_embeddings(row_limit, history, embeddings_model_dict):
    """Embed ``history['combined_text']`` with each configured model and pickle the results.

    Args:
        row_limit: Value interpolated into the two output file names
            (``data/embeddings_dict_{row_limit}.pkl`` and
            ``data/embeddings_sizes_{row_limit}.pkl``).
        history: Object whose ``'combined_text'`` entry supports
            ``.values.tolist()`` (presumably a pandas DataFrame -- confirm
            against the caller).
        embeddings_model_dict: Mapping keyed by model name. The whole mapping
            is also handed to ``FeatureExtractor`` together with the name of
            the model being processed.

    Returns:
        A tuple ``(embeddings_dict, embeddings_sizes)`` where
        ``embeddings_dict`` maps each model name to whatever
        ``FeatureExtractor.get_embeddings`` returned for it, and
        ``embeddings_sizes`` maps each model name to ``shape[1]`` of that
        result.

    Note:
        Both dictionaries are written with ``pickle`` to relative paths under
        ``data/``; ``open`` will fail if that directory does not exist.
    """
    base_texts = history['combined_text'].values.tolist()
    embeddings_dict = {}
    embeddings_sizes = {}

    for model in embeddings_model_dict:
        # Build the input per model instead of rebinding the shared list:
        # previously the prefixed texts leaked into every model processed
        # after this one, so results depended on dict ordering.
        if model == 'nomic-ai/nomic-embed-text-v1.5':
            prefix = 'search_document: '
            model_texts = [prefix + text for text in base_texts]
        else:
            model_texts = base_texts

        fe = FeatureExtractor(embeddings_model_dict, model_name=model)
        embeddings = fe.get_embeddings(model_texts)
        embeddings_dict[model] = embeddings
        print(model, embeddings.shape)
        # The second axis of the result is recorded as the embedding size.
        embeddings_sizes[model] = embeddings.shape[1]

    # Persist both mappings so later runs with the same row_limit can reuse them.
    with open(f"data/embeddings_dict_{row_limit}.pkl", "wb") as f:
        pickle.dump(embeddings_dict, f)
    with open(f"data/embeddings_sizes_{row_limit}.pkl", "wb") as f:
        pickle.dump(embeddings_sizes, f)

    return embeddings_dict, embeddings_sizes