def create_embeddings()

in evaluation_pipeline/retrieval.py [0:0]


def create_embeddings(row_limit, history, embeddings_model_dict):
    """Embed ``history['combined_text']`` with every configured model and cache the results.

    For each key of ``embeddings_model_dict`` a ``FeatureExtractor`` is built
    and asked for embeddings of the texts. The embeddings and their second
    dimension (``shape[1]``) are collected per model, pickled under ``data/``
    and returned.

    Args:
        row_limit: Only used to tag the output file names
            (``data/embeddings_dict_{row_limit}.pkl`` and
            ``data/embeddings_sizes_{row_limit}.pkl``).
        history: Object indexable by ``'combined_text'`` whose result exposes
            ``.values.tolist()`` (presumably a pandas DataFrame with string
            entries -- TODO confirm against the caller).
        embeddings_model_dict: Mapping whose keys are model names; the whole
            mapping is also handed to ``FeatureExtractor``.

    Returns:
        A tuple ``(embeddings_dict, embeddings_sizes)``, both keyed by model
        name: the embeddings returned by ``FeatureExtractor.get_embeddings``
        and their ``shape[1]`` respectively.
    """
    import os

    nomic_model = 'nomic-ai/nomic-embed-text-v1.5'
    nomic_prefix = 'search_document: '

    texts = history['combined_text'].values.tolist()
    embeddings_dict = {}
    embeddings_sizes = {}

    for model in embeddings_model_dict:
        # Build the model-specific input without rebinding `texts`: the
        # prefix is meant for the nomic model only and must not leak into
        # the inputs of models processed later in the loop.
        if model == nomic_model:
            model_texts = [nomic_prefix + text for text in texts]
        else:
            model_texts = texts

        fe = FeatureExtractor(embeddings_model_dict, model_name=model)
        embeddings = fe.get_embeddings(model_texts)
        print(model, embeddings.shape)

        embeddings_dict[model] = embeddings
        embeddings_sizes[model] = embeddings.shape[1]

    # Make sure the output directory exists so the (potentially expensive)
    # embeddings are not lost to a FileNotFoundError at write time.
    os.makedirs("data", exist_ok=True)

    with open(f"data/embeddings_dict_{row_limit}.pkl", "wb") as f:
        pickle.dump(embeddings_dict, f)

    with open(f"data/embeddings_sizes_{row_limit}.pkl", "wb") as f:
        pickle.dump(embeddings_sizes, f)

    return embeddings_dict, embeddings_sizes