in data_measurements/embeddings/embeddings.py [0:0]
def make_embeddings(self, batch_size=32):
    """
    Batch computes the embeddings of the Dataset self.text_dset,
    using the field self.text_field_name as input.

    The mapped dataset is stored on self.embeddings_dset and returned.

    Args:
        batch_size (int): number of rows passed to
            self.compute_sentence_embeddings per call. Defaults to 32,
            the value that was previously hard-coded.

    Returns:
        Dataset: HF dataset object produced by mapping over
            self.text_dset, with the self.text_field_name column removed
            and an EMBEDDING_FIELD column holding the embeddings
            (list of floats).
            NOTE(review): only the text column is removed here, so any
            other columns of self.text_dset presumably carry over to the
            result -- confirm against callers.
    """

    def batch_embed_sentences(sentences):
        # `sentences` is one batch of rows, indexed by column name.
        embeddings = self.compute_sentence_embeddings(
            sentences[self.text_field_name]
        )
        # Each embedding is converted with .tolist() (presumably from an
        # array/tensor-like object) so the dataset stores plain lists.
        return {EMBEDDING_FIELD: [embed.tolist() for embed in embeddings]}

    self.embeddings_dset = self.text_dset.map(
        batch_embed_sentences,
        batched=True,
        batch_size=batch_size,
        remove_columns=[self.text_field_name],
    )
    return self.embeddings_dset