in retrieval_service/run_generate_policy_dataset.py [0:0]
def vectorize(chunked):
    embed_service = VertexAIEmbeddings(model_name=EMBEDDING_MODEL_NAME)

    def retry_with_backoff(func, *args, retry_delay=5, backoff_factor=2, **kwargs):
        """Call func, retrying with exponential backoff; re-raise after the last attempt."""
        max_attempts = 3
        for attempt in range(1, max_attempts + 1):
            try:
                return func(*args, **kwargs)
            except Exception as e:
                print(f"Error: {e}")
                if attempt == max_attempts:
                    raise
                wait = retry_delay * (backoff_factor**attempt)
                print(f"Retry after waiting for {wait} seconds...")
                time.sleep(wait)

    # Embed the chunks in small batches to stay within API request limits.
    batch_size = 5
    for i in range(0, len(chunked), batch_size):
        request = [x["content"] for x in chunked[i : i + batch_size]]
        response = retry_with_backoff(embed_service.embed_documents, request)
        # Store the retrieved vector embeddings for each chunk back.
        for x, e in zip(chunked[i : i + batch_size], response):
            x["embedding"] = e
    data_embeddings = pd.DataFrame(chunked)
    # Preview the first rows of the embedded dataset.
    print(data_embeddings.head())
    return data_embeddings
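

# --- Usage sketch (illustration only, not part of the original file) ---
# Assumptions: the script imports `time` and `pandas as pd`, imports
# `VertexAIEmbeddings` (e.g. from langchain_google_vertexai), and defines
# EMBEDDING_MODEL_NAME at module level; the chunked input is a list of
# dicts with a "content" key, matching how vectorize() reads it above.
# The sample texts and output filename below are hypothetical.
if __name__ == "__main__":
    sample_chunks = [
        {"content": "Checked bags may weigh up to 50 pounds."},
        {"content": "Tickets can be changed up to 24 hours before departure."},
    ]
    embeddings_df = vectorize(sample_chunks)
    # Each row now carries the original chunk text plus its embedding vector.
    embeddings_df.to_csv("policy_dataset_with_embeddings.csv", index=False)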