# gemini/sample-apps/accelerating_product_innovation/app/pages_utils/resources_store_embeddings.py
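# NOTE: Imports below are assumed from usage in this excerpt. The full module
# also defines `bucket` (a GCS bucket client), `insights`,
# `embedding_model_with_backoff`, `load_file_content`, `chunk_and_store_data`,
# and `csv_processing` elsewhere.
import asyncio

import numpy as np
import pandas as pd
import streamlit as st
from streamlit.runtime.uploaded_file_manager import UploadedFile
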
def create_and_store_embeddings(uploaded_file: UploadedFile) -> None:
"""Converts the file to data packets.
This function converts the file to data packets.
It checks the file type and processes the file accordingly.
It then uploads the resulting DataFrame to the GCS bucket.
Args:
uploaded_file: The file to convert to data packets.
"""
with st.spinner("Uploading files..."):
uploaded_file_blob = bucket.blob(
f"{st.session_state.product_category}/{uploaded_file.name}"
)
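        # Load the embeddings already stored for this product category;
        # new embeddings are merged with them below.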
embeddings_df = insights.get_stored_embeddings_as_df()
final_data = []
        # Processing for csv files.
        if uploaded_file.type == "text/csv":
            # Read the csv contents and mirror the raw file to the GCS bucket.
            df = pd.read_csv(uploaded_file)
            uploaded_file_blob.upload_from_string(df.to_csv(), "text/csv")
            # Nothing to embed if the file has no data rows.
            if df.empty:
                return
            # List of the csv file's column names.
            header = df.columns.tolist()
# Create embeddings and store contents of the csv file
# to the GCS bucket.
with st.spinner("Processing csv...this might take some time..."):
asyncio.run(
csv_processing(df, header, embeddings_df, uploaded_file.name)
)
return
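        # Non-csv files: load the raw file content, then chunk it into
        # data packets.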
file_content = load_file_content(uploaded_file, uploaded_file_blob)
        # Chunk the processed file content into data packets.
final_data = chunk_and_store_data(
uploaded_file=uploaded_file,
file_content=file_content,
)
        # Nothing to store if no data packets were produced.
        if not final_data:
            return
# Stores the embeddings in the GCS bucket.
with st.spinner("Storing Embeddings"):
# Create a dataframe from final chunked data.
pdf_data = pd.DataFrame.from_dict(final_data)
pdf_data.reset_index(inplace=True, drop=True)
# Add datatype column to df.
pdf_data["types"] = [type(x) for x in pdf_data["content"]]
# Add embedding column to df for text embeddings.
pdf_data["embedding"] = pdf_data["content"].apply(
lambda x: embedding_model_with_backoff([x])
)
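            # Convert each embedding list to a numpy array.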
pdf_data["embedding"] = pdf_data.embedding.apply(np.array)
            # Concatenate the embeddings of the newly uploaded files with the
            # existing stored embeddings.
pdf_data = pd.concat([embeddings_df, pdf_data])
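            # Drop repeated chunks so re-uploaded files do not create
            # duplicate embeddings.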
pdf_data = pdf_data.drop_duplicates(subset=["content"], keep="first")
pdf_data.reset_index(inplace=True, drop=True)
            # Upload the combined embeddings back to the GCS bucket.
bucket.blob(
f"{st.session_state.product_category}/embeddings.json"
).upload_from_string(pdf_data.to_json(), "application/json")
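

# --- Hedged sketches of the helpers assumed above. These are illustrative
# --- only, not the package's actual implementations; the model name, chunk
# --- size, and retry policy are assumptions.

# `embedding_model_with_backoff` is called above with a one-element list and
# must return that text's embedding vector. A minimal retry-with-backoff
# sketch against the Vertex AI text-embedding API (assumes `vertexai.init()`
# has been called elsewhere):
import time

from vertexai.language_models import TextEmbeddingModel


def embedding_model_with_backoff_sketch(
    text: list[str], max_retries: int = 5
) -> list[float]:
    """Embeds `text`, retrying with exponential backoff on transient errors."""
    model = TextEmbeddingModel.from_pretrained("textembedding-gecko@003")
    delay = 1.0
    for attempt in range(max_retries):
        try:
            # One embedding is returned per input string; use the first.
            return model.get_embeddings(text)[0].values
        except Exception:  # Sketch only: real code should catch API errors.
            if attempt == max_retries - 1:
                raise
            time.sleep(delay)
            delay *= 2  # Double the wait before the next attempt.
    return []


# `chunk_and_store_data` must return "data packets": dicts with at least a
# "content" text field (see the embedding step above). A sketch under that
# assumption, with an illustrative fixed chunk size:
def chunk_and_store_data_sketch(
    uploaded_file: UploadedFile, file_content: str, chunk_size: int = 1000
) -> list[dict]:
    """Splits file content into fixed-size chunks, one data packet per chunk."""
    return [
        {
            "file_name": uploaded_file.name,
            "chunk_number": i // chunk_size,
            "content": file_content[i : i + chunk_size],
        }
        for i in range(0, len(file_content), chunk_size)
    ]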