sample_app/cerebral_genai/code/rag-on-edge-vectorDB/modules/VDBModule/main.py
# Imports required by this handler. NormalizeText, LangChanSplitter, and
# chromaHelper are defined elsewhere in this module, as is the Flask route
# registration (decorator omitted in this excerpt).
import base64
import logging
import uuid
from io import BytesIO

import numpy as np
import pandas as pd
from flask import jsonify, request


def upload_file():
    data = request.json
    index_name = data.get('index_name')
    base64_data = data.get('file_data')
    if not index_name or not base64_data:
        return jsonify({'status': 'error', 'message': 'index_name or file_data not provided'}), 400
    try:
        # Decode the Base64-encoded payload back into raw PDF bytes.
        bytes_data = base64.b64decode(base64_data.encode('utf-8'))
        pdf_file = BytesIO(bytes_data)

        # Extract the PDF text, then split it into token chunks
        # (chunk size 100, overlap 10).
        normalizer = NormalizeText()
        long_text = normalizer.get_doc_content_txt(pdf_file)
        splitter = LangChanSplitter()
        string_list = splitter.TokenTextSplitter(100, 10, long_text)

        # One row per chunk, each assigned a generated UUID as its document id.
        df = pd.DataFrame({'document': string_list})
        df = df.dropna()
        df['id'] = df.apply(lambda _: str(uuid.uuid4()), axis=1)

        # Upload in batches of at most 50 records each.
        df_array = np.array_split(df, len(df) // 50 + 1)
        data_array_count = len(df_array)
        new_df_array = []
        current_job_number = 1
        for sub_df in df_array:
            logging.info(f"working on: {current_job_number}/{data_array_count}")
            documents = sub_df['document'].to_list()
            ids = sub_df['id'].to_list()
            chromaHelper.upload_documents(index_name, ids, documents)
            new_df_array.append(sub_df)
            current_job_number += 1

        new_df = pd.concat(new_df_array, axis=0, ignore_index=True)
        logging.info(f"{len(new_df)} records uploaded.")
        return jsonify({'status': 'success', 'message': f'{len(new_df)} records uploaded successfully'})
    except Exception as e:
        return jsonify({'status': 'error', 'message': f'Error uploading file: {str(e)}'}), 500
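

# Example client call for this endpoint (a minimal sketch; the host, port,
# and route path below are assumptions, not taken from this module):
#
#     import base64
#     import requests
#
#     with open('sample.pdf', 'rb') as f:
#         file_data = base64.b64encode(f.read()).decode('utf-8')
#
#     resp = requests.post(
#         'http://localhost:5000/upload_file',  # hypothetical host/port/route
#         json={'index_name': 'my-index', 'file_data': file_data},
#     )
#     print(resp.json())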