in components/utils/converter.py [0:0]
def write_jsonl(in_bucket_name, in_path, out_bucket_name=None, out_path=None) -> None:
    """Build a JSON Lines manifest of the blobs under a prefix and upload it.

    Every blob listed under ``in_path`` in the input bucket for which
    ``get_mime_type`` returns a truthy value becomes one JSON line holding
    an ``id``, its ``structData`` and a ``content`` entry (MIME type plus
    ``gs://`` URI). Blobs without a recognised MIME type are skipped.

    Args:
        in_bucket_name: Name of the bucket whose blobs are listed.
        in_path: Prefix passed to ``list_blobs`` to select the input blobs.
        out_bucket_name: Name of the bucket receiving the manifest. When
            omitted (or empty) the input bucket is used.
        out_path: Folder-like prefix under which ``out.jsonl`` is stored.
            When omitted (or empty) the file is stored at the bucket root
            instead of under a literal ``None/`` prefix.
    """
    # Function-scope imports keep this block self-contained.
    import os
    import tempfile

    in_bucket = storage_client.bucket(in_bucket_name)
    if out_bucket_name:
        out_bucket = storage_client.bucket(out_bucket_name)
        target_bucket_name = out_bucket_name
    else:
        # No explicit destination: write next to the inputs, and remember the
        # real bucket name so the final message does not report "gs://None/".
        out_bucket = in_bucket
        target_bucket_name = in_bucket_name

    jsonl_file = "out.jsonl"
    # Only prepend the prefix when one was actually supplied.
    output_file_path = f"{out_path}/{jsonl_file}" if out_path else jsonl_file

    # Stage the manifest in a private temporary directory so that concurrent
    # calls do not overwrite each other's file and nothing is left behind in
    # the current working directory.
    with tempfile.TemporaryDirectory() as tmp_dir:
        local_path = os.path.join(tmp_dir, jsonl_file)

        with jsonlines.open(local_path, mode="w") as writer:
            for source_blob in in_bucket.list_blobs(prefix=in_path):
                file_name = source_blob.name
                mime_type = get_mime_type(file_name)
                if not mime_type:
                    # No MIME type for this file name: leave it out.
                    continue
                writer.write(
                    {
                        "id": str(generate_document_id(file_name)),
                        "structData": struct_data(source_blob),
                        "content": {
                            "mimeType": mime_type,
                            "uri": f"gs://{in_bucket_name}/{file_name}",
                        },
                    }
                )

        # The writer is closed (and therefore flushed) before uploading.
        manifest_blob = out_bucket.blob(output_file_path)
        manifest_blob.content_type = "application/json"
        manifest_blob.upload_from_filename(local_path)

    print(
        f"JSON files merged and written to gs://{target_bucket_name}/{output_file_path}"
    )