in webhook/main.py [0:0]
def write_tuning_dataset(db: firestore.Client, output_bucket: str) -> int:
    """Export the question/answer entries as a JSONL tuning dataset.

    Every document in the "dataset" collection becomes one JSON line in the
    "dataset.jsonl" object of the output bucket. The text used as context
    for each entry is looked up in the "documents" collection by the
    entry's filename and page number.

    For more information on the tuning dataset file format:
        https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini-supervised-tuning-about

    Args:
        db: Firestore client.
        output_bucket: Name of the output bucket.

    Returns:
        The number of entries written to the tuning dataset.

    Raises:
        KeyError: If a document or dataset entry lacks one of the fields
            read here, or an entry names a filename that is not present
            in the "documents" collection.
    """
    gcs = storage.Client()

    # Read every source document up front so pages can be looked up by
    # filename while the dataset entries are being streamed.
    source_records = []
    for snapshot in db.collection("documents").stream():
        source_records.append(snapshot.to_dict() or {})

    pages_by_filename = {}
    for record in source_records:
        filename = record["filename"]
        pages_by_filename[filename] = record["pages"]

    # The system instruction is the same for every row.
    system_instruction = {
        "parts": [{"text": "Answer the question based on the following text"}],
    }

    target_blob = gcs.get_bucket(output_bucket).blob("dataset.jsonl")
    rows_written = 0
    with target_blob.open("w") as out:
        entries = db.collection("dataset").stream()
        for rows_written, snapshot in enumerate(entries, start=1):
            qa = snapshot.to_dict() or {}
            pages = pages_by_filename[qa["filename"]]
            context = pages[qa["page_number"]]

            user_turn = {
                "role": "user",
                "parts": [
                    {"text": f"Text: {context}"},
                    {"text": qa["question"]},
                ],
            }
            model_turn = {
                "role": "model",
                "parts": [{"text": qa["answer"]}],
            }
            line = json.dumps(
                {
                    "systemInstruction": system_instruction,
                    "contents": [user_turn, model_turn],
                }
            )
            # One JSON object per line (JSONL).
            out.write(line + "\n")

    return rows_written