def write_tuning_dataset()

in webhook/main.py [0:0]


def write_tuning_dataset(db: firestore.Client, output_bucket: str) -> int:
    """Export the question/answer entries as a JSONL tuning dataset.

    Every entry in the Firestore ``dataset`` collection becomes one JSON line
    in ``dataset.jsonl`` inside ``output_bucket``. Each line pairs the text of
    the page the entry refers to (looked up in the ``documents`` collection by
    ``filename`` and ``page_number``) with the entry's question as the user
    turn, and the entry's answer as the model turn.

    The row layout follows the Gemini supervised tuning dataset format:
        https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini-supervised-tuning-about

    Args:
        db: Firestore client used to read the ``documents`` and ``dataset``
            collections.
        output_bucket: Name of the Cloud Storage bucket that receives
            ``dataset.jsonl``.

    Returns:
        The number of rows written to the tuning dataset.

    Raises:
        KeyError: If a document lacks ``filename``/``pages``, or an entry
            lacks ``filename``/``page_number``/``question``/``answer``, or an
            entry names a file that is not in the ``documents`` collection.
    """
    gcs = storage.Client()

    # Load every document up front so each dataset entry can find its page
    # text with a plain lookup instead of another Firestore read.
    records = [snap.to_dict() or {} for snap in db.collection("documents").stream()]
    pages_by_filename = {rec["filename"]: rec["pages"] for rec in records}

    # The system instruction is the same for every row.
    system_instruction = {
        "parts": [{"text": "Answer the question based on the following text"}],
    }

    bucket = gcs.get_bucket(output_bucket)
    blob = bucket.blob("dataset.jsonl")

    dataset_size = 0
    with blob.open("w") as out:
        # Counting from 1 leaves `dataset_size` equal to the rows written.
        for dataset_size, snap in enumerate(db.collection("dataset").stream(), start=1):
            item = snap.to_dict() or {}
            # `page_number` selects the page within the document's pages.
            pages = pages_by_filename[item["filename"]]
            context = pages[item["page_number"]]

            user_turn = {
                "role": "user",
                "parts": [
                    {"text": f"Text: {context}"},
                    {"text": item["question"]},
                ],
            }
            model_turn = {
                "role": "model",
                "parts": [{"text": item["answer"]}],
            }
            row = {
                "systemInstruction": system_instruction,
                "contents": [user_turn, model_turn],
            }
            # One JSON object per line (JSONL).
            out.write(json.dumps(row) + "\n")
    return dataset_size