def data_prep()

in use-cases/model-fine-tuning-pipeline/data-preparation/gemma-it/src/dataprep.py [0:0]


def data_prep(finetune_ds):
    """Generate question/answer content for every usable entry of a dataset.

    Walks the ``"context"`` and ``"c1_name"`` columns of *finetune_ds* in
    lockstep, calls ``generate_qa(context, category)`` for each entry whose
    context is present, and stacks every non-``None`` result into a single
    DataFrame.

    Args:
        finetune_ds: Dataset-like object that supports ``len()`` and column
            access via ``finetune_ds["context"]`` and
            ``finetune_ds["c1_name"]``.

    Returns:
        pandas.DataFrame: All frames returned by ``generate_qa`` concatenated
        with a fresh 0..n-1 index. An empty DataFrame is returned when no
        entry produced content.
    """
    total_entries = len(finetune_ds)
    logger.info(f"Generating content for {total_entries} entries")

    # Collect the per-entry frames and concatenate once at the end; calling
    # pd.concat inside the loop re-copies the accumulated rows every time.
    generated_frames = []
    entries = zip(finetune_ds["context"], finetune_ds["c1_name"])
    for current_entry, (context, category) in enumerate(entries, start=1):
        logger.info(
            f"Processing entry {current_entry} of {total_entries}",
            extra={"context": context},
        )

        # NaN never compares equal to anything (including itself), so a
        # `context != np.nan` test is always True and filters nothing.
        # pd.isna() is the reliable check for a missing scalar value.
        if pd.isna(context):
            continue

        temp_df = generate_qa(context, category)
        if temp_df is None:
            # generate_qa signalled that it produced nothing for this entry.
            continue

        generated_frames.append(temp_df)
        logger.info(f"Content generated for entry {current_entry}")

    if not generated_frames:
        # pd.concat raises on an empty list; keep the "empty frame" contract.
        return pd.DataFrame()

    # `result` contains all generated questions and answers.
    result = pd.concat(generated_frames, ignore_index=True)
    return result