in use-cases/model-fine-tuning-pipeline/data-preparation/gemma-it/src/dataprep.py [0:0]
def data_prep(finetune_ds):
    """Generate question/answer content for every usable entry of a dataset.

    Walks the ``"context"`` and ``"c1_name"`` columns of ``finetune_ds`` in
    lockstep, calls ``generate_qa(context, category)`` for each entry whose
    context is present, and stacks every non-``None`` result into a single
    DataFrame.

    Args:
        finetune_ds: A sized, column-indexable dataset exposing ``"context"``
            and ``"c1_name"`` columns (presumably a pandas DataFrame or a
            similar dataset object; confirm against the caller).

    Returns:
        A DataFrame with the rows of all generated frames concatenated under
        a fresh 0..n-1 index. An empty DataFrame is returned when nothing
        was generated.
    """
    total_entries = len(finetune_ds)
    logger.info(f"Generating content for {total_entries} entries")

    # Collect the per-entry frames and concatenate once at the end; calling
    # pd.concat inside the loop re-copies the accumulated rows every time.
    generated_frames = []
    entries = zip(finetune_ds["context"], finetune_ds["c1_name"])
    for current_entry, (context, category) in enumerate(entries, start=1):
        logger.info(
            f"Processing entry {current_entry} of {total_entries}",
            extra={"context": context},
        )

        # NaN never compares equal to anything (itself included), so a
        # `context != np.nan` test is always True and filters nothing.
        # pd.isna detects NaN, and also None / pd.NA, as a missing context.
        # NOTE(review): assumes each context is a scalar (e.g. a string).
        if pd.isna(context):
            continue

        temp_df = generate_qa(context, category)
        if temp_df is None:
            continue

        generated_frames.append(temp_df)
        logger.info(f"Content generated for entry {current_entry}")

    # pd.concat raises ValueError on an empty list, so fall back explicitly.
    if not generated_frames:
        return pd.DataFrame()

    # The result now contains all generated questions and answers.
    return pd.concat(generated_frames, ignore_index=True)