in use-cases/model-fine-tuning-pipeline/data-preparation/gemma-it/src/dataprep.py [0:0]
def prep_context():
    """Load the product catalogue and build the per-product context text.

    Reads the CSV at ``gs://{BUCKET}/{DATASET_INPUT}/{DATASET_INPUT_FILE}``,
    renames the raw columns, narrows the rows to clothing products in the
    Women's / Men's / Kids' categories, applies
    ``filter_low_value_count_rows`` on ``c2_name`` and then ``c3_name``
    (``min_count=10``), and finally assembles one context string per
    remaining row.

    Returns:
        A ``pandas.DataFrame`` with the columns ``context``, ``question``,
        ``answer`` and ``c1_name``. Only ``context`` and ``c1_name`` are
        filled in here; ``question`` and ``answer`` are created but not
        populated by this function.
    """
    source_uri = f"gs://{BUCKET}/{DATASET_INPUT}/{DATASET_INPUT_FILE}"
    raw = pd.read_csv(source_uri)

    # Map the raw column names onto the names used by the rest of the pipeline.
    column_aliases = {
        "uniq_id": "Id",
        "product_name": "Name",
        "description": "Description",
        "brand": "Brand",
        "attributes": "Specifications",
    }
    raw = raw.rename(columns=column_aliases)

    wanted_columns = [
        "Name",
        "Description",
        "Specifications",
        "Brand",
        "c0_name",
        "c1_name",
        "c2_name",
        "c3_name",
    ]
    products = raw[wanted_columns]

    # Keep clothing products only.
    is_clothing = products["c0_name"] == "Clothing"
    clothing = products[is_clothing]

    # Of those, keep only the Women, Men & Kids clothing categories.
    allowed_categories = ["Women's Clothing", "Men's Clothing", "Kids' Clothing"]
    in_allowed_category = clothing["c1_name"].isin(allowed_categories)
    pruned = clothing[in_allowed_category]

    # Apply the low-count filter per category level, c2 first and then c3
    # (intended to keep rows whose value occurs at least 10 times; the helper
    # itself is defined elsewhere in this file).
    for category_level in ("c2_name", "c3_name"):
        pruned = filter_low_value_count_rows(pruned, category_level, min_count=10)

    # Data Format expected for finetuning: {"context": " ", "question": " ", "answer": " "}
    fields = pruned[["Name", "Description", "c1_name", "Specifications"]]
    result = pd.DataFrame(columns=["context", "question", "answer"])

    # Build the context text piece by piece, in the same left-to-right order
    # as a single chained concatenation would evaluate.
    context_text = "Product Name: " + fields["Name"]
    context_text = context_text + "<br> Product Category: " + fields["c1_name"]
    context_text = context_text + "<br> Attributes: " + fields["Specifications"]
    context_text = context_text + " <br> Description: " + fields["Description"]

    result["context"] = context_text
    result["c1_name"] = fields["c1_name"]
    return result