def prep_context()

in use-cases/model-fine-tuning-pipeline/data-preparation/gemma-it/src/dataprep.py [0:0]


def prep_context():
    """Load the product catalogue and build the fine-tuning context frame.

    Reads the input CSV from
    ``gs://{BUCKET}/{DATASET_INPUT}/{DATASET_INPUT_FILE}``, keeps rows whose
    ``c0_name`` is "Clothing" and whose ``c1_name`` is one of the Women's,
    Men's or Kids' clothing categories, runs the result through
    ``filter_low_value_count_rows`` for ``c2_name`` and then ``c3_name``
    (``min_count=10``), and assembles one context string per remaining row.

    Returns:
        A DataFrame with the columns ``context``, ``question``, ``answer``
        and ``c1_name``. Only ``context`` and ``c1_name`` are populated here;
        ``question`` and ``answer`` are created but left unfilled.
    """
    source_uri = f"gs://{BUCKET}/{DATASET_INPUT}/{DATASET_INPUT_FILE}"
    catalogue = pd.read_csv(source_uri)

    # Map the raw CSV headers onto the column names used below.
    column_aliases = {
        "uniq_id": "Id",
        "product_name": "Name",
        "description": "Description",
        "brand": "Brand",
        "attributes": "Specifications",
    }
    catalogue = catalogue.rename(columns=column_aliases)

    wanted_columns = [
        "Name",
        "Description",
        "Specifications",
        "Brand",
        "c0_name",
        "c1_name",
        "c2_name",
        "c3_name",
    ]
    products = catalogue[wanted_columns]

    # Keep clothing products that belong to the Women, Men or Kids categories.
    audience_categories = ["Women's Clothing", "Men's Clothing", "Kids' Clothing"]
    is_clothing = products["c0_name"] == "Clothing"
    is_target_audience = products["c1_name"].isin(audience_categories)
    products = products[is_clothing & is_target_audience]

    # Apply the low-count filter to 'c2_name' first, then to 'c3_name' on the
    # already-reduced frame (presumably dropping values seen fewer than
    # min_count times -- confirm against filter_low_value_count_rows).
    for category_level in ("c2_name", "c3_name"):
        products = filter_low_value_count_rows(
            products, category_level, min_count=10
        )

    # Data Format expected for finetuning: {"context": " ", "question": " ", "answer": " "}
    fields = products[["Name", "Description", "c1_name", "Specifications"]]
    finetune_ds = pd.DataFrame(columns=["context", "question", "answer"])

    # Build the context text piece by piece, left to right.
    context_text = "Product Name: " + fields["Name"]
    context_text = context_text + "<br> Product Category: " + fields["c1_name"]
    context_text = context_text + "<br> Attributes: " + fields["Specifications"]
    context_text = context_text + " <br> Description: " + fields["Description"]

    finetune_ds["context"] = context_text
    finetune_ds["c1_name"] = fields["c1_name"]
    return finetune_ds