def filter_low_value_count_rows()

in use-cases/model-fine-tuning-pipeline/data-preparation/gemma-it/src/dataprep.py [0:0]


def filter_low_value_count_rows(df, column_name, min_count=10):
    """
    Keep only the rows whose value in ``column_name`` is frequent enough.

    A value is considered frequent when it occurs at least ``min_count``
    times in the column. Every row carrying an infrequent value is dropped;
    the remaining rows keep their original order, index labels and columns.

    Note:
        Occurrences are tallied with ``Series.value_counts()``, which skips
        missing values by default, so rows with a missing value in
        ``column_name`` never qualify and are always removed.

    Args:
        df: The Pandas DataFrame to filter. It is not modified.
        column_name: The name of the column whose values are counted.
        min_count: Smallest number of occurrences a value needs for its rows
            to be retained (default: 10).

    Returns:
        A DataFrame containing only the rows whose ``column_name`` value
        occurs at least ``min_count`` times.
    """
    column = df[column_name]

    # Tally how often each distinct value occurs in the column.
    occurrences = column.value_counts()

    # The distinct values that reach the threshold.
    frequent_values = occurrences.loc[occurrences.ge(min_count)].index

    # Row-wise mask: True where the row's value is one of the frequent ones.
    keep_mask = column.isin(frequent_values)

    return df[keep_mask]