in use-cases/model-fine-tuning-pipeline/data-preparation/gemma-it/src/dataprep.py [0:0]
def filter_low_value_count_rows(df, column_name, min_count=10):
    """
    Keep only the rows whose value in ``column_name`` is frequent enough.

    A value is "frequent enough" when it occurs at least ``min_count`` times
    in the column. Rows holding a missing value (NaN/None) are never kept,
    because ``value_counts()`` does not count missing values by default.

    Args:
        df: The Pandas DataFrame to filter. It is not modified.
        column_name: The name of the column whose value frequencies are checked.
        min_count: The minimum number of occurrences a value needs for its
            rows to be kept (default: 10).

    Returns:
        The rows of ``df`` whose ``column_name`` value occurs at least
        ``min_count`` times, in their original order and with their original
        index labels.
    """
    column = df[column_name]

    # Occurrences of each distinct value in the column.
    occurrences = column.value_counts()

    # Distinct values that reach the required frequency.
    frequent_values = occurrences.loc[lambda counts: counts >= min_count].index

    # Row-wise mask: True where the row's value is one of the frequent ones.
    keep_mask = column.isin(frequent_values)
    return df[keep_mask]