jobs/kpi-forecasting/kpi_forecasting/pandas_extras.py (39 lines of code) (raw):

import numpy as np import pandas as pd def percentile(p: int = 50, name_format: str = "p{:02.0f}"): """A method to calculate percentiles along dataframe axes via the `pandas.agg` method.""" def f(x): return x.quantile(p / 100) f.__name__ = name_format.format(p) return f def aggregate_to_period( df: pd.DataFrame, period: str, aggregation: callable = np.sum, date_col: str = "submission_date", ) -> pd.DataFrame: """Floor dates to the correct period and aggregate.""" if period.lower() not in ["day", "month", "year"]: raise ValueError( f"Don't know how to floor dates by {period}. Please use 'day', 'month', or 'year'." ) x = df.copy(deep=True) x[date_col] = pd.to_datetime(x[date_col]).dt.to_period(period[0]).dt.to_timestamp() # treat numeric and string types separately x_string = x.select_dtypes(include=["datetime64", object]) x_numeric = x.select_dtypes(include=["float", "int", "datetime64"]) if set(x_string.columns) | set(x_numeric.columns) != set(x.columns): missing_columns = set(x.columns) - ( set(x_string.columns) | set(x_numeric.columns) ) missing_columns_str = ",".join(missing_columns) raise ValueError( f"Columns do not have string or numeric type: {missing_columns_str}" ) x_numeric_agg = x_numeric.groupby(date_col).agg(aggregation).reset_index() # all values of x_string should be the same because it is just the dimensions x_string_agg = x_string.drop_duplicates().reset_index(drop=True) if len(x_string_agg) != len(x_numeric_agg): raise ValueError( "String and Numeric dataframes have different length, likely due to strings not being unique up to aggregation" ) # unique preseves order so we should be fine to concat output_df = pd.concat( [x_numeric_agg, x_string_agg.drop(columns=[date_col])], axis=1 ) return output_df