def __calculate_summary_stats()

in frauddetector/profiler.py [0:0]


    def __calculate_summary_stats(self, data, event_column="EVENT_LABEL"):
        """Build a per-column summary table for a pandas DataFrame.

        The input frame is deep-copied first, so the caller's data is never
        modified. The event column is cast to ``str`` on that copy before any
        statistic is computed.

        Args:
            data (pandas.core.frame.DataFrame): frame to summarise.
            event_column (str): name of the column holding the target event.

        Returns:
            pandas.core.frame.DataFrame: one row per column of ``data`` with
            the columns ``feature_name``, ``dtype``, ``count``, ``nunique``,
            ``null``, ``not_null``, ``null_pct`` and ``nunique_pct``. Values
            are rounded to 4 decimal places; ``count`` and ``nunique`` are
            returned as ``int64``.
        """
        # Work on a private copy so the cast below does not leak to the caller.
        frame = data.copy(deep=True)
        n_rows = len(frame)

        # NOTE(review): the cast happens before counting, so depending on the
        # pandas version missing labels may be counted as the string "nan"
        # rather than as nulls -- confirm this is intended.
        frame[event_column] = frame[event_column].astype('str', errors='ignore')

        # One row per feature holding its non-null count and distinct count.
        counts = frame.agg(['count', 'nunique']).T
        counts = counts.reset_index()
        counts = counts.rename(columns={"index": "feature_name"})

        # Derive the null-related figures from the row total.
        n_missing = (n_rows - counts["count"]).astype('int64')
        counts["null"] = n_missing
        counts["not_null"] = n_rows - n_missing
        counts["null_pct"] = n_missing / n_rows
        counts["nunique_pct"] = counts["nunique"] / n_rows

        # Schema table: feature name next to its dtype (after the label cast).
        schema = frame.dtypes.to_frame(name="dtype")
        schema = schema.reset_index()
        schema = schema.rename(columns={"index": "feature_name"})

        summary = schema.merge(counts, on='feature_name', how='inner')
        summary = summary.round(4)

        # Guarantee integer dtypes for the two count columns.
        return summary.astype({'nunique': 'int64', 'count': 'int64'})