def get_step_statistics()

in smdebug/profiler/analysis/utils/pandas_data_analysis.py [0:0]


    def get_step_statistics(self, by=StatsBy.TRAINING_PHASE):
        """
        Get average, minimum, maximum, p50, p95, p99 stats on step duration.

        The duration of a (step, group) pair is the span between the earliest
        ``start_time_us`` and the latest ``end_time_us`` among the rows of
        ``self.framework_metrics_df`` that belong to that pair. The statistics are
        then computed over those durations, per group.

        :param by: how the stats are grouped; should be of type StatsBy. By default
        stats are grouped by training phase (StatsBy.TRAINING_PHASE), which only
        considers rows whose ``framework_metric`` contains "Step:ModeKeys". The other
        options are StatsBy.FRAMEWORK_METRICS and StatsBy.PROCESS, which group every
        row by the column named by the enum value.
        :return: a DataFrame with one row per group and the columns ``by.value``,
        duration_mean_us, duration_min_us, duration_max_us, duration_p50_us,
        duration_p95_us, duration_p99_us (empty if there are no matching rows), or
        None if ``by`` is not a StatsBy or is a StatsBy that is not handled here.
        """
        if not isinstance(by, StatsBy):
            get_logger().info(f"{by} should be of type StatsBy")
            return None

        by = by.value
        if by in [StatsBy.FRAMEWORK_METRICS.value, StatsBy.PROCESS.value]:
            # TODO: Consider that some processes may be optimized.
            # For example: data pipeline executed in parallel.
            metrics_df = self.framework_metrics_df
            group_col = by
        elif by == StatsBy.TRAINING_PHASE.value:
            # Training phases are read from the framework_metric column, restricted
            # to the "Step:ModeKeys" entries.
            metrics_df = self.framework_metrics_df[
                self.framework_metrics_df["framework_metric"].str.contains("Step:ModeKeys")
            ]
            group_col = "framework_metric"
        else:
            return None

        # A (step, group) pair can span several rows (multi-processing): collapse
        # them into a single interval from the earliest start to the latest end.
        spans_df = (
            metrics_df.groupby(["step", group_col])
            .agg({"start_time_us": "min", "end_time_us": "max"})
            .reset_index()
        )
        spans_df["duration_us"] = spans_df["end_time_us"] - spans_df["start_time_us"]

        stat_columns = ["mean", "min", "max", "50%", "95%", "99%"]
        if spans_df.empty:
            # Nothing to describe: return an empty frame that still has the
            # expected columns instead of failing on the column selection below.
            step_stats = spans_df.reindex(columns=[group_col] + stat_columns)
        else:
            # describe() already yields one row per group and one column per
            # statistic; keep only the statistics we report ("count" and "std"
            # are dropped) in the output order.
            step_stats = (
                spans_df.groupby(group_col)["duration_us"]
                .describe(percentiles=[0.5, 0.95, 0.99])[stat_columns]
                .reset_index()
            )

        # The first column is always named after the requested grouping, even for
        # training phases, which are grouped on the framework_metric column.
        step_stats.columns = [
            by,
            "duration_mean_us",
            "duration_min_us",
            "duration_max_us",
            "duration_p50_us",
            "duration_p95_us",
            "duration_p99_us",
        ]
        return step_stats