in smdebug/profiler/analysis/utils/pandas_data_analysis.py [0:0]
def get_step_statistics(self, by=StatsBy.TRAINING_PHASE):
    """
    Get average, minimum, maximum, p50, p95, p99 stats on step duration.

    A step's duration for a given group is computed as the latest
    ``end_time_us`` minus the earliest ``start_time_us`` among all rows of
    ``self.framework_metrics_df`` sharing the same ``step`` and group key.

    :param by: how the stats are grouped; must be of type StatsBy. The default,
        StatsBy.TRAINING_PHASE, keeps only rows whose ``framework_metric``
        contains "Step:ModeKeys" and groups them by ``framework_metric``.
        StatsBy.FRAMEWORK_METRICS and StatsBy.PROCESS group by the column
        named by the enum member's value.
    :return: a DataFrame with one row per group and the columns
        ``[by.value, duration_mean_us, duration_min_us, duration_max_us,
        duration_p50_us, duration_p95_us, duration_p99_us]``; None when ``by``
        is not a StatsBy instance or is a StatsBy member not handled here.
    """
    if not isinstance(by, StatsBy):
        get_logger().info(f"{by} should be of type StatsBy")
        return None
    by = by.value

    # Pick the rows to analyse and the column to group them by; everything
    # after this point is identical for every supported grouping.
    if by in [StatsBy.FRAMEWORK_METRICS.value, StatsBy.PROCESS.value]:
        # TODO: Consider that some processes may be optimized.
        # For example: data pipeline executed in parallel.
        metrics_df = self.framework_metrics_df
        group_col = by
    elif by == StatsBy.TRAINING_PHASE.value:
        metrics_df = self.framework_metrics_df[
            self.framework_metrics_df["framework_metric"].str.contains("Step:ModeKeys")
        ]
        group_col = "framework_metric"
    else:
        return None

    # Several rows may share a (step, group) pair (e.g. multi-processing):
    # collapse them into one span from the earliest start to the latest end.
    step_spans = (
        metrics_df.groupby(["step", group_col])
        .agg({"start_time_us": "min", "end_time_us": "max"})
        .reset_index()
    )
    step_spans["duration_us"] = step_spans["end_time_us"] - step_spans["start_time_us"]

    # describe() on a grouped Series already yields one row per group with the
    # statistic names as columns, so the wanted columns can be selected
    # directly ("count" and "std" are simply not picked).
    described = step_spans.groupby([group_col])["duration_us"].describe(
        percentiles=[0.5, 0.95, 0.99]
    )
    step_stats = described[["mean", "min", "max", "50%", "95%", "99%"]].reset_index()

    # The key column is always labelled with the requested grouping's value,
    # even when the data was grouped on "framework_metric".
    step_stats.columns = [
        by,
        "duration_mean_us",
        "duration_min_us",
        "duration_max_us",
        "duration_p50_us",
        "duration_p95_us",
        "duration_p99_us",
    ]
    return step_stats