in smdebug/profiler/analysis/utils/pandas_data_analysis.py [0:0]
def get_training_phase_intervals(self, phase=None):
    """
    This function splits framework data into before train, train, between train and eval, eval, and after eval.

    More generally, for the selected phases it returns one row for the span before the first
    phase interval, one row per phase interval, one "Between ..." row for the gap separating each
    pair of consecutive phase intervals, and one row for the span after the last phase interval.
    The outer bounds are the minimum and maximum ``timestamp_us`` of the system metrics DataFrame.
    When more than one phase is selected, consecutive rows of the same phase are merged into a
    single interval; with a single phase every matching row is kept as its own interval.
    :param phase: List of training phase to find intervals for. If nothing is mentioned, intervals
    are determined for all training phases available (process names containing "Step:ModeKeys").
    Type: string or List of strings
    :return: DataFrame with the columns start_time_us, end_time_us and phase, in the row order of
    the framework metrics DataFrame. None is returned (after logging) if phase is neither a
    string nor a list, or if none of the requested phases is available.
    """
    framework_df = self.framework_metrics_df
    process_list = framework_df["process"].unique()
    if phase is None:
        phase = [x for x in process_list if "Step:ModeKeys" in x]
    if isinstance(phase, str):
        phase = [phase]
    if not isinstance(phase, list):
        get_logger().info(f"{phase} should be a list of strings")
        return None

    # Filter out phases that are not available in process list
    phase = [x for x in phase if x in process_list]
    if not phase:
        get_logger().info(
            "None of the phase strings matched the phases available in the framework metrics DataFrame"
        )
        return None

    mode_df = framework_df[framework_df["framework_metric"].isin(phase)]
    if mode_df.empty:
        # Phases are validated against the "process" column but rows are selected through
        # "framework_metric"; without this guard the first/last interval lookups below fail.
        get_logger().info(
            "None of the requested phases have events in the framework metrics DataFrame"
        )
        return None

    if len(phase) > 1:
        # Merge every run of consecutive rows that share a phase into one interval. The run id
        # increases by one each time the metric differs from the one in the previous row.
        metric = mode_df["framework_metric"]
        run_id = metric.ne(metric.shift()).cumsum()
        mode_df = mode_df.groupby(run_id).agg(
            start_time_us=("start_time_us", "min"),
            end_time_us=("end_time_us", "max"),
            phase=("framework_metric", "first"),
        )
    else:
        mode_df = mode_df[["start_time_us", "end_time_us", "framework_metric"]].rename(
            columns={"framework_metric": "phase"}
        )

    intervals = list(mode_df.itertuples(index=False))
    first, last = intervals[0], intervals[-1]
    sys_timestamps = self.sys_metrics_df["timestamp_us"]

    def as_row(interval):
        return {
            "start_time_us": interval.start_time_us,
            "end_time_us": interval.end_time_us,
            "phase": interval.phase,
        }

    # Rows are collected in a plain list and converted once at the end; inserting rows into a
    # DataFrame one at a time re-allocates the frame on every insertion.
    rows = [
        {
            "start_time_us": sys_timestamps.min(),
            "end_time_us": first.start_time_us - 1,
            "phase": "Before " + first.phase,
        },
        as_row(first),
    ]
    for previous, current in zip(intervals, intervals[1:]):
        rows.append(
            {
                "start_time_us": previous.end_time_us + 1,
                "end_time_us": current.start_time_us - 1,
                # Names are sorted so the label does not depend on which phase ran first
                "phase": "Between " + " and ".join(sorted([previous.phase, current.phase])),
            }
        )
        rows.append(as_row(current))
    rows.append(
        {
            "start_time_us": last.end_time_us + 1,
            "end_time_us": sys_timestamps.max(),
            "phase": "After " + last.phase,
        }
    )
    return pd.DataFrame(rows, columns=["start_time_us", "end_time_us", "phase"])