def get_training_phase

def get_training_phase_intervals()

in smdebug/profiler/analysis/utils/pandas_data_analysis.py [0:0]
63 lines of code
13 McCabe index (conditional complexity)

    def get_training_phase_intervals(self, phase=None):
        """
        Split the profiled timeline into intervals around the training phases.

        This function splits framework data into before train, train, between train and eval,
        eval, and after eval. More generally the result contains a "Before <first phase>" row,
        one row per phase interval, a "Between <a> and <b>" row for every gap between two
        neighbouring phase intervals, and an "After <last phase>" row.

        :param phase: Training phase(s) to find intervals for. If nothing is mentioned, intervals
        are determined for all training phases available. Type: string or List of strings
        :return: DataFrame with the columns "start_time_us", "end_time_us" and "phase", with rows
        in the order in which the phases appear in the framework metrics DataFrame. None is
        returned (after logging the reason) if `phase` is neither a string nor a list, or if
        none of the requested phases is present in the framework metrics DataFrame.
        """
        metrics_df = self.framework_metrics_df
        process_list = metrics_df["process"].unique()
        if phase is None:
            # Skip non-string entries (e.g. missing values) instead of failing on `in`.
            phase = [x for x in process_list if isinstance(x, str) and "Step:ModeKeys" in x]

        if isinstance(phase, str):
            phase = [phase]

        if not isinstance(phase, list):
            get_logger().info(f"{phase} should be a list of strings")
            return None

        # Filter out phases that are not available in process list
        phase = [x for x in phase if x in process_list]

        if not phase:
            get_logger().info(
                "None of the phase strings matched the phases available in the framework metrics DataFrame"
            )
            return None

        mode_df = metrics_df[metrics_df["framework_metric"].isin(phase)]
        if mode_df.empty:
            # The phases were validated against the "process" column, but the rows are selected
            # through "framework_metric". Without this guard an empty selection would fail with
            # a KeyError when the first interval is read below.
            get_logger().info(
                "No framework metrics rows matched the requested phases; cannot compute intervals"
            )
            return None

        if len(phase) > 1:
            # Collapse every run of consecutive rows that share a phase into one interval.
            # The run id grows by one each time the phase differs from the previous row's.
            metric = mode_df["framework_metric"]
            run_id = metric.ne(metric.shift()).cumsum()
            mode_df = (
                mode_df.groupby(run_id)
                .apply(
                    lambda x: pd.DataFrame(
                        {
                            "start_time_us": [x["start_time_us"].min()],
                            "end_time_us": [x["end_time_us"].max()],
                            "phase": [x["framework_metric"].iloc[0]],
                        }
                    )
                )
                .reset_index(drop=True)
            )
        else:
            # With a single phase every matching row is kept as its own interval.
            mode_df = mode_df[["start_time_us", "end_time_us", "framework_metric"]].reset_index(
                drop=True
            )
            mode_df = mode_df.rename(columns={"framework_metric": "phase"})

        starts = mode_df["start_time_us"].tolist()
        ends = mode_df["end_time_us"].tolist()
        names = mode_df["phase"].tolist()
        timestamps = self.sys_metrics_df["timestamp_us"]

        # Build all rows in one pass and create the DataFrame once. Inserting rows one by one
        # with `.loc` re-allocates the frame on every insert, which is quadratic in the number
        # of intervals.
        rows = [(timestamps.min(), starts[0] - 1, "Before " + names[0])]
        for i, interval in enumerate(zip(starts, ends, names)):
            if i > 0:
                # Gap between the previous interval and this one. Every row here already
                # belongs to a requested phase, so a gap row is always emitted.
                gap_name = "Between " + " and ".join(sorted([names[i - 1], interval[2]]))
                rows.append((ends[i - 1] + 1, interval[0] - 1, gap_name))
            rows.append(interval)
        rows.append((ends[-1] + 1, timestamps.max(), "After " + names[-1]))

        return pd.DataFrame(rows, columns=["start_time_us", "end_time_us", "phase"])