def get_utilization_stats()

in smdebug/profiler/analysis/utils/pandas_data_analysis.py [0:0]


    def get_utilization_stats(self, resource=None, by=None, phase=None):
        """
        Get utilization stats (mean, min, max, p50, p95, p99) per resource and node.

        The stats are computed from the "value" column of ``self.sys_metrics_df``,
        restricted to rows whose "type" column contains the resource's value and
        grouped by "nodeID" (and additionally by "phase" when by=StatsBy.TRAINING_PHASE).

        :param resource: system resource(s) for which utilization stats have to be computed.
        Either a single Resource or a list of Resource. If None, stats are computed for
        CPU, GPU, MEMORY, IO and NETWORK.
        :param by: By default, get overall utilization stats. When by=StatsBy.TRAINING_PHASE,
        utilization stats are provided per training phase interval. Type: StatsBy
        :param phase: List of training phase to find intervals for. If nothing is mentioned, intervals
        are determined for all training phases available. Only used when
        by=StatsBy.TRAINING_PHASE.
        :return: Dataframe with columns "Resource", ["Training_phase",] "nodeID",
        "utilization_mean", "utilization_min", "utilization_max", "utilization_p50",
        "utilization_p95", "utilization_p99". The dataframe is empty (but has these
        columns) when no system metrics match the requested resources. Returns None
        (after logging) when `by` or `resource` has an invalid type.
        """
        if (by is not None) and (not isinstance(by, StatsBy)):
            get_logger().info(f"{by} should be of type StatsBy")
            return None
        if (resource is not None) and (not isinstance(resource, (list, Resource))):
            get_logger().info(f"{resource} should be of type list or Resource")
            return None

        if resource is None:
            resources = [
                Resource.CPU.value,
                Resource.GPU.value,
                Resource.MEMORY.value,
                Resource.IO.value,
                Resource.NETWORK.value,
            ]
        else:
            if isinstance(resource, Resource):
                resource = [resource]
            # Reject lists with foreign items up front instead of failing later with an
            # AttributeError on `.value`; same log-and-return-None convention as above.
            if not all(isinstance(x, Resource) for x in resource):
                get_logger().info(f"{resource} should only contain items of type Resource")
                return None
            resources = [x.value for x in resource]

        by_phase = by == StatsBy.TRAINING_PHASE
        if by_phase:
            interval_df = self.get_training_phase_intervals(phase)
            # NOTE(review): this call is presumably what populates the "phase" column of
            # self.sys_metrics_df that is grouped on below -- confirm against the helper.
            self._get_utilization_phase_by_time_interval(interval_df)

        # Final column names of the result, and the keys to group the metrics by.
        columns = [
            "Resource",
            "nodeID",
            "utilization_mean",
            "utilization_min",
            "utilization_max",
            "utilization_p50",
            "utilization_p95",
            "utilization_p99",
        ]
        group_keys = ["nodeID"]
        if by_phase:
            columns.insert(1, "Training_phase")
            group_keys.insert(0, "phase")

        # Columns produced by describe() that are kept, in output order
        # ("count" and "std" are intentionally left out).
        stat_columns = ["mean", "min", "max", "50%", "95%", "99%"]

        df_for_concat = []
        for resrc in resources:
            sys_resrc_df = self.sys_metrics_df[
                self.sys_metrics_df["type"].str.contains(resrc)
            ].reset_index()
            if sys_resrc_df.empty:
                # there's no data for this resource
                continue

            stats_df = (
                sys_resrc_df.groupby(group_keys)["value"]
                .describe(percentiles=[0.5, 0.95, 0.99])
                .reset_index()
            )
            stats_df = stats_df[group_keys + stat_columns]
            # Tag every row with the resource it belongs to (first column of the result).
            stats_df.insert(0, "Resource", resrc)
            df_for_concat.append(stats_df)

        if not df_for_concat:
            # pd.concat raises ValueError on an empty list; return an empty frame with
            # the expected schema so callers can handle "no data" uniformly.
            return pd.DataFrame(columns=columns)

        util_stats = pd.concat(df_for_concat).reset_index(drop=True)
        util_stats.columns = columns
        return util_stats