in smdebug/profiler/analysis/utils/pandas_data_analysis.py [0:0]
def get_utilization_stats(self, resource=None, by=None, phase=None):
    """
    Get CPU/GPU utilization stats.
    :param resource: System resource for which utilization stats are computed. Type: Resource
    :param by: By default, overall utilization stats are returned. When by=StatsBy.TRAINING_PHASE,
    utilization stats are provided per training phase interval. Type: StatsBy
    :param phase: List of training phases to find intervals for. If not specified, intervals
    are determined for all available training phases.
    :return: DataFrame containing utilization stats
    """
    if (by is not None) and (not isinstance(by, StatsBy)):
        get_logger().info(f"{by} should be of type StatsBy")
        return None
    if (resource is not None) and (not isinstance(resource, (list, Resource))):
        get_logger().info(f"{resource} should be of type list or Resource")
        return None

    if resource is None:
        resources = [
            Resource.CPU.value,
            Resource.GPU.value,
            Resource.MEMORY.value,
            Resource.IO.value,
            Resource.NETWORK.value,
        ]
    else:
        if isinstance(resource, Resource):
            resource = [resource]
        resources = [x.value for x in resource]

    if by == StatsBy.TRAINING_PHASE:
        # Tag system metric rows with the training phase interval they fall in,
        # so they can be grouped by the "phase" column below.
        interval_df = self.get_training_phase_intervals(phase)
        self._get_utilization_phase_by_time_interval(interval_df)

    df_for_concat = []
    columns = [
        "Resource",
        "nodeID",
        "utilization_mean",
        "utilization_min",
        "utilization_max",
        "utilization_p50",
        "utilization_p95",
        "utilization_p99",
    ]
    for resrc in resources:
        sys_resrc_df = self.sys_metrics_df[
            self.sys_metrics_df["type"].str.contains(resrc)
        ].reset_index()
        if sys_resrc_df.empty:
            # There is no data for this resource.
            continue
        if by == StatsBy.TRAINING_PHASE:
            groupby = first_column_name = "phase"
        else:
            # Group all rows into a single bucket named after the resource;
            # after reset_index() the unnamed grouping level becomes "level_0".
            groupby = lambda _: resrc
            first_column_name = "level_0"
        sys_resrc_df = (
            sys_resrc_df.groupby([groupby, "nodeID"])["value"]
            .describe(percentiles=[0.5, 0.95, 0.99])
            .reset_index()
        )
        sys_resrc_df.columns.name = ""
        sys_resrc_df = sys_resrc_df.drop(["count", "std"], axis="columns")
        sys_resrc_df = sys_resrc_df[
            [first_column_name, "nodeID", "mean", "min", "max", "50%", "95%", "99%"]
        ]
        if by == StatsBy.TRAINING_PHASE:
            sys_resrc_df.insert(0, "Resource", resrc)
        df_for_concat.append(sys_resrc_df)

    if by == StatsBy.TRAINING_PHASE:
        columns.insert(1, "Training_phase")
    util_stats = pd.concat(df_for_concat).reset_index(drop=True)
    util_stats.columns = columns
    return util_stats
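
# --- Usage sketch (illustrative, not part of the source file) ---
# Assumes an already-constructed analysis object from this module with a
# populated sys_metrics_df; the variable name `pf_analysis` is hypothetical.

# Overall GPU utilization stats, one row per node:
gpu_stats = pf_analysis.get_utilization_stats(resource=Resource.GPU)

# CPU and GPU utilization broken down per training phase interval:
phase_stats = pf_analysis.get_utilization_stats(
    resource=[Resource.CPU, Resource.GPU], by=StatsBy.TRAINING_PHASE
)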
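
# --- Standalone sketch of the aggregation pattern used above ---
# A minimal, self-contained illustration (toy data, hypothetical values) of how
# groupby + describe(percentiles=[...]) yields the mean/min/max/50%/95%/99%
# columns that get_utilization_stats later renames to the utilization_* columns.
import pandas as pd

toy = pd.DataFrame(
    {
        "type": ["cpu"] * 3 + ["gpu"] * 3,
        "nodeID": ["algo-1"] * 6,
        "value": [10.0, 55.0, 90.0, 20.0, 60.0, 95.0],
    }
)
stats = (
    toy.groupby(["type", "nodeID"])["value"]
    .describe(percentiles=[0.5, 0.95, 0.99])
    .reset_index()
    .drop(["count", "std"], axis="columns")
)
print(stats[["type", "nodeID", "mean", "min", "max", "50%", "95%", "99%"]])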