in src/hpcadvisor/batch_handler.py [0:0]
def _list_cpu_metrics_with_retry(client, resource_id, timespan, attempts, retry_delay):
    """Query the "Percentage CPU" metric, retrying when the call raises.

    Args:
        client: Monitor client exposing ``metrics.list``.
        resource_id: Resource to query.
        timespan: ``"<start>/<end>"`` string passed through to the query.
        attempts: Maximum number of calls to make.
        retry_delay: Seconds to sleep after a failed call.

    Returns:
        The response of ``client.metrics.list``, or ``None`` when every
        attempt raised.
    """
    for attempt in range(1, attempts + 1):
        log.debug(f"data collection: attempt {attempt}/{attempts}")
        try:
            return client.metrics.list(
                resource_id,
                timespan=timespan,
                interval="PT1M",
                metricnames="Percentage CPU",
                aggregation="average",
            )
        except Exception as e:
            # Best effort: log the failure and try again after a short pause.
            log.error(f"Error getting metrics data: {e}")
            time.sleep(retry_delay)
    return None


def _get_monitoring_data(batch_client, poolid, jobid, taskid):
    """Collect the average CPU usage of the pool resources for a completed task.

    The task's start/end times are used as the query window for the
    "Percentage CPU" metric (1-minute interval, average aggregation) of
    every resource id returned by ``get_vmss_batch_resource_ids``.

    Args:
        batch_client: Batch client used to fetch the task and the pool resources.
        poolid: Id of the pool whose resources are queried.
        jobid: Id of the job the task belongs to.
        taskid: Id of the task to inspect.

    Returns:
        ``None`` when the task is not in the completed state; otherwise a list
        with one float (average CPU) per resource for which data could be
        collected. The list is empty when the task ran for one minute or less.
        A resource that yields no data after all retry rounds is skipped
        (with a warning) instead of blocking forever.
    """
    collection_attempts = 3  # calls per round before giving up on exceptions
    max_collection_rounds = 60  # upper bound on rounds waiting for data
    retry_delay_secs = 5  # pause after a failed call / an empty round

    task = batch_client.task.get(jobid, taskid)
    if task.state != batchmodels.TaskState.completed:
        log.debug(f"task not completed state={task.state} taskid={taskid}")
        return

    credentials = DefaultAzureCredential()
    subscription_id = env["SUBSCRIPTION"]
    client = MonitorManagementClient(credentials, subscription_id)

    execution_info = task.execution_info

    # Azure monitor logs metrics by UTC time
    start_time = str(execution_info.start_time).replace("+00:00", "")
    end_time = str(execution_info.end_time).replace("+00:00", "")

    task_duration = execution_info.end_time - execution_info.start_time
    log.info(f"task duration={task_duration}")

    resource_ids = get_vmss_batch_resource_ids(batch_client, poolid)
    cpu_usage_list = []

    # avoid error for tasks that run less than 1 minute
    # for client.metrics.list
    min_end_time = execution_info.start_time + timedelta(minutes=1)
    if min_end_time >= execution_info.end_time:
        log.warning(
            f"task {taskid} duration is less than 1 minute, skipping cpu usage check"
        )
        return cpu_usage_list

    # The query window is the same for every resource and every attempt.
    timespan = "{}/{}".format(start_time, end_time)

    for resource_id in resource_ids:
        log.debug(f"collecting cpu usage data for resource_id={resource_id}")

        average_cpu = None
        for round_number in range(1, max_collection_rounds + 1):
            metrics_data = _list_cpu_metrics_with_retry(
                client, resource_id, timespan, collection_attempts, retry_delay_secs
            )
            log.debug(f"Metric data: {metrics_data}")

            # Only hand a real response to the helper; when every attempt
            # raised there is nothing to average.
            if metrics_data is not None:
                average_cpu = _get_average_value(metrics_data)
            log.debug(f"average_cpu={average_cpu}")

            if average_cpu is not None:
                break

            log.debug("average_cpu is None, retrying...")
            # Pause before asking again so an empty result does not turn
            # into a tight loop against the monitoring API.
            if metrics_data is not None and round_number < max_collection_rounds:
                time.sleep(retry_delay_secs)

        if average_cpu is None:
            log.warning(
                f"no cpu usage data for resource_id={resource_id} "
                f"after {max_collection_rounds} rounds, skipping"
            )
            continue

        cpu_usage_list.append(float(average_cpu))

    return cpu_usage_list