def _get_monitoring_data()

in src/hpcadvisor/batch_handler.py [0:0]


def _get_monitoring_data(batch_client, poolid, jobid, taskid):
    """Collect the average CPU usage of a completed task, per pool resource.

    The task's start/end times are used as the timespan of an Azure Monitor
    "Percentage CPU" query (1-minute interval, average aggregation) issued
    against every resource id returned by ``get_vmss_batch_resource_ids``.

    Args:
        batch_client: Batch client used to fetch the task (``task.get``) and
            passed on to ``get_vmss_batch_resource_ids``.
        poolid: Id of the pool whose resources are queried.
        jobid: Id of the job the task belongs to.
        taskid: Id of the task to collect metrics for.

    Returns:
        ``None`` if the task is not in the completed state. Otherwise a list
        of floats with one average CPU value per resource for which a value
        could be obtained. The list is empty when the task ran for one
        minute or less (the metrics query is skipped in that case), and a
        resource is left out when no average could be obtained within the
        bounded number of collection rounds.
    """
    # Attempts per round at calling the metrics API before giving up on it.
    collection_attempts = 3
    attempt_wait_seconds = 5
    # Rounds spent waiting for an average to become available. The wait was
    # previously unbounded and had no pause between rounds, so a resource
    # that never produced data would spin forever against the API.
    max_collection_rounds = 10
    round_wait_seconds = 30

    task = batch_client.task.get(jobid, taskid)

    if task.state != batchmodels.TaskState.completed:
        log.debug(f"task not completed state={task.state} taskid={taskid}")
        return None

    credentials = DefaultAzureCredential()
    subscription_id = env["SUBSCRIPTION"]
    client = MonitorManagementClient(credentials, subscription_id)

    # Azure monitor logs metrics by UTC time, so the "+00:00" offset suffix
    # is stripped from the string form of the timestamps.
    start_time = str(task.execution_info.start_time).replace("+00:00", "")
    end_time = str(task.execution_info.end_time).replace("+00:00", "")

    task_duration = task.execution_info.end_time - task.execution_info.start_time
    log.info(f"task duration={task_duration}")

    resource_ids = get_vmss_batch_resource_ids(batch_client, poolid)

    min_end_time = task.execution_info.start_time + timedelta(minutes=1)

    cpu_usage_list = []

    # avoid error for tasks that run less than 1 minute
    # for client.metrics.list
    if min_end_time >= task.execution_info.end_time:
        log.warning(
            f"task {taskid} duration is less than 1 minute, skipping cpu usage check"
        )
        return cpu_usage_list

    # The timespan is the same for every resource and every attempt.
    timespan = "{}/{}".format(start_time, end_time)

    for resource_id in resource_ids:
        log.debug(f"collecting cpu usage data for resource_id={resource_id}")
        average_cpu = None

        for collection_round in range(1, max_collection_rounds + 1):
            metrics_data = None
            for i in range(collection_attempts):
                log.debug(f"data collection: attempt {i+1}/{collection_attempts}")
                try:
                    metrics_data = client.metrics.list(
                        resource_id,
                        timespan=timespan,
                        interval="PT1M",
                        metricnames="Percentage CPU",
                        aggregation="average",
                    )
                    break
                except Exception as e:
                    log.error(f"Error getting metrics data: {e}")
                    # No point in sleeping after the last attempt of a round.
                    if i + 1 < collection_attempts:
                        time.sleep(attempt_wait_seconds)

            log.debug(f"Metric data: {metrics_data}")
            # Only hand a real response to the helper; when every attempt
            # failed there is nothing to average.
            if metrics_data is not None:
                average_cpu = _get_average_value(metrics_data)
            log.debug(f"average_cpu={average_cpu}")

            if average_cpu is not None:
                break

            if collection_round < max_collection_rounds:
                log.debug("average_cpu is None, retrying...")
                # Give the monitoring backend time before asking again.
                time.sleep(round_wait_seconds)

        if average_cpu is None:
            log.error(
                f"no cpu usage data for resource_id={resource_id} after "
                f"{max_collection_rounds} collection rounds, skipping"
            )
            continue

        cpu_usage_list.append(float(average_cpu))

    return cpu_usage_list