in server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java [368:535]
/**
 * Creates every shuffle-server metric and stores each one in its static field.
 *
 * <p>Most metrics are created through {@code metricsManager}; the four per-application "top N"
 * gauges at the end are built directly against {@code metricsManager.getCollectorRegistry()}
 * with a single {@code app_id} label.
 *
 * @param serverConf server configuration; only {@link
 *     ShuffleServerConf#APP_LEVEL_SHUFFLE_BLOCK_SIZE_METRIC_BUCKETS} is read here, to obtain the
 *     bucket boundaries of the write-block-size histogram
 */
private static void setUpMetrics(ShuffleServerConf serverConf) {
  // ---- Data / file totals and storage usage ----
  counterTotalReceivedDataSize = metricsManager.addLabeledCounter(TOTAL_RECEIVED_DATA);
  counterTotalWriteDataSize = metricsManager.addLabeledCounter(TOTAL_WRITE_DATA);
  counterTotalDeleteDataSize = metricsManager.addLabeledCounter(TOTAL_DELETE_DATA);
  counterTotalFlushFileNum = metricsManager.addLabeledCounter(TOTAL_FLUSH_FILE_NUM);
  counterTotalDeleteFileNum = metricsManager.addLabeledCounter(TOTAL_DELETE_FILE_NUM);
  gaugeStorageUsedBytes = metricsManager.addLabeledGauge(STORAGE_USED_BYTES);
  gaugeFlushFileNum = metricsManager.addLabeledGauge(FLUSH_FILE_NUM);

  // ---- Write block size: counter plus an app-labeled histogram with configurable buckets ----
  counterTotalWriteBlockSize = metricsManager.addLabeledCounter(TOTAL_WRITE_BLOCK);
  appHistogramWriteBlockSize =
      metricsManager.addHistogram(
          WRITE_BLOCK_SIZE,
          ConfigUtils.convertBytesStringToDoubleArray(
              serverConf.get(ShuffleServerConf.APP_LEVEL_SHUFFLE_BLOCK_SIZE_METRIC_BUCKETS)),
          METRICS_APP_LABEL_NAME);

  // ---- Write timing / outcome counters ----
  counterTotalWriteTime = metricsManager.addLabeledCounter(TOTAL_WRITE_TIME);
  counterWriteException = metricsManager.addLabeledCounter(TOTAL_WRITE_EXCEPTION);
  counterWriteSlow = metricsManager.addLabeledCounter(TOTAL_WRITE_SLOW);
  counterWriteTotal = metricsManager.addLabeledCounter(TOTAL_WRITE_NUM);

  // ---- Event size threshold counters ----
  counterEventSizeThresholdLevel1 = metricsManager.addLabeledCounter(EVENT_SIZE_THRESHOLD_LEVEL1);
  counterEventSizeThresholdLevel2 = metricsManager.addLabeledCounter(EVENT_SIZE_THRESHOLD_LEVEL2);
  counterEventSizeThresholdLevel3 = metricsManager.addLabeledCounter(EVENT_SIZE_THRESHOLD_LEVEL3);
  counterEventSizeThresholdLevel4 = metricsManager.addLabeledCounter(EVENT_SIZE_THRESHOLD_LEVEL4);

  // ---- Read counters ----
  counterTotalReadDataSize = metricsManager.addLabeledCounter(TOTAL_READ_DATA);
  counterTotalReadLocalDataFileSize = metricsManager.addLabeledCounter(TOTAL_READ_LOCAL_DATA_FILE);
  counterTotalReadLocalIndexFileSize =
      metricsManager.addLabeledCounter(TOTAL_READ_LOCAL_INDEX_FILE);
  counterTotalReadMemoryDataSize = metricsManager.addLabeledCounter(TOTAL_READ_MEMORY_DATA);
  counterTotalReadTime = metricsManager.addLabeledCounter(TOTAL_READ_TIME);

  // ---- Dropped / failed event counters ----
  counterTotalDroppedEventNum = metricsManager.addLabeledCounter(TOTAL_DROPPED_EVENT_NUM);
  counterTotalFailedWrittenEventNum =
      metricsManager.addLabeledCounter(TOTAL_FAILED_WRITTEN_EVENT_NUM);

  // ---- Written data size, labeled by tag + storage host (Hadoop) or by local disk path ----
  counterTotalHadoopWriteDataSize =
      metricsManager.addCounter(
          TOTAL_HADOOP_WRITE_DATA, Constants.METRICS_TAG_LABEL_NAME, STORAGE_HOST_LABEL);
  counterTotalHadoopWriteDataSizeForHugePartition =
      metricsManager.addCounter(
          TOTAL_HADOOP_WRITE_DATA_FOR_HUGE_PARTITION,
          Constants.METRICS_TAG_LABEL_NAME,
          STORAGE_HOST_LABEL);
  counterTotalLocalFileWriteDataSize =
      metricsManager.addCounter(TOTAL_LOCALFILE_WRITE_DATA, LOCAL_DISK_PATH_LABEL);

  // ---- Require-buffer failure counters ----
  counterTotalRequireBufferFailed = metricsManager.addLabeledCounter(TOTAL_REQUIRE_BUFFER_FAILED);
  counterTotalRequireBufferFailedForRegularPartition =
      metricsManager.addLabeledCounter(TOTAL_REQUIRE_BUFFER_FAILED_FOR_REGULAR_PARTITION);
  counterTotalRequireBufferFailedForHugePartition =
      metricsManager.addLabeledCounter(TOTAL_REQUIRE_BUFFER_FAILED_FOR_HUGE_PARTITION);

  // ---- Local storage write attempt counters ----
  counterLocalStorageTotalWrite = metricsManager.addLabeledCounter(STORAGE_TOTAL_WRITE_LOCAL);
  counterLocalStorageRetryWrite = metricsManager.addLabeledCounter(STORAGE_RETRY_WRITE_LOCAL);
  counterLocalStorageFailedWrite = metricsManager.addLabeledCounter(STORAGE_FAILED_WRITE_LOCAL);
  counterLocalStorageSuccessWrite = metricsManager.addLabeledCounter(STORAGE_SUCCESS_WRITE_LOCAL);

  // ---- Remote storage write attempt counters, labeled by tag + storage host ----
  counterRemoteStorageTotalWrite =
      metricsManager.addCounter(
          STORAGE_TOTAL_WRITE_REMOTE, Constants.METRICS_TAG_LABEL_NAME, STORAGE_HOST_LABEL);
  counterRemoteStorageRetryWrite =
      metricsManager.addCounter(
          STORAGE_RETRY_WRITE_REMOTE, Constants.METRICS_TAG_LABEL_NAME, STORAGE_HOST_LABEL);
  counterRemoteStorageFailedWrite =
      metricsManager.addCounter(
          STORAGE_FAILED_WRITE_REMOTE, Constants.METRICS_TAG_LABEL_NAME, STORAGE_HOST_LABEL);
  counterRemoteStorageSuccessWrite =
      metricsManager.addCounter(
          STORAGE_SUCCESS_WRITE_REMOTE, Constants.METRICS_TAG_LABEL_NAME, STORAGE_HOST_LABEL);

  // ---- Require-read-memory counters ----
  counterTotalRequireReadMemoryNum = metricsManager.addLabeledCounter(TOTAL_REQUIRE_READ_MEMORY);
  counterTotalRequireReadMemoryRetryNum =
      metricsManager.addLabeledCounter(TOTAL_REQUIRE_READ_MEMORY_RETRY);
  counterTotalRequireReadMemoryFailedNum =
      metricsManager.addLabeledCounter(TOTAL_REQUIRE_READ_MEMORY_FAILED);

  // ---- App / partition totals ----
  counterTotalAppNum = metricsManager.addLabeledCounter(TOTAL_APP_NUM);
  counterTotalAppWithHugePartitionNum =
      metricsManager.addLabeledCounter(TOTAL_APP_WITH_HUGE_PARTITION_NUM);
  counterTotalPartitionNum = metricsManager.addLabeledCounter(TOTAL_PARTITION_NUM);
  counterTotalHugePartitionNum = metricsManager.addLabeledCounter(TOTAL_HUGE_PARTITION_NUM);
  counterTotalHugePartitionExceedHardLimitNum =
      metricsManager.addLabeledCounter(TOTAL_HUGE_PARTITION_EXCEED_HARD_LIMIT_NUM);
  counterLocalRenameAndDeletionFaileTd =
      metricsManager.addLabeledCounter(TOTAL_LOCAL_RENAME_AND_DELETION_FAILED);

  // ---- Local storage gauges (the first two are labeled by local disk path) ----
  gaugeLocalStorageIsWritable =
      metricsManager.addGauge(LOCAL_STORAGE_IS_WRITABLE, LOCAL_DISK_PATH_LABEL);
  gaugeLocalStorageIsTimeout =
      metricsManager.addGauge(LOCAL_STORAGE_IS_TIMEOUT, LOCAL_DISK_PATH_LABEL);
  gaugeLocalStorageTotalDirsNum = metricsManager.addLabeledGauge(LOCAL_STORAGE_TOTAL_DIRS_NUM);
  gaugeLocalStorageCorruptedDirsNum =
      metricsManager.addLabeledGauge(LOCAL_STORAGE_CORRUPTED_DIRS_NUM);
  gaugeLocalStorageTotalSpace = metricsManager.addLabeledGauge(LOCAL_STORAGE_TOTAL_SPACE);
  gaugeLocalStorageWholeDiskUsedSpace =
      metricsManager.addLabeledGauge(LOCAL_STORAGE_WHOLE_DISK_USED_SPACE);
  gaugeLocalStorageServiceUsedSpace =
      metricsManager.addLabeledGauge(LOCAL_STORAGE_SERVICE_USED_SPACE);
  gaugeLocalStorageUsedSpaceRatio = metricsManager.addLabeledGauge(LOCAL_STORAGE_USED_SPACE_RATIO);

  // ---- Health, buffer and handler gauges ----
  gaugeIsHealthy = metricsManager.addLabeledGauge(IS_HEALTHY);
  gaugeAllocatedBufferSize = metricsManager.addLabeledGauge(ALLOCATED_BUFFER_SIZE);
  gaugeInFlushBufferSize = metricsManager.addLabeledGauge(IN_FLUSH_BUFFER_SIZE);
  gaugeUsedBufferSize = metricsManager.addLabeledGauge(USED_BUFFER_SIZE);
  gaugeReadBufferUsedSize = metricsManager.addLabeledGauge(READ_USED_BUFFER_SIZE);
  gaugeWriteHandler = metricsManager.addLabeledGauge(TOTAL_WRITE_HANDLER);

  // ---- Queue size gauges ----
  gaugeMergeEventQueueSize = metricsManager.addLabeledGauge(MERGE_EVENT_QUEUE_SIZE);
  gaugeHadoopFlushThreadPoolQueueSize =
      metricsManager.addLabeledGauge(HADOOP_FLUSH_THREAD_POOL_QUEUE_SIZE);
  gaugeLocalfileFlushThreadPoolQueueSize =
      metricsManager.addLabeledGauge(LOCALFILE_FLUSH_THREAD_POOL_QUEUE_SIZE);
  gaugeFallbackFlushThreadPoolQueueSize =
      metricsManager.addLabeledGauge(FALLBACK_FLUSH_THREAD_POOL_QUEUE_SIZE);

  // ---- Per-node app / partition gauges ----
  gaugeAppNum = metricsManager.addLabeledGauge(APP_NUM_WITH_NODE);
  gaugeTotalPartitionNum = metricsManager.addLabeledGauge(PARTITION_NUM_WITH_NODE);

  // ---- Read thread-count and read buffer-size gauges ----
  gaugeReadLocalDataFileThreadNum = metricsManager.addLabeledGauge(READ_LOCAL_DATA_FILE_THREAD_NUM);
  gaugeReadLocalIndexFileThreadNum =
      metricsManager.addLabeledGauge(READ_LOCAL_INDEX_FILE_THREAD_NUM);
  gaugeReadMemoryDataThreadNum = metricsManager.addLabeledGauge(READ_MEMORY_DATA_THREAD_NUM);
  gaugeReadLocalDataFileBufferSize =
      metricsManager.addLabeledGauge(READ_LOCAL_DATA_FILE_BUFFER_SIZE);
  gaugeReadLocalIndexFileBufferSize =
      metricsManager.addLabeledGauge(READ_LOCAL_INDEX_FILE_BUFFER_SIZE);
  gaugeReadMemoryDataBufferSize = metricsManager.addLabeledGauge(READ_MEMORY_DATA_BUFFER_SIZE);

  // ---- Huge partition gauges ----
  gaugeHugePartitionNum = metricsManager.addLabeledGauge(HUGE_PARTITION_NUM);
  gaugeAppWithHugePartitionNum = metricsManager.addLabeledGauge(APP_WITH_HUGE_PARTITION_NUM);

  // ---- Counters registered via addCounter without extra label names ----
  counterLocalFileEventFlush = metricsManager.addCounter(LOCAL_FILE_EVENT_FLUSH_NUM);
  counterHadoopEventFlush = metricsManager.addCounter(HADOOP_EVENT_FLUSH_NUM);
  counterPreAllocatedBufferExpired =
      metricsManager.addCounter(TOTAL_EXPIRED_PRE_ALLOCATED_BUFFER_NUM);
  counterAppNotFound = metricsManager.addCounter(TOTAL_APP_NOT_FOUND_NUM);

  // ---- Resource removal timing summaries ----
  summaryTotalRemoveResourceTime = metricsManager.addSummary(TOTAL_REMOVE_RESOURCE_TIME);
  summaryTotalRemoveResourceByShuffleIdsTime =
      metricsManager.addSummary(TOTAL_REMOVE_RESOURCE_BY_SHUFFLE_IDS_TIME);

  // ---- Per-app "top N" data size gauges, built directly on the collector registry ----
  gaugeTotalDataSizeUsage =
      registerTopNAppGauge(
          TOPN_OF_TOTAL_DATA_SIZE_FOR_APP, "top N of total shuffle data size for app level");
  gaugeInMemoryDataSizeUsage =
      registerTopNAppGauge(
          TOPN_OF_IN_MEMORY_DATA_SIZE_FOR_APP,
          "top N of in memory shuffle data size for app level");
  gaugeOnDiskDataSizeUsage =
      registerTopNAppGauge(
          TOPN_OF_ON_LOCALFILE_DATA_SIZE_FOR_APP,
          "top N of on disk shuffle data size for app level");
  gaugeOnHadoopDataSizeUsage =
      registerTopNAppGauge(
          TOPN_OF_ON_HADOOP_DATA_SIZE_FOR_APP,
          "top N of on hadoop shuffle data size for app level");
}

/**
 * Builds a gauge carrying a single {@code app_id} label and registers it with the collector
 * registry exposed by {@code metricsManager}.
 *
 * @param metricName name of the gauge
 * @param helpText help string attached to the gauge
 * @return the registered gauge
 */
private static Gauge registerTopNAppGauge(String metricName, String helpText) {
  return Gauge.build()
      .name(metricName)
      .help(helpText)
      .labelNames("app_id")
      .register(metricsManager.getCollectorRegistry());
}