# software/components/observability/subnet_monitoring.yaml
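# A typical way to apply this ConfigMap (assuming kubectl is pointed at the target cluster;
# the path below is simply this file's location in the repo):
#   kubectl apply -f software/components/observability/subnet_monitoring.yaml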
apiVersion: v1
kind: ConfigMap
metadata:
name: container-azm-ms-agentconfig
namespace: kube-system
data:
schema-version:
#string. Used by the agent to parse the config. Supported versions are {v1}. Configs with other schema versions will be rejected by the agent.
v1
config-version:
#string. Used by the customer to keep track of this config file's version in their source control/repository (max allowed 10 chars; other chars will be truncated)
ver1
log-data-collection-settings: |-
# Log data collection settings
# Any errors related to config map settings can be found in the KubeMonAgentEvents table in the Log Analytics workspace that the cluster is sending data to.
[log_collection_settings]
[log_collection_settings.stdout]
# In the absence of this configmap, the default value for enabled is true
enabled = true
# The exclude_namespaces setting applies only if enabled is set to true
# Log collection for kube-system and gatekeeper-system is disabled by default in the absence of the 'log_collection_settings.stdout' setting. If you want to enable it for kube-system and gatekeeper-system, remove them from the following setting.
# If you want to keep kube-system and gatekeeper-system log collection disabled, leave those namespaces in the following setting and add any other namespace whose log collection you want to disable to the array.
# In the absence of this configmap, the default value is exclude_namespaces = ["kube-system","gatekeeper-system"]
exclude_namespaces = ["kube-system","gatekeeper-system"]
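# Illustrative example (the extra namespace name is a placeholder): to also disable stdout
# log collection for another namespace, append it to the array, e.g.
#   exclude_namespaces = ["kube-system","gatekeeper-system","my-noisy-namespace"]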
# If you want to collect logs from only selected pods inside system namespaces, add them to the following setting as namespace:controllerName of the system pod. NOTE: this setting is only for pods in system namespaces
# Valid values for system namespaces are: kube-system, azure-arc, gatekeeper-system, kube-public, kube-node-lease, calico-system. The system namespace used should not be present in exclude_namespaces
# collect_system_pod_logs = ["kube-system:coredns"]
[log_collection_settings.stderr]
# Default value for enabled is true
enabled = true
# The exclude_namespaces setting applies only if enabled is set to true
# Log collection for kube-system and gatekeeper-system is disabled by default in the absence of the 'log_collection_settings.stderr' setting. If you want to enable it for kube-system and gatekeeper-system, remove them from the following setting.
# If you want to keep kube-system and gatekeeper-system log collection disabled, leave those namespaces in the following setting and add any other namespace whose log collection you want to disable to the array.
# In the absence of this configmap, the default value is exclude_namespaces = ["kube-system","gatekeeper-system"]
exclude_namespaces = ["kube-system","gatekeeper-system"]
# If you want to collect logs from only selected pods inside system namespaces, add them to the following setting as namespace:controllerName of the system pod. NOTE: this setting is only for pods in system namespaces
# Valid values for system namespaces are: kube-system, azure-arc, gatekeeper-system, kube-public, kube-node-lease, calico-system. The system namespace used should not be present in exclude_namespaces
# collect_system_pod_logs = ["kube-system:coredns"]
[log_collection_settings.env_var]
# In the absence of this configmap, the default value for enabled is true
enabled = true
[log_collection_settings.enrich_container_logs]
# In the absence of this configmap, the default value for enrich_container_logs is false
enabled = false
# When this is enabled (enabled = true), every container log entry (both stdout & stderr) will be enriched with container Name & container Image
[log_collection_settings.collect_all_kube_events]
# In the absence of this configmap, the default value for collect_all_kube_events is false
# When the setting is set to false, only kube events with a non-Normal event type will be collected
enabled = false
# When this is enabled (enabled = true), all kube events including normal events will be collected
[log_collection_settings.schema]
# In the absence of this configmap, default value for containerlog_schema_version is "v1" if "v2" is not enabled while onboarding
# Supported values for this setting are "v1","v2"
# See documentation at https://aka.ms/ContainerLogv2 for benefits of v2 schema over v1 schema
containerlog_schema_version = "v2"
#[log_collection_settings.enable_multiline_logs]
# fluent-bit based multiline log collection for .NET, Go, Java, and Python stacktraces. Update stacktrace_languages to specify which languages to collect stacktraces for (valid inputs: "go", "java", "python", "dotnet").
# NOTE: for better performance consider enabling only for languages that are needed. Dotnet is experimental and may not work in all cases.
# If enabled, this will also stitch together container logs split by docker/cri due to size limits (16KB per log line) up to 64 KB.
# Requires ContainerLogV2 schema to be enabled. See https://aka.ms/ContainerLogv2 for more details.
# enabled = "false"
# stacktrace_languages = []
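# Illustrative example: to collect only Java and Python stacktraces (values drawn from the
# valid inputs listed above), one might uncomment the section and set:
#   enabled = "true"
#   stacktrace_languages = ["java", "python"]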
#[log_collection_settings.metadata_collection]
# kube_meta_cache_ttl_secs is a configurable option for K8s cached metadata. Default is 60s. You may adjust it in the [agent_settings.k8s_metadata_config] section below. Reference link: https://docs.fluentbit.io/manual/pipeline/filters/kubernetes#configuration-parameters
# If enabled, Kubernetes metadata will be collected for the ContainerLogV2 schema. Default is false.
# enabled = false
# If include_fields is commented out or empty, all fields will be included. If include_fields is set, only the fields listed will be included.
# include_fields = ["podLabels","podAnnotations","podUid","image","imageID","imageRepo","imageTag"]
#[log_collection_settings.filter_using_annotations]
# If enabled, logs will be excluded from pods with the annotation fluentbit.io/exclude: "true".
# Read more: https://docs.fluentbit.io/manual/pipeline/filters/kubernetes#kubernetes-annotations
# enabled = false
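# Illustrative pod metadata fragment showing the annotation this filter looks for when enabled:
#   metadata:
#     annotations:
#       fluentbit.io/exclude: "true"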
prometheus-data-collection-settings: |-
# Custom Prometheus metrics data collection settings
[prometheus_data_collection_settings.cluster]
# Cluster level scrape endpoint(s). These metrics will be scraped from the agent's ReplicaSet (singleton)
# Any errors related to prometheus scraping can be found in the KubeMonAgentEvents table in the Log Analytics workspace that the cluster is sending data to.
# Interval specifying how often to scrape metrics. This is a duration and can be specified for supported settings by combining an integer value and a time unit as a string value. Valid time units are ns, us (or µs), ms, s, m, h.
interval = "1m"
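# For example, using the time units above, valid intervals include "30s", "2m", or "1h".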
## Uncomment the following settings with valid string arrays for prometheus scraping
#fieldpass = ["metric_to_pass1", "metric_to_pass12"]
#fielddrop = ["metric_to_drop"]
# An array of urls to scrape metrics from.
# urls = ["http://myurl:9101/metrics"]
# An array of Kubernetes services to scrape metrics from.
# kubernetes_services = ["http://my-service-dns.my-namespace:9102/metrics"]
# When monitor_kubernetes_pods = true, the prometheus sidecar container will scrape Kubernetes pods based on the following prometheus annotations:
# - prometheus.io/scrape: Enable scraping for this pod
# - prometheus.io/scheme: Default is http
# - prometheus.io/path: If the metrics path is not /metrics, define it with this annotation.
# - prometheus.io/port: If port is not 9102 use this annotation
monitor_kubernetes_pods = false
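# Illustrative pod annotations (scheme/path/port values are placeholders) that the sidecar
# honors when monitor_kubernetes_pods = true:
#   metadata:
#     annotations:
#       prometheus.io/scrape: "true"
#       prometheus.io/scheme: "http"
#       prometheus.io/path: "/custom-metrics"
#       prometheus.io/port: "9102"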
## Restricts Kubernetes monitoring to namespaces for pods that have annotations set and are scraped using the monitor_kubernetes_pods setting.
## This will take effect when monitor_kubernetes_pods is set to true
## ex: monitor_kubernetes_pods_namespaces = ["default1", "default2", "default3"]
# monitor_kubernetes_pods_namespaces = ["default1"]
## Label selector to target pods which have the specified label
## This will take effect when monitor_kubernetes_pods is set to true
## Reference the docs at https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#label-selectors
# kubernetes_label_selector = "env=dev,app=nginx"
## Field selector to target pods which have the specified field
## This will take effect when monitor_kubernetes_pods is set to true
## Reference the docs at https://kubernetes.io/docs/concepts/overview/working-with-objects/field-selectors/
## e.g. To scrape pods on a specific node
# kubernetes_field_selector = "spec.nodeName=$HOSTNAME"
[prometheus_data_collection_settings.node]
# Node level scrape endpoint(s). These metrics will be scraped from the agent's DaemonSet running on every node in the cluster
# Any errors related to prometheus scraping can be found in the KubeMonAgentEvents table in the Log Analytics workspace that the cluster is sending data to.
# Interval specifying how often to scrape metrics. This is a duration and can be specified for supported settings by combining an integer value and a time unit as a string value. Valid time units are ns, us (or µs), ms, s, m, h.
interval = "1m"
## Uncomment the following settings with valid string arrays for prometheus scraping
# An array of urls to scrape metrics from. $NODE_IP (all upper case) will be substituted with the running node's IP address
# urls = ["http://$NODE_IP:9103/metrics"]
#fieldpass = ["metric_to_pass1", "metric_to_pass12"]
#fielddrop = ["metric_to_drop"]
metric_collection_settings: |-
# Metrics collection settings for metrics sent to Log Analytics and MDM
[metric_collection_settings.collect_kube_system_pv_metrics]
# In the absence of this configmap, the default value for collect_kube_system_pv_metrics is false
# When the setting is set to false, only the persistent volume metrics outside the kube-system namespace will be collected
enabled = false
# When this is enabled (enabled = true), persistent volume metrics including those in the kube-system namespace will be collected
alertable-metrics-configuration-settings: |-
# Alertable metrics configuration settings for container resource utilization
[alertable_metrics_configuration_settings.container_resource_utilization_thresholds]
# The threshold (type: float) will be rounded off to 2 decimal places
# Threshold for container cpu; the metric will be sent only when cpu utilization exceeds or equals the following percentage
container_cpu_threshold_percentage = 95.0
# Threshold for container memoryRss; the metric will be sent only when memory rss exceeds or equals the following percentage
container_memory_rss_threshold_percentage = 95.0
# Threshold for container memoryWorkingSet; the metric will be sent only when the memory working set exceeds or equals the following percentage
container_memory_working_set_threshold_percentage = 95.0
# Alertable metrics configuration settings for persistent volume utilization
[alertable_metrics_configuration_settings.pv_utilization_thresholds]
# Threshold for persistent volume usage bytes; the metric will be sent only when persistent volume utilization exceeds or equals the following percentage
pv_usage_threshold_percentage = 60.0
# Alertable metrics configuration settings for completed jobs count
[alertable_metrics_configuration_settings.job_completion_threshold]
# Threshold for completed job count; the metric will be sent only for jobs that completed more than the following threshold (in minutes) ago
job_completion_threshold_time_minutes = 360
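# For example, with the default of 360 (i.e. 6 hours), the metric covers jobs that completed more than 6 hours ago.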
integrations: |-
[integrations.azure_network_policy_manager]
collect_basic_metrics = false
collect_advanced_metrics = false
[integrations.azure_subnet_ip_usage]
enabled = false
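# Illustrative example: to turn on Azure subnet IP usage metric collection, change the setting above to
#   enabled = true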
# Doc - https://github.com/microsoft/Docker-Provider/blob/ci_prod/Documentation/AgentSettings/ReadMe.md
agent-settings: |-
# prometheus scrape fluent bit settings for high scale
# The buffer size should be greater than or equal to the chunk size; otherwise it is set to the chunk size.
# settings scoped to the prometheus sidecar container. all values are in MB
[agent_settings.prometheus_fbit_settings]
tcp_listener_chunk_size = 10
tcp_listener_buffer_size = 10
tcp_listener_mem_buf_limit = 200
# prometheus scrape fluent bit settings for high scale
# The buffer size should be greater than or equal to the chunk size; otherwise it is set to the chunk size.
# settings scoped to the daemonset container. all values are in MB
# [agent_settings.node_prometheus_fbit_settings]
# tcp_listener_chunk_size = 1
# tcp_listener_buffer_size = 1
# tcp_listener_mem_buf_limit = 10
# prometheus scrape fluent bit settings for high scale
# The buffer size should be greater than or equal to the chunk size; otherwise it is set to the chunk size.
# settings scoped to the replicaset container. all values are in MB
# [agent_settings.cluster_prometheus_fbit_settings]
# tcp_listener_chunk_size = 1
# tcp_listener_buffer_size = 1
# tcp_listener_mem_buf_limit = 10
# The following settings are "undocumented"; we don't recommend uncommenting them unless directed by Microsoft.
# They increase the maximum stdout/stderr log collection rate but will also cause higher cpu/memory usage.
## Ref for more details about Ignore_Older - https://docs.fluentbit.io/manual/v/1.7/pipeline/inputs/tail
# [agent_settings.fbit_config]
# log_flush_interval_secs = "1" # default value is 15
# tail_mem_buf_limit_megabytes = "10" # default value is 10
# tail_buf_chunksize_megabytes = "1" # default value is 32kb (comment out this line for default)
# tail_buf_maxsize_megabytes = "1" # default value is 32kb (comment out this line for default)
# tail_ignore_older = "5m" # default value same as the fluent-bit default, i.e. 0m
# On both AKS & Arc K8s environments, if the cluster is configured with a forward proxy, the proxy settings are automatically applied and used by the agent.
# In certain configurations the proxy config should be ignored, for example a cluster with AMPLS + Proxy;
# in such scenarios, use the following config to ignore proxy settings
# [agent_settings.proxy_config]
# ignore_proxy_settings = "true" # if this is not applied, default value is false
# Disables fluent-bit for perf and container inventory for Windows
#[agent_settings.windows_fluent_bit]
# disabled = "true"
# The following settings are "undocumented"; we don't recommend uncommenting them unless directed by Microsoft.
# Configuration settings for the wait time for the network listeners to be available
# [agent_settings.network_listener_waittime]
# tcp_port_25226 = 45 # Port 25226 is used for telegraf to fluent-bit data in ReplicaSet
# tcp_port_25228 = 60 # Port 25228 is used for telegraf to fluentd data
# tcp_port_25229 = 45 # Port 25229 is used for telegraf to fluent-bit data in DaemonSet
# The following settings are "undocumented"; we don't recommend uncommenting them unless directed by Microsoft.
# [agent_settings.mdsd_config]
# monitoring_max_event_rate = "50000" # default 20K eps
# backpressure_memory_threshold_in_mb = "1500" # default 3500MB
# upload_max_size_in_mb = "20" # default 2MB
# upload_frequency_seconds = "1" # default is 60 seconds
# compression_level = "0" # supported levels 0 to 9 and 0 means no compression
# The following settings are "undocumented"; we don't recommend uncommenting them unless directed by Microsoft.
# [agent_settings.telemetry_config]
# disable_telemetry = false # if this is not applied, default value is false
# The following setting is for KubernetesMetadata CacheTTL Settings.
# [agent_settings.k8s_metadata_config]
# kube_meta_cache_ttl_secs = 60 # if this is not applied, default value is 60s