# clusterloader2/testing/load/modules/network-policy/net-policy-metrics.yaml (111 lines of code) (raw)

# Network policy metrics module.
#
# Emits a single step whose measurements run GenericPrometheusQuery against
# the network policy enforcement latency metrics and, optionally, the Cilium
# agent metrics that relate to network policies.
#
# Module parameters:
#   action                   - forwarded to every measurement and used to build
#                              the step name. Valid actions: "start", "gather"
#   usePolicyCreationMetrics - include the PolicyCreation queries
#                              (default: true).
#   usePodCreationMetrics    - include the PodCreation and PodIpAssignedLatency
#                              queries (default: true).
#   useCiliumMetrics         - include the NetworkPolicyMetrics measurement
#                              (default: true).
{{$action := .action}}
{{$usePolicyCreationMetrics := DefaultParam .usePolicyCreationMetrics true}}
{{$usePodCreationMetrics := DefaultParam .usePodCreationMetrics true}}
{{$useCiliumMetrics := DefaultParam .useCiliumMetrics true}}

# CL2 params
# Negative default values are used to turn thresholds off if not overridden.
# Thresholds are only enabled with values of zero or higher.
{{$NP_ENFORCE_POLICY_CREATION_99_THRESHOLD_SECONDS := DefaultParam .CL2_NP_ENFORCE_POLICY_CREATION_99_THRESHOLD_SECONDS -1}}
{{$NP_ENFORCE_POD_CREATION_99_THRESHOLD_SECONDS := DefaultParam .CL2_NP_ENFORCE_POD_CREATION_99_THRESHOLD_SECONDS -1}}
{{$NP_ENFORCE_POD_IP_ASSIGNED_99_THRESHOLD_SECONDS := DefaultParam .CL2_NP_ENFORCE_POD_IP_ASSIGNED_99_THRESHOLD_SECONDS -1}}
# The two Cilium error thresholds below default to non-negative values and are
# always emitted (they are not wrapped in a "ge ... 0" guard).
{{$CILIUM_POLICY_IMPORTS_ERROR_THRESHOLD := DefaultParam .CL2_CILIUM_POLICY_IMPORTS_ERROR_THRESHOLD 0}}
{{$CILIUM_ENDPOINT_REGEN_FAIL_PERC_THRESHOLD := DefaultParam .CL2_CILIUM_ENDPOINT_REGEN_FAIL_PERC_THRESHOLD 0.01}}
{{$CILIUM_POLICY_REGEN_TIME_99_THRESHOLD := DefaultParam .CL2_CILIUM_POLICY_REGEN_TIME_99_THRESHOLD -1}}
{{$CILIUM_ENDPOINT_REGEN_TIME_99_THRESHOLD := DefaultParam .CL2_CILIUM_ENDPOINT_REGEN_TIME_99_THRESHOLD -1}}

steps:
# Renders as "starting ..." / "gathering ..." for the two valid actions.
- name: "{{$action}}ing network policy metrics"
  measurements:
  - Identifier: NetworkPolicyEnforcementLatency
    Method: GenericPrometheusQuery
    Params:
      action: {{$action}}
      metricName: "Network Policy Enforcement Latency"
      metricVersion: v1
      unit: s
      queries:
      # Network policy enforcement metrics gathered from the test clients.
      # NOTE(review): the "[%v]" range in the rate() queries below is kept
      # verbatim; presumably the measurement substitutes the query window
      # into it - confirm against GenericPrometheusQuery.
      {{if $usePolicyCreationMetrics}}
      - name: PolicyCreation - TargetCount
        query: sum(policy_enforcement_latency_policy_creation_seconds_count)
      - name: PolicyCreation - Perc50
        query: histogram_quantile(0.5, sum(policy_enforcement_latency_policy_creation_seconds_bucket) by (le))
      - name: PolicyCreation - Perc90
        query: histogram_quantile(0.9, sum(policy_enforcement_latency_policy_creation_seconds_bucket) by (le))
      - name: PolicyCreation - Perc95
        query: histogram_quantile(0.95, sum(policy_enforcement_latency_policy_creation_seconds_bucket) by (le))
      - name: PolicyCreation - Perc99
        query: histogram_quantile(0.99, sum(policy_enforcement_latency_policy_creation_seconds_bucket) by (le))
        # Only the 99th percentile carries a threshold, and only when the
        # override is zero or higher.
        {{if ge $NP_ENFORCE_POLICY_CREATION_99_THRESHOLD_SECONDS 0}}
        threshold: {{$NP_ENFORCE_POLICY_CREATION_99_THRESHOLD_SECONDS}}
        {{end}}
      {{end}}
      {{if $usePodCreationMetrics}}
      - name: PodCreation - TargetCount
        query: sum(pod_creation_reachability_latency_seconds_count)
      - name: PodCreation - Perc50
        query: histogram_quantile(0.5, sum(rate(pod_creation_reachability_latency_seconds_bucket[%v])) by (le))
      - name: PodCreation - Perc90
        query: histogram_quantile(0.9, sum(rate(pod_creation_reachability_latency_seconds_bucket[%v])) by (le))
      - name: PodCreation - Perc95
        query: histogram_quantile(0.95, sum(rate(pod_creation_reachability_latency_seconds_bucket[%v])) by (le))
      - name: PodCreation - Perc99
        query: histogram_quantile(0.99, sum(rate(pod_creation_reachability_latency_seconds_bucket[%v])) by (le))
        {{if ge $NP_ENFORCE_POD_CREATION_99_THRESHOLD_SECONDS 0}}
        threshold: {{$NP_ENFORCE_POD_CREATION_99_THRESHOLD_SECONDS}}
        {{end}}
      - name: PodIpAssignedLatency - TargetCount
        query: sum(pod_ip_address_assigned_latency_seconds_count)
      - name: PodIpAssignedLatency - Perc50
        query: histogram_quantile(0.50, sum(rate(pod_ip_address_assigned_latency_seconds_bucket[%v])) by (le))
      - name: PodIpAssignedLatency - Perc90
        query: histogram_quantile(0.90, sum(rate(pod_ip_address_assigned_latency_seconds_bucket[%v])) by (le))
      - name: PodIpAssignedLatency - Perc95
        query: histogram_quantile(0.95, sum(rate(pod_ip_address_assigned_latency_seconds_bucket[%v])) by (le))
      - name: PodIpAssignedLatency - Perc99
        query: histogram_quantile(0.99, sum(rate(pod_ip_address_assigned_latency_seconds_bucket[%v])) by (le))
        {{if ge $NP_ENFORCE_POD_IP_ASSIGNED_99_THRESHOLD_SECONDS 0}}
        threshold: {{$NP_ENFORCE_POD_IP_ASSIGNED_99_THRESHOLD_SECONDS}}
        {{end}}
      {{end}}
  {{if $useCiliumMetrics}}
  - Identifier: NetworkPolicyMetrics
    Method: GenericPrometheusQuery
    Params:
      action: {{$action}}
      metricName: "Network Policy Performance"
      metricVersion: v1
      unit: s
      queries:
      # Cilium agent metrics that are related to network policies.
      - name: Number of times a policy import has failed
        # To be replaced with the new Cilium metric that counts all policy changes, not just import errors.
        # With that, this can be a percentage of failed imports.
        # https://github.com/cilium/cilium/pull/23349
        query: sum(cilium_policy_import_errors_total)
        threshold: {{$CILIUM_POLICY_IMPORTS_ERROR_THRESHOLD}}
      - name: Failed endpoint regenerations percentage
        query: sum(cilium_endpoint_regenerations_total{outcome="fail"}) / sum(cilium_endpoint_regenerations_total) * 100
        threshold: {{$CILIUM_ENDPOINT_REGEN_FAIL_PERC_THRESHOLD}}
      - name: Policy regeneration time - Perc50
        query: histogram_quantile(0.50, sum(cilium_policy_regeneration_time_stats_seconds_bucket{scope="total"}) by (le))
      - name: Policy regeneration time - Perc99
        query: histogram_quantile(0.99, sum(cilium_policy_regeneration_time_stats_seconds_bucket{scope="total"}) by (le))
        {{if ge $CILIUM_POLICY_REGEN_TIME_99_THRESHOLD 0}}
        threshold: {{$CILIUM_POLICY_REGEN_TIME_99_THRESHOLD}}
        {{end}}
      - name: Time between a policy change and it being fully deployed into the datapath - Perc50
        query: histogram_quantile(0.50, sum(cilium_policy_implementation_delay_bucket) by (le))
      - name: Time between a policy change and it being fully deployed into the datapath - Perc99
        query: histogram_quantile(0.99, sum(cilium_policy_implementation_delay_bucket) by (le))
      - name: Latency of policy update trigger - Perc50
        query: histogram_quantile(0.50, sum(cilium_triggers_policy_update_call_duration_seconds_bucket{type="latency"}) by (le))
      - name: Latency of policy update trigger - Perc99
        query: histogram_quantile(0.99, sum(cilium_triggers_policy_update_call_duration_seconds_bucket{type="latency"}) by (le))
      - name: Duration of policy update trigger - Perc50
        query: histogram_quantile(0.50, sum(cilium_triggers_policy_update_call_duration_seconds_bucket{type="duration"}) by (le))
      - name: Duration of policy update trigger - Perc99
        query: histogram_quantile(0.99, sum(cilium_triggers_policy_update_call_duration_seconds_bucket{type="duration"}) by (le))
      - name: Endpoint regeneration latency - Perc50
        query: histogram_quantile(0.50, sum(cilium_endpoint_regeneration_time_stats_seconds_bucket{scope="total"}) by (le))
      - name: Endpoint regeneration latency - Perc99
        query: histogram_quantile(0.99, sum(cilium_endpoint_regeneration_time_stats_seconds_bucket{scope="total"}) by (le))
        {{if ge $CILIUM_ENDPOINT_REGEN_TIME_99_THRESHOLD 0}}
        threshold: {{$CILIUM_ENDPOINT_REGEN_TIME_99_THRESHOLD}}
        {{end}}
      - name: Number of policies currently loaded
        query: avg(cilium_policy)
      - name: Number of endpoints labeled by policy enforcement status
        query: sum(cilium_policy_endpoint_enforcement_status)
  {{end}}