pkg/exporter/probe/rdma/mellanox.go (51 lines of code) (raw):

package rdma

import (
	"sort"
	"strings"

	"github.com/alibaba/kubeskoop/pkg/exporter/probe"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/samber/lo"
)

const (
	// linkTypeMellanox is the prefix of every metric name declared in this
	// file: each metric is named "mellanox_mlx5_<counter>".
	linkTypeMellanox = "mellanox_mlx5"
)

var (
	// mlx5 maps a counter name to the help text published with the
	// corresponding metric. The key becomes the metric name suffix and the
	// value becomes the metric's Help field (see mlx5Metrics).
	//
	// NOTE(review): a few help strings contain typos (e.g. "ECN bits where").
	// They are kept as-is so the exported help text does not change.
	mlx5 = map[string]string{
		"rx_write_requests":          "The number of received WRITE requests for the associated QPs.",
		"rx_read_requests":           "The number of received READ requests for the associated QPs.",
		"rx_atomic_requests":         "The number of received ATOMIC request for the associated QPs.",
		"out_of_buffer":              "The number of drops occurred due to lack of WQE for the associated QPs.",
		"out_of_sequence":            "The number of out of sequence packets received.",
		"duplicate_request":          "Number of duplicate request packets.",
		"rnr_nak_retry_err":          "The number of received RNR NAK packets. The QP retry limit was not exceeded.",
		"packet_seq_err":             "The number of received NAK sequence error packets. The QP retry limit was not exceeded.",
		"implied_nak_seq_err":        "Number of time the requested decided an ACK with a PSN larger than the expected PSN for an RDMA read or response.",
		"local_ack_timeout_err":      "The number of times QP's ack timer expired for RC, XRC, DCT QPs at the sender side.",
		"rx_dct_connect":             "The number of received connection request for the associated DCTs.",
		"resp_local_length_error":    "The number of times responder detected local length errors.",
		"resp_cqe_error":             "The number of times responder detected CQEs completed with errors.",
		"req_cqe_error":              "The number of times requester detected CQEs completed with errors.",
		"req_remote_invalid_request": "The number of times requester detected remote invalid request errors.",
		"req_remote_access_errors":   "The number of times requester detected remote access errors.",
		"resp_remote_access_errors":  "The number of times responder detected remote access errors.",
		"resp_cqe_flush_error":       "The number of times responder detected CQEs completed with flushed errors.",
		"req_cqe_flush_error":        "The number of times requester detected CQEs completed with flushed errors.",
		"roce_adp_retrans":           "The number of adaptive retransmissions for RoCE traffic",
		"roce_adp_retrans_to":        "The number of times RoCE traffic reached timeout due to adaptive retransmission",
		"roce_slow_restart":          "The number of times RoCE slow restart was used",
		"roce_slow_restart_cnps":     "The number of times RoCE slow restart generated CNP packets",
		"roce_slow_restart_trans":    "The number of times RoCE slow restart changed state to slow restart",
		"rp_cnp_ignored":             "The number of CNP packets received and ignored by the Reaction Point HCA.",
		"rp_cnp_handled":             "The number of CNP packets handled by the Reaction Point HCA to throttle the transmission rate.",
		"np_ecn_marked_roce_packets": "The number of RoCEv2 packets received by the notification point which were marked for experiencing the congestion (ECN bits where '11' on the ingress RoCE traffic) .",
		"np_cnp_sent":                "The number of CNP packets sent by the Notification Point when it noticed congestion experienced in the RoCEv2 IP header (ECN bits).",
		"rx_icrc_encapsulated":       "The number of RoCE packets with ICRC errors.",
	}

	// mlx5Metrics holds one counter-typed metric definition per entry in mlx5.
	// Each definition is named "<linkTypeMellanox>_<counter>", carries the
	// counter's help text and uses rdmaDevPortLabels as its variable labels.
	//
	// The counter names are sorted before the definitions are built: Go map
	// iteration order is unspecified, so without the sort the slice order
	// would differ from one process start to the next.
	mlx5Metrics = func() []probe.SingleMetricsOpts {
		names := lo.Keys(mlx5)
		sort.Strings(names)

		return lo.Map(names, func(name string, _ int) probe.SingleMetricsOpts {
			return probe.SingleMetricsOpts{
				Name:           strings.Join([]string{linkTypeMellanox, name}, "_"),
				VariableLabels: rdmaDevPortLabels,
				Help:           mlx5[name],
				ValueType:      prometheus.CounterValue,
			}
		})
	}()
)