pkg/exporter/probe/procnetstat/procnetstat.go (180 lines of code) (raw):

package procnetstat import ( "bufio" "context" "fmt" "io" "os" "strconv" "strings" "github.com/alibaba/kubeskoop/pkg/exporter/probe" log "github.com/sirupsen/logrus" "github.com/alibaba/kubeskoop/pkg/exporter/nettop" ) const ( probeName = "tcpext" // nolint ProtocolTCPExt = "tcpext" TCPActiveOpens = "activeopens" TCPPassiveOpens = "passiveopens" TCPRetransSegs = "retranssegs" TCPListenDrops = "listendrops" TCPListenOverflows = "listenoverflows" TCPSynRetrans = "tcpsynretrans" TCPFastRetrans = "tcpfastretrans" TCPRetransFail = "tcpretransfail" TCPTimeouts = "tcptimeouts" TCPAbortOnClose = "tcpabortonclose" TCPAbortOnMemory = "tcpabortonmemory" TCPAbortOnTimeout = "tcpabortontimeout" TCPAbortOnLinger = "tcpabortonlinger" TCPAbortOnData = "tcpabortondata" TCPAbortFailed = "tcpabortfailed" TCPACKSkippedSynRecv = "tcpackskippedsynrecv" TCPACKSkippedPAWS = "tcpackskippedpaws" TCPACKSkippedSeq = "tcpackskippedseq" TCPACKSkippedFinWait2 = "tcpackskippedfinwait2" TCPACKSkippedTimeWait = "tcpackskippedtimewait" TCPACKSkippedChallenge = "tcpackskippedchallenge" TCPRcvQDrop = "tcprcvqdrop" PAWSActive = "pawsactive" PAWSEstab = "pawsestab" EmbryonicRsts = "embryonicrsts" TCPWinProbe = "tcpwinprobe" TCPKeepAlive = "tcpkeepalive" TCPMTUPFail = "tcpmtupfail" TCPMTUPSuccess = "tcpmtupsuccess" TCPZeroWindowDrop = "tcpzerowindowdrop" TCPBacklogDrop = "tcpbacklogdrop" PFMemallocDrop = "pfmemallocdrop" TCPWqueueTooBig = "tcpwqueuetoobig" TCPMemoryPressures = "tcpmemorypressures" TCPMemoryPressuresChrono = "tcpmemorypressureschrono" ) var ( TCPExtMetrics = []probe.LegacyMetric{ {Name: TCPListenDrops, Help: "The total number of TCP connection requests that were dropped because the listen queue was full."}, {Name: TCPListenOverflows, Help: "The total number of times the TCP listen queue has overflown."}, {Name: TCPSynRetrans, Help: "The total number of SYN packets that were retransmitted."}, {Name: TCPFastRetrans, Help: "The total number of fast retransmissions made by TCP."}, {Name: TCPRetransFail, Help: "The total number of failed retransmissions in TCP."}, {Name: TCPTimeouts, Help: "The total number of TCP timeouts."}, {Name: TCPAbortOnClose, Help: "The number of TCP connections that were aborted on close."}, {Name: TCPAbortOnMemory, Help: "The number of TCP connections that were aborted due to memory allocation failures."}, {Name: TCPAbortOnTimeout, Help: "The number of TCP connections that were aborted due to timeouts."}, {Name: TCPAbortOnLinger, Help: "The number of TCP connections that were aborted due to linger timeouts."}, {Name: TCPAbortOnData, Help: "The number of TCP connections that were aborted due to data-related issues."}, {Name: TCPAbortFailed, Help: "The number of attempts to abort TCP connections that failed."}, {Name: TCPACKSkippedSynRecv, Help: "The number of ACKs skipped while in SYN_RECV state."}, {Name: TCPACKSkippedPAWS, Help: "The number of ACKs skipped due to PAWS (Protection Against Wrapped Sequence numbers)."}, {Name: TCPACKSkippedSeq, Help: "The number of ACKs skipped due to sequence number issues."}, {Name: TCPACKSkippedFinWait2, Help: "The number of ACKs skipped while in FIN_WAIT_2 state."}, {Name: TCPACKSkippedTimeWait, Help: "The number of ACKs skipped while in TIME_WAIT state."}, {Name: TCPACKSkippedChallenge, Help: "The number of ACKs skipped due to challenges in the communication."}, {Name: TCPRcvQDrop, Help: "The total number of received packets that were dropped due to queue overflow."}, {Name: TCPMemoryPressures, Help: "The total number of occasions where the TCP stack experienced memory pressure."}, {Name: TCPMemoryPressuresChrono, Help: "Chronological count of TCP memory pressure events."}, {Name: PAWSActive, Help: "Indicates whether the PAWS mechanism is active."}, {Name: PAWSEstab, Help: "The number of established connections utilizing PAWS."}, {Name: EmbryonicRsts, Help: "The number of embryonic (half-open) connections that were reset."}, {Name: TCPWinProbe, Help: "The total number of window probes sent to check for window size."}, {Name: TCPKeepAlive, Help: "The total number of TCP keepalive packets sent."}, {Name: TCPMTUPFail, Help: "The total number of MTU (Maximum Transmission Unit) probe failures."}, {Name: TCPMTUPSuccess, Help: "The total number of successful MTU (Maximum Transmission Unit) discoveries."}, {Name: TCPZeroWindowDrop, Help: "The total number of packets dropped due to a zero window condition."}, {Name: TCPBacklogDrop, Help: "The total number of packets dropped from the TCP backlog queue."}, {Name: PFMemallocDrop, Help: "The total number of packets dropped due to PF_MEMALLOC allocations failing."}, {Name: TCPWqueueTooBig, Help: "The total number of TCP send queue drops due to the queue being too large."}, } ) func init() { probe.MustRegisterMetricsProbe(probeName, netdevProbeCreator) } func netdevProbeCreator() (probe.MetricsProbe, error) { p := &ProcNetstat{} batchMetrics := probe.NewLegacyBatchMetrics(probeName, TCPExtMetrics, p.CollectOnce) return probe.NewMetricsProbe(probeName, p, batchMetrics), nil } type ProcNetstat struct { } func (s *ProcNetstat) Start(_ context.Context) error { return nil } func (s *ProcNetstat) Stop(_ context.Context) error { return nil } func (s *ProcNetstat) CollectOnce() (map[string]map[uint32]uint64, error) { ets := nettop.GetAllUniqueNetnsEntity() if len(ets) == 0 { log.Errorf("%s error, no entity found", probeName) } return collect(ets) } func collect(nslist []*nettop.Entity) (map[string]map[uint32]uint64, error) { resMap := make(map[string]map[uint32]uint64) for _, stat := range TCPExtMetrics { resMap[stat.Name] = make(map[uint32]uint64) } for _, et := range nslist { stats, err := getNetstatByPid(uint32(et.GetPid())) if err != nil { log.Errorf("%s failed collect pid %d, err: %v", probeName, et.GetPid(), err) continue } extstats := stats[ProtocolTCPExt] for _, stat := range TCPExtMetrics { if _, ok := extstats[stat.Name]; ok { data, err := strconv.ParseUint(extstats[stat.Name], 10, 64) if err != nil { log.Errorf("%s failed parse stat %s, pid: %d err: %v", probeName, stat, et.GetPid(), err) continue } resMap[stat.Name][uint32(et.GetNetns())] += data } } } return resMap, nil } func getNetstatByPid(pid uint32) (map[string]map[string]string, error) { resMap := make(map[string]map[string]string) netstatpath := fmt.Sprintf("/proc/%d/net/netstat", pid) if _, err := os.Stat(netstatpath); os.IsNotExist(err) { return resMap, err } netStats, err := getNetStats(netstatpath) if err != nil { return resMap, err } for k, v := range netStats { resMap[k] = v } return resMap, nil } func getNetStats(fileName string) (map[string]map[string]string, error) { file, err := os.Open(fileName) if err != nil { return nil, err } defer file.Close() return parseNetStats(file, fileName) } func parseNetStats(r io.Reader, fileName string) (map[string]map[string]string, error) { var ( netStats = map[string]map[string]string{} scanner = bufio.NewScanner(r) ) for scanner.Scan() { nameParts := strings.Split(scanner.Text(), " ") scanner.Scan() valueParts := strings.Split(scanner.Text(), " ") // Remove trailing :. protocol := strings.ToLower(nameParts[0][:len(nameParts[0])-1]) netStats[protocol] = map[string]string{} if len(nameParts) != len(valueParts) { return nil, fmt.Errorf("mismatch field count mismatch in %s: %s", fileName, protocol) } for i := 1; i < len(nameParts); i++ { netStats[protocol][strings.ToLower(nameParts[i])] = valueParts[i] } } return netStats, scanner.Err() }