agent/checkospanic/checkospanic_linux.go (247 lines of code) (raw):
package checkospanic
import (
"bufio"
"bytes"
"compress/flate"
"encoding/base64"
"fmt"
"os"
"path/filepath"
"regexp"
"strings"
"time"
"github.com/aliyun/aliyun_assist_client/agent/log"
"github.com/aliyun/aliyun_assist_client/agent/metrics"
"github.com/aliyun/aliyun_assist_client/agent/util/timetool"
"github.com/aliyun/aliyun_assist_client/common/fileutil"
"github.com/aliyun/aliyun_assist_client/thirdparty/sirupsen/logrus"
)
const (
// kdumpPath is the default directory searched for crash dumps; it is used
// unless /etc/kdump.conf supplies a `path` directive.
kdumpPath = "/var/crash"
// kdumpConfigPath is the kdump configuration file read to discover a
// non-default crash-dump directory.
kdumpConfigPath = "/etc/kdump.conf"
// vmcoreDmesgFile is the name of the kernel-log file expected inside each
// crash-dump directory.
vmcoreDmesgFile = "vmcore-dmesg.txt"
// maxLinesBeforePanicInfo is the window size handed to parsePanicInfo: the
// number of most recent lines (ending with the panic line) kept as raw content.
maxLinesBeforePanicInfo = 200
// maxLinesAfterPanicInfo caps how many lines following the panic line are
// kept as raw content.
maxLinesAfterPanicInfo = 300
)
var (
// vmcorePathRegex matches a crash-dump directory name made of a host name or
// IP address followed by a timestamp, and captures the timestamp, e.g.
// 127.0.0.1-2023-07-05-20:51:21 or <hostname>-2023-07-05-20:51:21
vmcorePathRegex = regexp.MustCompile(`^(?:[\w.-]+)-(\d{4}-\d{2}-\d{2}-\d{2}:\d{2}:\d{2})$`)
// rePanicInfoMatch skips an optional `YYYY-MM-DD HH:MM:SS.mmm+zzzz` timestamp
// and an optional `[seconds.fraction]` prefix, then captures the remainder of
// the line as the panic message.
rePanicInfoMatch = regexp.MustCompile(`(?:\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{3}[+-]\d{4})?(?:\[\s*\d+\.\d+\])?\s*([^\n]+)`)
// reRIPMatch captures the `symbol+0xOFFSET` token (the part before `/0x`)
// that follows `RIP` on a line.
reRIPMatch = regexp.MustCompile(`RIP.*?([\w-.]+\+0x\w+)/0x`)
// reLinuxCallTraceMatch recognises a call-trace entry: a
// `symbol+0xOFFSET/0x...` token optionally preceded by a bracketed timestamp,
// `<TAG> ` markers, a bracketed address and a `? ` marker.
reLinuxCallTraceMatch = regexp.MustCompile(`(\[[\d. ]+]\s+)*(<\w+> )*(\[(<\w+>)?] )?(\? )?([\w-.]+\+0x\w+)/0x`) // which comes from dmesg
// panicMsgs lists the substrings that identify a kernel-panic line; the first
// log line containing any of them is treated as the panic information.
panicMsgs = []string{
"SysRq : Crash",
"SysRq : Trigger a crash",
"SysRq : Netdump",
"general protection fault",
"double fault: ",
"divide error: ",
"stack segment: ",
"Oops: ",
"Kernel BUG at",
"kernel BUG at",
"BUG: unable to handle page fault for address",
"BUG: unable to handle kernel ",
"Unable to handle kernel paging request",
"Unable to handle kernel NULL pointer dereference",
"Kernel panic: ",
"Kernel panic - ",
// Hardware-error lines are intentionally not treated as panics (entry disabled).
//"[Hardware Error]: ",
"Bad mode in ",
}
)
// ReportLastOsPanic reports the most recent kernel panic found on this machine.
//
// It locates the newest crash-dump directory via FindLocalVmcoreDmesg, ignores
// it when the dump is older than 24 hours or its vmcore-dmesg.txt is missing,
// parses the panic message, RIP and call trace out of the file, and emits them
// (together with a compressed excerpt of the raw log) as a metrics event.
// Every failure is logged and ends the report; nothing is returned.
func ReportLastOsPanic() {
	logger := log.GetLogger().WithField("Phase", "ReportLastOsPanic")

	vmcoreDmesgPath, vmcoreDir, latestTime := FindLocalVmcoreDmesg(logger)
	if vmcoreDmesgPath == "" {
		logger.Info("there is no vmcore file need report")
		return
	}
	if time.Since(latestTime) > time.Hour*24 {
		logger.Info("the latest vmcore file is 24 hours ago, ignore it")
		return
	}
	if !fileutil.CheckFileIsExist(vmcoreDmesgPath) {
		// The separator matters: Error joins its operands like fmt.Sprint, which
		// inserts no space between two strings.
		logger.Error("vmcore dmesg file not exist: ", vmcoreDmesgPath)
		return
	}
	logger = logger.WithField("file", vmcoreDmesgPath)

	rip, callTrace, kernelPanicInfo, rawContent, err := ParseVmcore(logger, vmcoreDmesgPath)
	if err != nil {
		logger.Error("parse vmcore dmesg file failed: ", err)
		return
	}

	// Compression is best effort: on failure the event is still reported, just
	// with an empty rawContent field.
	compressedContent, err := compressFlate(rawContent)
	if err != nil {
		logger.Error("compress raw content failed: ", err)
	}

	_, _, timeZone := timetool.NowWithTimezoneName()
	metrics.GetLinuxGuestOSPanicEvent(
		"rip", rip,
		"callTrace", callTrace,
		"kernelPanicInfo", kernelPanicInfo,
		"crashTime", latestTime.Format("2006-01-02 15:04:05"),
		"crashTimeUTC", latestTime.UTC().Format("2006-01-02 15:04:05"),
		"timeZone", timeZone,
		"vmcoreDir", vmcoreDir,
		"rawContent", compressedContent,
	).ReportEvent()
	logger.Info("the latest vmcore file has reported")
}
// ParseVmcore parses the fields `Call Trace`, `RIP` and `Kernel Panic` from a
// vmcore-dmesg.txt file.
//
// Parameters:
//   - logger: receives diagnostics about I/O problems.
//   - vmcoreDmesgPath: path of the vmcore-dmesg.txt file to read.
//
// Returns:
//   - rip: the first `symbol+0xOFFSET` found on a RIP line after the panic
//     line, or "" when none is present.
//   - callTrace: the entries of the first call trace after the panic line,
//     joined with "\n".
//   - panicInfo: the panic message extracted by parsePanicInfo.
//   - rawContent: up to maxLinesBeforePanicInfo lines ending with the panic
//     line, followed by up to maxLinesAfterPanicInfo lines after it.
//   - err: non-nil when the file cannot be opened or read, or when it contains
//     no panic line; all other results are empty in that case.
func ParseVmcore(logger logrus.FieldLogger, vmcoreDmesgPath string) (rip, callTrace, panicInfo, rawContent string, err error) {
	vmcoreFile, err := os.Open(vmcoreDmesgPath)
	if err != nil {
		logger.Error("open vmcore dmesg file failed: ", err)
		return "", "", "", "", err
	}
	defer vmcoreFile.Close()

	scanner := bufio.NewScanner(vmcoreFile)
	scanner.Split(bufio.ScanLines)

	panicInfo, linesBefore := parsePanicInfo(scanner, maxLinesBeforePanicInfo)
	if panicInfo == "" {
		// Distinguish "the file could not be read" from "the file has no panic".
		if scanErr := scanner.Err(); scanErr != nil {
			return "", "", "", "", fmt.Errorf("read vmcore dmesg file: %w", scanErr)
		}
		return "", "", "", "", fmt.Errorf("panic info not found")
	}

	// Panic line found; keep scanning for the RIP and the first call trace.
	var (
		callTraceLines   []string
		linesAfter       []string
		callTraceStarted bool // a "call trace:" header has been seen
		inCallTrace      bool // currently inside the first call trace
	)
	remaining := 2 // items still to find: the call trace and the RIP
	for remaining > 0 && scanner.Scan() {
		line := scanner.Text()
		if len(linesAfter) < maxLinesAfterPanicInfo {
			linesAfter = append(linesAfter, line)
		}

		if strings.Contains(strings.ToLower(line), "call trace:") {
			if callTraceStarted {
				// A second call trace belongs to a different report; stop here.
				break
			}
			callTraceStarted = true
			inCallTrace = true
			continue
		}

		if inCallTrace {
			if reLinuxCallTraceMatch.MatchString(line) {
				callTraceLines = append(callTraceLines, line)
			} else {
				// The first non-entry line ends the call trace. Leave the state
				// so that later lines cannot count the end a second time.
				// NOTE(review): a marker that sits alone on its line (e.g.
				// "<TASK>") also ends the trace here — confirm whether such
				// lines should be skipped instead.
				inCallTrace = false
				remaining--
			}
		}

		// Only the first RIP is of interest.
		if rip == "" {
			if m := reRIPMatch.FindStringSubmatch(line); m != nil {
				rip = m[1]
				remaining--
			}
		}
	}

	// Top up the raw content with the lines following the parsed region.
	for len(linesAfter) < maxLinesAfterPanicInfo && scanner.Scan() {
		linesAfter = append(linesAfter, scanner.Text())
	}
	if scanErr := scanner.Err(); scanErr != nil {
		// The panic line was already found, so report what was collected.
		logger.Warn("read vmcore dmesg file interrupted, result may be incomplete: ", scanErr)
	}

	callTrace = strings.Join(callTraceLines, "\n")
	rawContent = strings.Join(linesBefore, "\n")
	if len(linesAfter) > 0 {
		rawContent += "\n" + strings.Join(linesAfter, "\n")
	}
	return rip, callTrace, panicInfo, rawContent, nil
}
// FindLocalVmcoreDmesg finds the latest crash-dump directory, i.e. the one
// expected to store the most recent vmcore-dmesg.txt.
//
// The dump root is taken from the `path` directive of /etc/kdump.conf and
// defaults to /var/crash. Sub-directories whose names match vmcorePathRegex
// are compared by the timestamp embedded in their name, interpreted in the
// local time zone.
//
// Returns:
//   - vmcoreDmesgPath: <dump root>/<latestDir>/vmcore-dmesg.txt; the file's
//     existence is not checked.
//   - latestDir: the name (not the full path) of the newest dump directory.
//   - latestTime: the timestamp parsed from that name.
//
// All results are zero values when the dump root is missing or unreadable, or
// when it contains no matching directory.
func FindLocalVmcoreDmesg(logger logrus.FieldLogger) (vmcoreDmesgPath, latestDir string, latestTime time.Time) {
	kdumpDir := kdumpDirFromConfig()
	if !fileutil.CheckFileIsExist(kdumpDir) {
		logger.WithField("path", kdumpDir).Warn("kdump directory not exist")
		return "", "", time.Time{}
	}

	entries, err := os.ReadDir(kdumpDir)
	if err != nil {
		logger.WithFields(logrus.Fields{
			"path": kdumpDir,
			"err":  err,
		}).Error("read kdump directory failed")
		return "", "", time.Time{}
	}

	for _, entry := range entries {
		if !entry.IsDir() {
			continue
		}
		name := entry.Name()
		// One match both filters the name and extracts the timestamp.
		items := vmcorePathRegex.FindStringSubmatch(name)
		if len(items) != 2 {
			continue
		}
		vmcoreTime, err := time.ParseInLocation("2006-01-02-15:04:05", items[1], time.Local)
		if err != nil {
			logger.WithFields(logrus.Fields{
				"name": name,
				"err":  err,
			}).Error("parse time from vmcore directory name failed")
			continue
		}
		if latestDir == "" || vmcoreTime.After(latestTime) {
			latestDir, latestTime = name, vmcoreTime
		}
	}

	if latestDir == "" {
		return "", "", time.Time{}
	}
	return filepath.Join(kdumpDir, latestDir, vmcoreDmesgFile), latestDir, latestTime
}

// kdumpDirFromConfig returns the directory named by the last `path` directive
// in /etc/kdump.conf, or the default kdumpPath when the file cannot be read or
// holds no such directive.
func kdumpDirFromConfig() string {
	dir := kdumpPath
	content, err := os.ReadFile(kdumpConfigPath)
	if err != nil {
		// A missing or unreadable configuration means the default applies.
		return dir
	}
	for _, line := range strings.Split(string(content), "\n") {
		// Drop a trailing comment so "path /dir # note" is still recognised.
		if i := strings.IndexByte(line, '#'); i >= 0 {
			line = line[:i]
		}
		// Fields tolerates leading whitespace and any blank separator (spaces
		// or tabs) between the directive and its value.
		fields := strings.Fields(line)
		if len(fields) == 2 && fields[0] == "path" {
			dir = fields[1]
		}
	}
	return dir
}
// compressFlate compresses input with the DEFLATE algorithm at the best
// compression level and returns the result encoded as standard base64.
//
// An empty input yields an empty string and a nil error. A non-nil error is
// returned when the compressor cannot be created or when writing or finishing
// the stream fails; the returned string is empty in that case.
func compressFlate(input string) (string, error) {
	if len(input) == 0 {
		return "", nil
	}

	var buf bytes.Buffer
	flateWriter, err := flate.NewWriter(&buf, flate.BestCompression)
	if err != nil {
		return "", err
	}
	if _, err := flateWriter.Write([]byte(input)); err != nil {
		return "", err
	}
	// Close has to run before the buffer is read: it flushes pending data and
	// writes the final block. Without it the stream is truncated and standard
	// decoders fail with an unexpected EOF.
	if err := flateWriter.Close(); err != nil {
		return "", err
	}
	return base64.StdEncoding.EncodeToString(buf.Bytes()), nil
}
// parsePanicInfo advances scanner to the first line containing one of
// panicMsgs.
//
// Parameters:
//   - scanner: line scanner positioned at the start of the log; on return it
//     is positioned just after the panic line, or at the end of the input when
//     no panic line exists.
//   - n: the number of most recent lines to retain. Values below 1 retain
//     nothing.
//
// Returns:
//   - panicInfo: the panic line with any leading timestamp prefix removed by
//     rePanicInfoMatch, or "" when no panic line was found.
//   - content: the last n lines read, oldest first. When a panic line was
//     found it is the final element, so at most n-1 lines precede it.
func parsePanicInfo(scanner *bufio.Scanner, n int) (panicInfo string, content []string) {
	if n < 0 {
		n = 0
	}
	// Fixed-size ring holding the most recent n lines; seen counts the lines
	// stored so far, so seen%n is always the next slot to overwrite.
	ring := make([]string, n)
	seen := 0

	found := false
	for !found && scanner.Scan() {
		line := scanner.Text()
		if n > 0 {
			ring[seen%n] = line
			seen++
		}
		for _, panicMsg := range panicMsgs {
			if !strings.Contains(line, panicMsg) {
				continue
			}
			// Extract the key information; fall back to the whole line should
			// the pattern ever fail to match.
			panicInfo = line
			if m := rePanicInfoMatch.FindStringSubmatch(line); len(m) > 1 {
				panicInfo = m[1]
			}
			found = true
			break
		}
	}

	// Unroll the ring, oldest line first.
	content = make([]string, 0, n)
	start, size := 0, seen
	if seen > n {
		start, size = seen%n, n
	}
	for i := 0; i < size; i++ {
		content = append(content, ring[(start+i)%n])
	}
	return panicInfo, content
}