perfrunbook/utilities/sar_parse.py (182 lines of code) (raw):
#!/usr/bin/env python3
import re
import sys
import numpy as np
import pandas as pd
def parse_start_date(line):
    """Pull the log's start date out of the first line of a SAR text report.

    The date is expected right after a closing parenthesis and some
    whitespace, written as three dash-separated groups of digits
    (e.g. ``2020-03-15``).

    Args:
        line: the banner line of the SAR output.

    Returns:
        The date as the string found in the line, or None when the line
        does not contain one in that position.
    """
    found = re.match(r'''.*?\)\s+(?P<start>\d+\-\d+\-\d+).*''', line)
    return found['start'] if found else None
class ParseInterface:
    """Base state machine for extracting one section of a textual SAR log.

    Subclasses are expected to provide:

    * ``regex_hdr``  - compiled pattern matching the section's column-header line
    * ``regex_data`` - compiled pattern matching one sample row, with one named
      group per entry in ``fields``
    * ``fields``     - list of ``(group_name, converter)`` pairs; the first entry
      must be ``('time', None)`` because the time column is handled separately
    * ``parquet_name`` - file the resulting frame is written to
    """

    def __init__(self, start_date):
        """Set up parser state.

        Args:
            start_date: date of the first sample as a ``YYYY-MM-DD`` string.
        """
        self.regex_hdr = None
        self.regex_data = None
        # A line starting with "Average:" ends the section being parsed.
        self.regex_footer = re.compile(r'''Average:.*''')
        self.start = start_date
        # Timestamp of the most recent sample seen; None until the first row.
        self.last_date = None
        self.parquet_name = "sar.parquet"
        self.fields = []

    def parse_time(self, s, last_date):
        """Build a timestamp for a sample, keeping timestamps non-decreasing.

        Rows carry only a time of day, so a log that runs past midnight would
        otherwise appear to jump backwards. Whole days are added until the
        result is no earlier than ``last_date``.

        Args:
            s: dict with ``'date'`` (``YYYY-MM-DD`` string) and ``'time'``
                (``hh:mm:ss`` string).
            last_date: ``np.datetime64`` of the previous sample, or None.

        Returns:
            The sample's timestamp as an ``np.datetime64``.
        """
        stamp = np.datetime64("{} {}".format(s['date'], s['time']))
        # Compare against None explicitly rather than relying on the
        # truthiness of a datetime64 scalar.
        if last_date is not None:
            while (stamp - last_date) < np.timedelta64(0, 's'):
                stamp = stamp + np.timedelta64(1, 'D')
        return stamp

    def parse_data(self, f, save_parquet=True):
        """Read sample rows from ``f`` until the section footer or end of file.

        Lines that match neither ``regex_data`` nor the footer are skipped.

        Args:
            f: open text file positioned just after the section header.
            save_parquet: when true, write the frame to ``self.parquet_name``
                (gzip-compressed).

        Returns:
            A ``pandas.DataFrame`` indexed by ``time`` with one column per
            remaining entry in ``self.fields``.
        """
        columns = {name: [] for name, _ in self.fields}

        # readline() (not file iteration) is used so the caller can keep
        # calling readline() on the same handle afterwards.
        for line in iter(f.readline, ''):
            row = self.regex_data.match(line)
            if row:
                stamp = self.parse_time(
                    {'date': self.start, 'time': row['time']},
                    self.last_date,
                )
                columns['time'].append(stamp)
                self.last_date = stamp

                # Every other field is a plain converter applied to its group.
                for name, convert in self.fields[1:]:
                    columns[name].append(convert(row[name]))
                continue

            if self.regex_footer.match(line):
                break

        df = pd.DataFrame(columns).set_index('time')
        if save_parquet:
            df.to_parquet(self.parquet_name, compression='gzip')
        return df

    def parse_for_header(self, line, f, save_parquet=True):
        """Parse the section that follows ``line`` if ``line`` is its header.

        Args:
            line: the line most recently read from ``f``.
            f: open text file; consumed up to the section footer on a match.
            save_parquet: forwarded to :meth:`parse_data`.

        Returns:
            The parsed ``pandas.DataFrame`` when ``line`` matches
            ``regex_hdr``, otherwise None (and ``f`` is left untouched).
        """
        if self.regex_hdr.match(line):
            return self.parse_data(f, save_parquet)
        return None
class ParseIfaceUtil(ParseInterface):
    """Parser for the SAR per-interface network section (IFACE rxpck/s ...)."""

    def __init__(self, start_date, parquet=None):
        """Configure patterns, columns and output file for the IFACE section.

        Args:
            start_date: date of the first sample as a ``YYYY-MM-DD`` string.
            parquet: optional suffix inserted into the parquet file name.
        """
        # The base initializer already stores start_date and resets last_date.
        super().__init__(start_date)

        header_pattern = (
            r'''(?P<time>\d+:\d+:\d+)\s+IFACE\s+'''
            r'''rxpck/s\s+txpck/s\s+rxkB/s\s+txkB/s\s+rxcmp/s\s+txcmp/s\s+'''
            r'''rxmcst/s'''
        )
        row_pattern = (
            r'''(?P<time>\d+:\d+:\d+)\s+(?P<iface>[\d\w]+)\s+'''
            r'''(?P<rxpcks>\d+\.\d+)\s+(?P<txpcks>\d+\.\d+)\s+(?P<rxkBs>\d+\.\d+)\s+'''
            r'''(?P<txkBs>\d+\.\d+)\s+(?P<rxcmps>\d+\.\d+)\s+(?P<txcmps>\d+\.\d+)\s+'''
            r'''(?P<rxmcsts>\d+\.\d+)'''
        )
        self.regex_hdr = re.compile(header_pattern)
        self.regex_data = re.compile(row_pattern)

        numeric_columns = ('rxpcks', 'txpcks', 'rxkBs', 'txkBs',
                           'rxcmps', 'txcmps', 'rxmcsts')
        self.fields = [('time', None), ('iface', str)]
        self.fields += [(column, float) for column in numeric_columns]

        self.parquet_name = (
            "sar_iface_{}.parquet".format(parquet) if parquet
            else "sar_iface.parquet"
        )
class ParseDevUtil(ParseInterface):
    """State machine for parsing the SAR block-device utilization section
    (DEV tps rd_sec/s wr_sec/s avgrq-sz avgqu-sz await svctm %util)."""

    def __init__(self, start_date, parquet=None):
        """Configure patterns, columns and output file for the DEV section.

        Args:
            start_date: date of the first sample as a ``YYYY-MM-DD`` string.
            parquet: optional suffix inserted into the parquet file name.
        """
        # The base initializer already stores start_date and resets last_date.
        super().__init__(start_date)

        header_pattern = (
            r'''(?P<time>\d+:\d+:\d+)\s+DEV\s+tps\s+'''
            r'''rd_sec/s\s+wr_sec/s\s+avgrq\-sz\s+avgqu\-sz\s+'''
            r'''await\s+svctm\s+%util'''
        )
        row_pattern = (
            r'''(?P<time>\d+:\d+:\d+)\s+(?P<dev>[\w\d\-]+)\s+(?P<tps>\d+\.\d+)\s+'''
            r'''(?P<rdsecs>\d+\.\d+)\s+(?P<wrsecs>\d+\.\d+)\s+(?P<avgrqsz>\d+\.\d+)\s+'''
            r'''(?P<avgqusz>\d+\.\d+)\s+(?P<await>\d+\.\d+)\s+(?P<svctm>\d+\.\d+)\s+(?P<util>\d+\.\d+)'''
        )
        self.regex_hdr = re.compile(header_pattern)
        self.regex_data = re.compile(row_pattern)

        numeric_columns = ('tps', 'rdsecs', 'wrsecs', 'avgrqsz',
                           'avgqusz', 'await', 'svctm', 'util')
        self.fields = [('time', None), ('dev', str)]
        self.fields += [(column, float) for column in numeric_columns]

        self.parquet_name = (
            "sar_dev_{}.parquet".format(parquet) if parquet
            else "sar_dev.parquet"
        )
class ParseDiskUtil(ParseInterface):
    """State machine for parsing the SAR disk read/write section
    (tps rtps wtps bread/s bwrtn/s)."""

    def __init__(self, start_date, parquet=None):
        """Configure patterns, columns and output file for the disk I/O section.

        Args:
            start_date: date of the first sample as a ``YYYY-MM-DD`` string.
            parquet: optional suffix inserted into the parquet file name.
        """
        # The base initializer already stores start_date and resets last_date.
        super().__init__(start_date)

        header_pattern = (
            r'''(?P<time>\d+:\d+:\d+)\s+tps\s+rtps\s+'''
            r'''wtps\s+bread/s\s+bwrtn/s'''
        )
        row_pattern = (
            r'''(?P<time>\d+:\d+:\d+)\s+(?P<tps>\d+\.\d+)\s+'''
            r'''(?P<rtps>\d+\.\d+)\s+(?P<wtps>\d+\.\d+)\s+'''
            r'''(?P<breads>\d+\.\d+)\s+(?P<bwrtns>\d+\.\d+)'''
        )
        self.regex_hdr = re.compile(header_pattern)
        self.regex_data = re.compile(row_pattern)

        numeric_columns = ('tps', 'rtps', 'wtps', 'breads', 'bwrtns')
        self.fields = [('time', None)]
        self.fields += [(column, float) for column in numeric_columns]

        self.parquet_name = (
            "sar_disk_{}.parquet".format(parquet) if parquet
            else "sar_disk.parquet"
        )
class ParseTcpTime(ParseInterface):
    """State machine for parsing the SAR TCP section
    (active/s passive/s iseg/s oseg/s)."""

    def __init__(self, start_date, parquet=None):
        """Configure patterns, columns and output file for the TCP section.

        Args:
            start_date: date of the first sample as a ``YYYY-MM-DD`` string.
            parquet: optional suffix inserted into the parquet file name.
        """
        super().__init__(start_date)
        self.regex_hdr = re.compile(r'''(?P<time>\d+:\d+:\d+)\s+active/s\s+'''
                                    r'''passive/s\s+iseg/s\s+oseg/s''')
        self.regex_data = re.compile(r'''(?P<time>\d+:\d+:\d+)\s+(?P<active>\d+\.\d+)\s+'''
                                     r'''(?P<passive>\d+\.\d+)\s+(?P<iseg>\d+\.\d+)\s+'''
                                     r'''(?P<oseg>\d+\.\d+)''')
        # All four columns are captured as decimal numbers by regex_data, so
        # 'active' is converted with float like its siblings (it was
        # previously kept as a string, yielding a text column in the frame).
        self.fields = [('time', None), ('active', float), ('passive', float),
                       ('iseg', float), ('oseg', float)]
        self.start = start_date
        self.last_date = None
        if parquet:
            self.parquet_name = "sar_tcp_{}.parquet".format(parquet)
        else:
            self.parquet_name = "sar_tcp.parquet"
class ParseCpuTime(ParseInterface):
    """State machine for parsing the SAR CPU utilization section
    (CPU %usr %nice %sys %iowait %steal %irq %soft %guest %gnice %idle)."""

    def __init__(self, start_date, parquet=None):
        """Configure patterns, columns and output file for the CPU section.

        Args:
            start_date: date of the first sample as a ``YYYY-MM-DD`` string.
            parquet: optional suffix inserted into the parquet file name.
        """
        # The base initializer already stores start_date and resets last_date.
        super().__init__(start_date)

        header_pattern = (
            r'''(?P<time>\d+:\d+:\d+)\s+CPU\s+'''
            r'''%usr\s+%nice\s+%sys\s+%iowait\s+%steal\s+%irq\s+%soft\s+%guest\s+%gnice\s+%idle'''
        )
        row_pattern = (
            r'''(?P<time>\d+:\d+:\d+)\s+(?P<cpu>[\d\w]+)\s+'''
            r'''(?P<usr>\d+\.\d+)\s+(?P<nice>\d+\.\d+)\s+(?P<sys>\d+'''
            r'''\.\d+)\s+(?P<iowait>\d+\.\d+)\s+(?P<steal>\d+\.\d+)'''
            r'''\s+(?P<irq>\d+\.\d+)\s+(?P<soft>\d+\.\d+)\s+'''
            r'''(?P<guest>\d+\.\d+)\s+(?P<gnice>\d+\.\d+)\s+(?P<idle>\d+\.\d+)'''
        )
        self.regex_hdr = re.compile(header_pattern)
        self.regex_data = re.compile(row_pattern)

        numeric_columns = ('usr', 'nice', 'sys', 'iowait', 'steal',
                           'irq', 'soft', 'guest', 'gnice', 'idle')
        self.fields = [('time', None), ('cpu', str)]
        self.fields += [(column, float) for column in numeric_columns]

        self.parquet_name = (
            "sar_cpu_{}.parquet".format(parquet) if parquet
            else "sar_cpu.parquet"
        )
class ParseCSwitchTime(ParseInterface):
    """Parser for the SAR task-creation / context-switch section
    (proc/s cswch/s)."""

    def __init__(self, start_date, parquet=None):
        """Configure patterns, columns and output file for the cswch section.

        Args:
            start_date: date of the first sample as a ``YYYY-MM-DD`` string.
            parquet: optional suffix inserted into the parquet file name.
        """
        # The base initializer already stores start_date and resets last_date.
        super().__init__(start_date)

        header_pattern = r'''(?P<time>\d+:\d+:\d+)\s+proc/s\s+cswch/s'''
        row_pattern = r'''(?P<time>\d+:\d+:\d+)\s+(?P<proc_s>\d+\.\d+)\s+(?P<cswch_s>\d+\.\d+)'''
        self.regex_hdr = re.compile(header_pattern)
        self.regex_data = re.compile(row_pattern)

        self.fields = [('time', None)]
        self.fields += [(column, float) for column in ('proc_s', 'cswch_s')]

        self.parquet_name = (
            "sar_cswch_{}.parquet".format(parquet) if parquet
            else "sar_cswch.parquet"
        )
def parse_sysstat(file_name, suffix=None):
    """Parse a textual SAR report and write one parquet file per section.

    The first line must be the SAR banner carrying the start date. Every
    following line is offered to each section parser in turn; a parser that
    recognizes its header consumes the file up to that section's footer.

    Args:
        file_name: path of the SAR text output to read.
        suffix: optional suffix passed to each parser for its parquet name.

    Returns:
        0 on success, 1 when the first line holds no start date.
    """
    # Order matters only in that it mirrors the order the parsers are tried.
    parser_types = (
        ParseCpuTime,
        ParseDiskUtil,
        ParseDevUtil,
        ParseIfaceUtil,
        ParseTcpTime,
        ParseCSwitchTime,
    )

    with open(file_name, 'r') as f:
        start_date = parse_start_date(f.readline())
        if not start_date:
            print("ERR: header not first line of Sar file, exiting")
            return 1

        parsers = [make(start_date, parquet=suffix) for make in parser_types]

        # readline() is used throughout because the parsers advance the same
        # file handle while they consume a section.
        for line in iter(f.readline, ''):
            for parser in parsers:
                parser.parse_for_header(line, f)

    return 0
if __name__ == "__main__":
    # A missing argument gets a usage message instead of an IndexError.
    if len(sys.argv) < 2:
        sys.exit("usage: {} <sar-text-file>".format(sys.argv[0]))
    # sys.exit is used because the bare `exit` helper is injected by the
    # `site` module and is not guaranteed to exist (e.g. under `python -S`).
    sys.exit(parse_sysstat(sys.argv[1]))