hgext/serverlog/scripts/parse.py (198 lines of code) (raw):

#!/usr/bin/env python # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. # This script parses serverlog events into unified messages. import datetime import optparse import sys hgweb_paths = [ ("/json-pushes", "hgweb-json-pushes"), ("/raw-rev", "hgweb-raw-rev"), ("/rev/", "hgweb-rev"), ("/raw-file/", "hgweb-raw-file"), ("/atom-log", "hgweb-atom-log"), ("/static/", "hgweb-static"), ("/json-info", "hgweb-json-info"), ("/archive", "hgweb-archive"), ("/info/refs", "git"), ("/diff/", "hgweb-diff"), ("/annotate/", "hgweb-annotate"), ("/shortlog", "hgweb-shortlog"), ("/pushlog", "hgweb-pushlog"), ("/pushloghtml", "hgweb-pushloghtml"), ("/log/", "hgweb-log"), ("/file/", "hgweb-file"), ("/comparison/", "hgweb-comparison"), ("/filelog/", "hgweb-filelog"), ] def normalize_path(path): for search, command in hgweb_paths: try: offset = path.index(search) return path[0:offset], command except ValueError: pass return path, None class Request(object): def __init__(self, date, repo, ip, url): # The repository path that comes in is relative to the hgweb.wsgi file # for that repository. And, since some repositories in different # directories share the same base name, that means we have to consult # the URL to derive the full repo path. Joy. # Start by stripping the query string. path = url if "?" in path: path = path[0 : path.find("?")] path = path.strip("/") path, command = normalize_path(path) self.start_date = date self.repo = path self.ip = ip self.url = url self.command = command self.write_count = None self.wall_time = None self.cpu_time = None self.end_date = None class SSHSession(object): """Represents a full SSH session.""" def __init__(self, sid, date, repo, username): self.id = sid self.start_date = date self.repo = repo self.username = username self.commands = [] self.current_command = None self.wall_time = None self.cpu_time = None self.end_date = None @property def session_type(self): """Determine the type of the session. The following types exist: push - New changesets were pushed pushkey - New keys were set but no changesets were pushed pull - Changesets were retrieved lookup - Simple repository lookup. Most likely a no-op push or pull. """ commands = set([t[0] for t in self.commands]) if "unbundle" in commands: return "push" if "pushkey" in commands: return "pushkey" if "getbundle" in commands: return "pull" return "lookup" def parse_events(fh, onlydate=None): thisyear = datetime.date.today().year requests = {} sessions = {} for line in fh: date = None host = None if " hgweb: " in line: # Legacy logs: Apr 14 20:17:43 # New logs: 2016-04-14T20:17:44.250678+00:00 if line.startswith("20"): assert line[26:32] == "+00:00" date = datetime.datetime.strptime(line[0:26], "%Y-%m-%dT%H:%M:%S.%f") hostaction, line = line[33:].split(":", 1) host, action = hostaction.split() else: date = line[0:15] if date[4] == " ": date = date[0:4] + "0" + date[5:] # Year isn't in the logs and Python defaults to 1900. # This can cause a problem during leap years because strptime # will raise a "ValueError: day is out of range for month" for # Feb 29. So we add the year to the string before parsing. date = "%d %s" % (thisyear, date) date = datetime.datetime.strptime(date, "%Y %b %d %H:%M:%S") hostaction, line = line[16:].split(":", 1) host, action = hostaction.split() line = line.strip() parts = line.rstrip().split() ids, action = parts[0:2] ids = ids.split(":") if len(ids) > 1: session = ids[0] request = ids[1] else: session = None request = ids[0] if action == "BEGIN_REQUEST": if onlydate and date.date() != onlydate: continue repo, ip, url = parts[2:] requests[request] = Request(date, repo, ip, url) elif action == "BEGIN_PROTOCOL": command = parts[2] r = requests.get(request) if r: if command != "None": r.command = command elif action == "END_REQUEST": r = requests.get(request) if not r: continue wr_count, t_wall, t_cpu = parts[2:] wr_count = int(wr_count) t_wall = float(t_wall) t_cpu = float(t_cpu) r.write_count = wr_count r.wall_time = t_wall r.cpu_time = t_cpu r.end_date = date del requests[request] yield r elif action == "BEGIN_SSH_SESSION": if onlydate and date.date() != onlydate: continue repo, username = parts[2:] assert session sessions[session] = SSHSession(session, date, repo, username) elif action == "END_SSH_SESSION": s = sessions.get(session) if not s: continue t_wall, t_cpu = parts[2:] t_wall = float(t_wall) t_cpu = float(t_cpu) s.end_date = date s.wall_time = t_wall s.cpu_time = t_cpu del sessions[session] yield s elif action == "BEGIN_SSH_COMMAND": command = parts[2] s = sessions.get(session) if s: s.current_command = (request, command, date) elif action == "END_SSH_COMMAND": s = sessions.get(session) if not s: continue t_wall, t_cpu = parts[2:] t_wall = float(t_wall) t_cpu = float(t_cpu) current = s.current_command if current and current[0] == request: s.commands.append((current[1], current[2], date, t_wall, t_cpu)) # The request IDs don't match or there is no beginning event. # This is weird. We just don't record the command. s.current_command = None elif action == "CHANGEGROUPSUBSET_START": source, count = parts[2:] # count = int(count) elif action == "WRITE_PROGRESS": count = parts[2] # count = int(count) def print_stream(fh, onlydate=None): for thing in parse_events(fh, onlydate=onlydate): if isinstance(thing, Request): r = thing if r.start_date: d = r.start_date.isoformat() else: d = "None" print( "%s %s %s %s %s %s %s" % (d, r.repo, r.ip, r.command, r.write_count, r.wall_time, r.cpu_time) ) elif isinstance(thing, SSHSession): s = thing if s.start_date: d = s.start_date.isoformat() else: d = "None" print( "%s %s %s %s %s %s %s" % (d, s.id, s.repo, s.session_type, s.username, s.wall_time, s.cpu_time) ) if __name__ == "__main__": parser = optparse.OptionParser() parser.add_option("--date", help="Only extract events on the specified date") options, args = parser.parse_args() date = None if options.date: dt = datetime.datetime.strptime(options.date, "%Y-%m-%d") date = dt.date() if args: for f in args: with open(f, "rb") as fh: print_stream(fh, onlydate=date) else: print_stream(sys.stdin, onlydate=date)