tools/mboxhdr2csv.rb

#!/usr/bin/env ruby
# Analyze mbox files (downloaded by PonyAPI) for general statistics into CSV
# - Per list messages per month over time (PMOT)
#   - Count messages group by list -> graph months as time
# - Per list contentlines per lists PMOT
# - Per user statistics
# Count lines of text content in mail body, roughly attempting to
#   count just new content (not automated, not > replies)
# Attempt to normalize/map email addresses to committer/member status
$LOAD_PATH.unshift '/srv/whimsy/lib'
require 'whimsy/asf'
require 'mail'
require 'csv'
require 'stringio'
require 'zlib'
require 'json'
require 'time'
require 'optparse'

# Various utility functions/data for mailing list analysis
module MailUtils
  extend self
  MEMBER = 'member'
  COMMITTER = 'committer'
  COUNSEL = 'counsel'
  INVALID = '.INVALID'
  DATE = 'date'
  FROM = 'from'
  WHO = 'who'
  AVAILID = 'id'
  SUBJECT = 'subject'
  TOOLS = 'tools'
  MAILS = 'mails'
  TOOLCOUNT = 'toolcount'
  MAILCOUNT = 'mailcount'

  # Subject regexes that are non-discussion oriented
  # Analysis: don't bother with content lines in these messages,
  #   because most of the content is tool-generated
  # Keyed by List-Id header value, then by a symbol naming the kind of message
  NONDISCUSSION_SUBJECTS = { # Note: none applicable to members@
    '<board.apache.org>' => {
      missing: /\AMissing\s((\S+\s){1,3})Board/, # whimsy/www/board/agenda/views/buttons/email.js.rb
      feedback: /\ABoard\sfeedback\son\s20/, # whimsy/www/board/agenda/views/actions/feedback.json.rb
      notice: /\A\[NOTICE\]/i,
      report: /\A\[REPORT\]/i,
      resolution: /\A\[RESOLUTION\]/i,
      svn_agenda: %r{\Aboard: r\d{4,8} - /foundation/board/},
      svn_iclas: %r{\Aboard: r\d{4,8} - /foundation/officers/iclas.txt}
    },
    '<operations.apache.org>' => {
      notice: /\A\[NOTICE\]/i,
      report: /\A\[REPORT\]/i,
      svn_general: %r{\Asvn commit: r/},
      svn_bills: %r{\Abills: r\d{4,8} -}
    },
    '<trademarks.apache.org>' => {
      report: /\A\[REPORT\]/i,
      svn_general: %r{\Asvn commit: r/}
    },
    '<fundraising.apache.org>' => {
      report: /\A\[REPORT\]/i,
      svn_bills: %r{\Abills: r\d{4,8} -}
    }
  }

  # Micro-optimization table for frequent senders with (effectively) unique names.
  # Each row is [pattern matched against the From value, who, committer, id].
  # Order matters: the first matching row wins.
  KNOWN_SENDERS = [
    [/Mark.Radcliffe/i, 'Mark.Radcliffe', COUNSEL, 'markfradcliffe'],
    [/mattmann/i, 'Chris Mattmann', MEMBER, 'mattmann'],
    [/jagielski/i, 'Jim Jagielski', MEMBER, 'jim'],
    [/delacretaz/i, 'Bertrand Delacretaz', MEMBER, 'bdelacretaz'],
    [/curcuru/i, 'Shane Curcuru', MEMBER, 'curcuru'],
    [/steitz/i, 'Phil Steitz', MEMBER, 'psteitz'],
    [/gardler/i, 'Ross Gardler', MEMBER, 'rgardler'], # Effectively unique (see: Heidi)
    [/Craig (L )?Russell/i, 'Craig L Russell', MEMBER, 'clr'], # Optimize since Secretary sends a lot of mail
    [/McGrail/i, 'Kevin A. McGrail', MEMBER, 'kmcgrail'],
    [/sallykhudairi@yahoo/i, 'Sally Khudairi', MEMBER, 'sk'],
    [/sk@haloworldwide.com/i, 'Sally Khudairi', MEMBER, 'sk']
  ].freeze

  # Parse an email address, falling back to a simple 'Name <addr>' split
  # when Mail::Address rejects the input.
  # @see www/secretary/workbench/models/message.rb
  # @see https://github.com/mikel/mail/issues/39
  # @param addr String to parse
  # @return Mail::Address
  # Re-raises the original parse error if the fallback patterns do not match either
  def liberal_email_parser(addr)
    Mail::Address.new(addr)
  rescue StandardError
    # Try the quoted display name form first, then the unquoted form
    match = /^"([^"]*)" <(.*)>$/.match(addr) || /^([^"]*) <(.*)>$/.match(addr)
    raise unless match # bare raise re-raises the Mail::Address error

    parsed = Mail::Address.new
    parsed.address = match[2]
    parsed.display_name = match[1]
    parsed
  end

  # Annotate mailhash by adding :who and :committer (where known)
  # @param mdata Hash to evaluate and annotate; reads mdata[:from]
  # Side effect: adds :who, :committer, :id from ASF::Person.find_by_email
  #   :committer = 'n' if not found; 'N' if error, 'counsel' for special case
  def find_who_from(mdata)
    # Remove bogus INVALID before doing lookups
    # to_s: callers may pass nil (header missing) or a Mail field object
    from = mdata[:from].to_s.sub(INVALID, '')

    # Micro-optimize unique names
    known = KNOWN_SENDERS.find { |rx, *| from =~ rx }
    if known
      _rx, mdata[:who], mdata[:committer], mdata[:id] = known
      return
    end

    begin
      # TODO use Real Name (JIRA) to attempt to lookup some notifications
      tmp = liberal_email_parser(from)
      person = ASF::Person.find_by_email(tmp.address.dup)
      if person
        mdata[:who] = person.cn
        mdata[:id] = person.id
        mdata[:committer] = person.asf_member? ? MEMBER : COMMITTER
      else
        mdata[:who] = "#{tmp.display_name} <#{tmp.address}>"
        mdata[:committer] = 'n'
        mdata[:id] = 'unknown'
      end
    rescue
      mdata[:who] = mdata[:from] # Use original value here
      mdata[:committer] = 'N'
      mdata[:id] = 'unknown'
    end
  end

  # Get {MAILS: [{date, who, subject, flag},...], TOOLS: [{...},...] } from the specified list for a month
  # May cache data in mailroot/yearmonth.json
  # @param mailroot directory holding one subdirectory of message files per yearmonth
  # @param yearmonth String naming the month subdirectory (compared against '%Y%m')
  # @param nondiscuss Hash of type => subject regex marking tool-generated mails (may be nil)
  # @return Hash with MAILS, TOOLS, TOOLCOUNT, MAILCOUNT keys;
  #   empty hash if no files are found for the month
  def get_mails_month(mailroot:, yearmonth:, nondiscuss:)
    # Return cached calculated data if present
    cache_json = File.join(mailroot, "#{yearmonth}.json")
    if File.file?(cache_json)
      begin
        return JSON.parse(File.read(cache_json))
      rescue StandardError => _e
        # No-op: fall through to attempt to re-create cache
      end
    end
    emails = {}
    files = Dir[File.join(mailroot, yearmonth, '*')]
    return emails if files.empty?

    emails[MAILS] = []
    emails[TOOLS] = []
    files.each do |email|
      next if email.end_with? '/index'

      message = IO.read(email, mode: 'rb')
      data = {}
      data[DATE] = Time.parse(message[/^Date: (.*)/, 1]).iso8601
      data[FROM] = message[/^From: (.*)/, 1]
      # Originally (before 2265343) the local method #find_who_from expected an email address and returned who, committer
      # Emulate this with the version from MailUtils which expects and updates a hash
      temp = {from: data[FROM]} # pass a hash
      MailUtils.find_who_from(temp) # update the hash
      # pick out the bits we want
      data[WHO], data[COMMITTER], data[AVAILID] = temp[:who], temp[:committer], temp[:id]
      data[SUBJECT] = message[/^Subject: (.*)/, 1]
      if nondiscuss
        nondiscuss.each do |typ, rx|
          if data[SUBJECT] =~ rx
            data[TOOLS] = typ
            break # regex.each
          end
        end
      end
      data.has_key?(TOOLS) ? emails[TOOLS] << data : emails[MAILS] << data
    end
    # Provide as sorted data for ease of use
    emails[TOOLS].sort_by! { |email| email[DATE] }
    tool_counts = emails[TOOLS].each_with_object(Hash.new(0)) { |mail, counts| counts[mail[TOOLS]] += 1 }
    emails[TOOLCOUNT] = tool_counts.sort_by { |_k, v| -v }.to_h

    emails[MAILS].sort_by! { |email| email[DATE] }
    mail_counts = emails[MAILS].each_with_object(Hash.new(0)) do |mail, counts|
      counts["#{mail[WHO]} (#{mail[AVAILID]})"] += 1
    end
    emails[MAILCOUNT] = mail_counts.sort_by { |_k, v| -v }.to_h

    # If yearmonth is before current month, then write out yearmonth.json as cache
    if yearmonth < Date.today.strftime('%Y%m')
      begin
        File.open(cache_json, 'w') do |f|
          f.puts JSON.pretty_generate(emails)
        end
      rescue
        # No-op, just don't cache for now
      end
    end
    return emails
  end
end

# Read mbox files into per-message statistics, and roll those up into a CSV
module MboxUtils
  extend self
  MBOX_EXT = '.mbox'
  VERSION = 'mboxhdr2json'
  URIRX = URI.regexp(['http', 'https'])
  # Column titles of the overview CSV, and the mailhash keys that fill them (same order)
  CSV_HEADERS = %w( year month day weekday hour zone listid who subject lines links committer messageid inreplyto ).freeze
  CSV_FIELDS = %w( y m d w h z listid who subject lines links committer messageid inreplyto ).freeze

  # Read a ponyapi.rb mbox file and return mails (text content only)
  # @param f path to .mbox or .mbox.gz
  # @return [mail1, mail2, ...]
  def read_mbox(f)
    mbox =
      if f.end_with?('.gz')
        # Decompress straight from the file; the block form closes the reader
        Zlib::GzipReader.open(f, &:read)
      else
        File.read(f)
      end
    mbox.force_encoding Encoding::ASCII_8BIT
    messages = mbox.split(/^From .*/)
    messages.shift # Drop first item (not a message)
    return messages
  end

  # Process an mbox file into mailhash of selected headers and lines of text
  # @param f path to .mbox or .mbox.gz
  # @return [mail1hash, mail2hash, ...], [ [parseerr, order], ...]
  # @return nil, error if mbox file can't be read
  # mailhash contains :from, :subject, :listid, :date, :messageid,
  #   :inreplyto, :lines (count), :links (count), plus :who, :committer and :id
  def mbox2stats(f)
    begin
      mails = read_mbox(f)
    rescue => e
      return nil, e
    end
    errs = []
    messages = []
    order = 0
    mails.each do |message|
      mdata = {}
      mail = nil
      begin
        # Preserve message order in case it's important
        order += 1
        # Enforce linefeeds; makes Mail happy; borks binary attachments (not used in this script)
        mail = Mail.read_from_string(message.gsub(/\r?\n/, "\r\n"))
        mdata[:order] = order
        begin # HACK for cases where some values don't parse, try to get good enough values in rescue
          mdata[:from] = mail[:from].value
          mdata[:subject] = mail[:subject].value
          mdata[:listid] = mail[:List_Id].value
          mdata[:date] = mail.date.to_s
        rescue => _e
          mdata[:from] = mail[:from]
          mdata[:subject] = mail[:subject]
          mdata[:listid] = mail[:List_Id]
          mdata[:date] = mail.date.to_s
          mdata[:parseerr] = mail.errors
        end
        mdata[:messageid] = mail.message_id
        mdata[:inreplyto] = mail.in_reply_to
        if mail.multipart?
          text_part = mail.text_part.decoded.split(/\r?\n/)
        else
          text_part = mail.body.decoded.split(/\r?\n/)
        end
        mdata[:lines], mdata[:links] = count_content_lines(text_part)

        # Annotate various other precomputable data
        MailUtils.find_who_from(mdata)
        begin
          d = Time.parse(mdata[:date])
          mdata[:y] = d.year
          mdata[:m] = d.month
          mdata[:d] = d.day
          mdata[:w] = d.wday
          mdata[:h] = d.hour
          mdata[:z] = d.zone
        rescue => date_err
          # no-op - not critical
          puts "DEBUG: #{date_err.message} parsing: #{mdata[:date]}"
        end
        regex = MailUtils::NONDISCUSSION_SUBJECTS[mdata[:listid]] # Use subject regex for this list (if any)
        if regex
          regex.each do |typ, rx|
            if mdata[:subject] =~ rx
              mdata[:nondiscuss] = typ
              break # regex.each
            end
          end
        end
        # Push our hash
        messages << mdata
      rescue => e
        # Use the counter: mdata[:order] is not yet set if the message failed to parse
        errs << [e, order]
      end
    end
    return messages, errs
  end

  # Scan dir tree for mboxes and output individual mailhash as JSONs
  # @param dir to scan (whole tree)
  # @param ext file extension to glob for
  # Side effect: writes out f.chomp(ext).json files
  # @note writes string VERSION for differentiating from other *.json
  def scan_dir_mbox2stats(dir, ext = MBOX_EXT)
    Dir[File.join(dir, '**', "*#{ext}")].sort.each do |f|
      mails, errs = mbox2stats(f)
      File.open("#{f.chomp(ext)}.json", 'w') do |fout|
        fout.puts JSON.pretty_generate([VERSION, mails, errs])
      end
    end
  end

  # Scan dir tree for mailhash JSONs and output an overview CSV of all
  # @param dir to scan (whole tree)
  # @param outname file name of the CSV, created inside dir
  # @param ext file extension to glob for
  # @return [ error1, error2, ...] if any errors (empty array otherwise)
  # @raise ArgumentError if no JSON file carrying the VERSION marker is found
  # Side effect: writes out dir/outname CSV file
  # @note reads string VERSION for differentiating from other *.json
  def scan_dir_stats2csv(dir, outname, ext = '.json')
    errors = []
    jzons = [] # [ [filename, [mails, errs]], ...] - filename kept for error reporting
    Dir[File.join(dir, '**', "*#{ext}")].sort.each do |f|
      begin
        tmp = JSON.parse(File.read(f))
        if tmp[0].kind_of?(String) && tmp[0].start_with?(VERSION)
          jzons << [f, tmp.drop(1)]
        end
      rescue => e
        puts "ERROR: parse of #{f} raised #{e.message[0..255]}"
        errors << "#{e.message}\n\t#{e.backtrace.join("\n\t")}"
        next
      end
    end
    raise ArgumentError, "#{__method__} called with no valid mbox json files in #{dir}" if jzons.length == 0

    puts "#{__method__} processing #{jzons.length} mbox json files"
    # Write out headers once, then one row per mail of every file
    csvfile = File.join(dir, outname)
    CSV.open(csvfile, 'w', headers: CSV_HEADERS, write_headers: true) do |csv|
      jzons.each do |f, j|
        begin
          # j[0] is the mail list; it is nil when the mbox could not be read
          j[0].each do |m|
            csv << CSV_FIELDS.map { |key| m[key] }
          end
        rescue => e
          puts "ERROR: write of #{f} raised #{e.message[0..255]}"
          errors << "#{e.message}\n\t#{e.backtrace.join("\n\t")}"
          next
        end
      end
    end
    return errors
  end

  private

  # Count lines of apparently new content, and how many of them contain a hyperlink
  # @param lines Array of body text lines
  # @return [count of nonblank nonreply lines, count of those matching URIRX]
  def count_content_lines(lines)
    ctr = 0 # Count text lines of nonblank, nonreply content
    links = 0 # Count number of apparent hyperlinks
    lines.each do |l|
      case l
      when /\A\s*>/
        # Don't count reply lines, even when indented
      when /\A\s*\z/
        # Don't count blank lines
      when /\AOn.*wrote:\z/
        # Don't count most common reply header
      when /\A-----Original Message-----/
        # Stop counting if it seems like a forwarded message
        break
        # TODO: figure out if we're in a .sig block, and stop counting
      else
        links += 1 if l =~ URIRX
        ctr += 1
      end
    end
    [ctr, links]
  end
end

# ## ### #### ##### ######
# Check options and call needed methods
DEFAULT_OUTPUT = 'mbox-analysis.csv'

# Parse command line options from ARGV
# @return Hash with :dir, :output (both defaulted) and optionally :json
# Prints a message to stderr and exits 1 on invalid options
def optparse
  options = {}
  OptionParser.new do |opts|
    opts.on('-h') { puts opts; exit }
    opts.on('-dDIRECTORY', '--directory DIRECTORY', 'Local directory to read existing mboxes and dump output in (default: .)') do |d|
      if File.directory?(d)
        options[:dir] = d
      else
        raise ArgumentError, "-d #{d} is not a valid directory"
      end
    end
    opts.on('-oOUTPUT.CSV', '--output OUTPUT.CSV', "Filename to output rows into; default #{DEFAULT_OUTPUT}") do |o|
      options[:output] = o
    end
    opts.on('-j', '--json', 'Process .mbox to .json (optional)') do
      options[:json] = true
    end

    begin
      opts.parse!
      options[:dir] = '.' if options[:dir].nil?
      options[:output] = DEFAULT_OUTPUT if options[:output].nil?
    rescue StandardError => e
      $stderr.puts "#{e.message}; try -h for valid options, or see code"
      exit 1
    end
  end
  return options
end

# ## ### #### ##### ######
# Main method for command line use
if __FILE__ == $PROGRAM_NAME
  options = optparse
  if options[:json]
    puts "START: Parsing #{options[:dir]}/*#{MboxUtils::MBOX_EXT} into *.json"
    MboxUtils.scan_dir_mbox2stats(options[:dir]) # Side effect: writes out f.chomp(ext).json files
  end
  puts "START: Analyzing #{options[:dir]}/*.json into #{options[:output]}"
  errs = MboxUtils.scan_dir_stats2csv(options[:dir], options[:output])
  if errs
    errs.each do |e|
      puts "ERROR: #{e}"
    end
  end
  puts 'END'
end

tools/mboxhdr2csv.rb (382 lines of code) (raw):