tools/collate-mboxes.py (61 lines of code) (raw):

#!/usr/bin/env python3

"""
Simple tool for collating multiple mbox files into a single one, sorted by message ID.

Usage: collate-mboxes.py [--ezmlm] <output mbox> [<input mbox> ...]

If the message-ID is missing, use the Date or Subject and prefix the sort key
to appear last. A message that has none of Message-ID, Date or Subject cannot
be keyed; it is logged and skipped.

Can optionally sort by ezmlm number (ordered numerically).
This should be less likely to have missing numbers or duplicate entries.
However duplicates can occur in archive files if:
- the sequence number was reset at any point
- multiple mailing lists were merged
- messages were somehow duplicated before archival

Used for multi-import tests where you wish to check that multiple sources
give the same ID

Emails with duplicate sort keys are logged and only one copy is kept:
the one read last replaces any earlier message with the same key.
"""

import argparse
import mailbox
import re
import sys

# Extracts the ezmlm sequence number from the envelope "From " line.
_EZMLM_ID = re.compile(r"return-(\d+)-")


def _parse_args(argv=None):
    """Parse the command line.

    argv is the list of arguments to parse; None means use sys.argv[1:].
    Exits with a usage error if the output file name is missing.
    """
    parser = argparse.ArgumentParser(description='Command line options.')
    parser.add_argument('--ezmlm', dest='ezmlm', action='store_true',
                        help="Use ezmlm numbering for sorting")
    parser.add_argument('args', nargs=argparse.REMAINDER)
    options = parser.parse_args(argv)
    if not options.args:
        # Without this check the script died with a bare IndexError.
        parser.error("missing output mbox file name")
    return options


def _sort_key(message, use_ezmlm):
    """Work out the sort key for a single message.

    Returns a (sortkey, lacks_id) tuple:
    - sortkey is a string, or None if the message cannot be keyed and
      has to be skipped (the reason is printed here);
    - lacks_id is True if, in non-ezmlm mode, the message has no
      Message-ID header.
    """
    if use_ezmlm:
        from_ = message.get_from()
        m = _EZMLM_ID.search(from_)
        if m:
            return m.group(1), False
        print("Failed to find ezmlm id in %s" % from_)
        return None, False

    # Header values are passed through str() because the email package
    # returns a Header object (which has no strip()) rather than a str
    # when the raw header contains 8-bit bytes.
    msgid = message.get('message-id')
    if msgid:
        return str(msgid).strip(), False

    print("No message id, sorting by date or subject: ", message.get_from())
    altid = message.get('date') or message.get('subject')
    if altid is None:
        # Nothing at all to sort on: skip rather than crash on None.
        print("No date or subject either, skipping: ", message.get_from())
        return None, True
    return "~" + str(altid).strip(), True  # try to ensure it sorts last


def _collate(msgfiles, use_ezmlm):
    """Read every message from the given mbox files.

    Returns (allmessages, crlf, noid, skipped) where:
    - allmessages maps sort key -> raw message bytes (including the
      "From " line); a later message replaces an earlier one with the
      same key;
    - crlf is True/False depending on whether the first line of the first
      stored message ends with CRLF, or None if nothing was stored;
    - noid counts messages without a Message-ID;
    - skipped counts messages that were dropped (no usable key, or
      duplicate key).
    """
    allmessages = {}
    noid = 0
    skipped = 0
    crlf = None  # assume that all emails have the same EOL
    for msgfile in msgfiles:
        messages = mailbox.mbox(msgfile, None, create=False)
        try:
            for key in messages.iterkeys():
                message = messages.get(key)
                sortkey, lacks_id = _sort_key(message, use_ezmlm)
                if lacks_id:
                    noid += 1
                if sortkey is None:
                    skipped += 1
                    continue
                # store the data; True => include the "From " line
                with messages.get_file(key, True) as msg_file:
                    message_raw = msg_file.readline()
                    if crlf is None:
                        crlf = message_raw.endswith(b'\r\n')
                    message_raw += msg_file.read()
                if sortkey in allmessages:
                    print("Duplicate sort key: %s" % sortkey)
                    skipped += 1
                allmessages[sortkey] = message_raw
        finally:
            # Release the underlying file even if a message fails to load.
            messages.close()
    return allmessages, crlf, noid, skipped


def _write_mbox(outmbox, allmessages, crlf, numeric):
    """Write the collated messages to outmbox in sort-key order.

    Each message is followed by one blank line using the detected EOL.
    If numeric is true the keys are compared as integers, so that ezmlm
    number 9 comes before 10. Returns the number of messages written.
    """
    eol = b'\r\n' if crlf else b'\n'
    ordered = sorted(allmessages, key=int) if numeric else sorted(allmessages)
    with open(outmbox, "wb") as f:
        for key in ordered:
            f.write(allmessages[key])
            f.write(eol)
    return len(ordered)


def main(argv=None):
    """Collate the input mboxes named on the command line into one file.

    argv is the argument list (None means sys.argv[1:]); the first
    positional argument is the output file, the rest are input files.
    Returns 0 on completion.
    """
    options = _parse_args(argv)
    outmbox = options.args[0]
    msgfiles = options.args[1:]  # multiple input files allowed

    allmessages, crlf, noid, skipped = _collate(msgfiles, options.ezmlm)
    nw = _write_mbox(outmbox, allmessages, crlf, options.ezmlm)

    print("Wrote %u emails to %s with CRLF %s (%u without message-id) WARN: %u skipped" %
          (nw, outmbox, crlf, noid, skipped))
    return 0


if __name__ == "__main__":
    sys.exit(main())