tools/edit-list.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Modify lists and messages

This utility can be used to:
- rename a list
- make a list private
- make a list public
- update the description for a list
- delete mails from a list (does not delete mbox_source entries)
- obfuscate some fields (from, subject, body) in an mbox entry (does not obfuscate the raw source document)
"""

import sys
import time
import argparse
from elastic import Elastic

# Number of bulk actions sent to the database per request.
BATCH_SIZE = 500

# Fields of an mbox document that --obfuscate rewrites.
OBFUSCATED_FIELDS = ('body', 'subject', 'from')


class options:
    """
    Parsed and validated command line options.

    Instantiating the class parses sys.argv. If the combination of options
    is invalid, a message and the usage text are printed and the process
    exits with status -1.

    Attributes set on the instance:
      sourceLID     normalised source list ID, or None when --mid is used
      targetLID     normalised new list ID (--rename), or None
      desc          new list description, or None
      makePrivate   True if --private was given
      makePublic    True if --public was given
      deleteEmails  True if --delete was given
      wildcard      True if --wildcard was given
      debug         True if --debug was given
      notag         True if --notag was given
      mid           Message-ID to edit, or None when --source is used
      obfuscate     text to replace with "..." in body/subject/from, or None
      dryrun        True if --test was given
    """

    def __init__(self):
        parser = argparse.ArgumentParser(description='Command line options.')
        # Cannot have both source and mid as input
        source_group = parser.add_mutually_exclusive_group()
        source_group.add_argument('--source', dest='source', type=str,
                                  help='Source list to edit')
        source_group.add_argument('--mid', dest='mid', type=str,
                                  help='Source Message-ID to edit')
        parser.add_argument('--rename', dest='target', type=str,
                            help='(optional) new list ID')
        parser.add_argument('--desc', dest='desc', type=str,
                            help='(optional) new list description')
        parser.add_argument('--obfuscate', dest='obfuscate', type=str,
                            help='Things to obfuscate in body, if any')
        # private and public are mutually exclusive
        privacy_group = parser.add_mutually_exclusive_group()
        privacy_group.add_argument('--private', dest='private', action='store_true',
                                   help='Make all emails in list private')
        privacy_group.add_argument('--public', dest='public', action='store_true',
                                   help='Make all emails in list public')
        parser.add_argument('--delete', dest='delete', action='store_true',
                            help='Delete emails from this list')
        parser.add_argument('--wildcard', dest='glob', action='store_true',
                            help='Allow wildcards in --source')
        parser.add_argument('--debug', dest='debug', action='store_true',
                            help='Debug output - very noisy!')
        parser.add_argument('--notag', dest='notag', action='store_true',
                            help='List IDs do not have <> in them')
        parser.add_argument('--test', dest='test', action='store_true',
                            help='Only test for occurrences, do not run the chosen action (dry run)')

        args = parser.parse_args()

        self.sourceLID = args.source
        self.targetLID = args.target
        self.desc = args.desc
        self.makePrivate = args.private
        self.makePublic = args.public
        self.deleteEmails = args.delete
        self.wildcard = args.glob
        self.debug = args.debug
        self.notag = args.notag
        self.mid = args.mid
        self.obfuscate = args.obfuscate
        self.dryrun = args.test

        # Derived flags used only for validating the option combination.
        self.privacyChange = self.makePrivate or self.makePublic
        self.otherChange = self.targetLID or self.desc or self.obfuscate
        self.anyChange = self.privacyChange or self.otherChange

        if not self.sourceLID and not self.mid:
            print("No source list ID specified!")
            parser.print_help()
            sys.exit(-1)
        if not (self.anyChange or self.deleteEmails):
            print("Nothing to do! No target list ID or action specified")
            parser.print_help()
            sys.exit(-1)
        if self.desc and not self.sourceLID:
            print("No source list ID specified for description!")
            parser.print_help()
            sys.exit(-1)
        if self.anyChange and self.deleteEmails:
            print("Cannot both change and delete emails in the same run")
            parser.print_help()
            sys.exit(-1)

        # TODO does it make sense to allow --rename with --mid?
        # i.e. rename the list for a single mid?

        # Normalise list IDs: '@' becomes '.', and the ID is wrapped in <>
        # (the source ID is left unwrapped when --notag is given).
        if self.sourceLID:
            self.sourceLID = ("%s" if self.notag else "<%s>") % self.sourceLID.replace("@", ".").strip("<>")
        if self.targetLID:
            self.targetLID = "<%s>" % self.targetLID.replace("@", ".").strip("<>")


def process_hits(page, args, dbname):
    """
    Processes each hit in a scroll search and proposes changes
    in the array returned.

    page:   one page of search results; hits are read from page['hits']['hits']
    args:   the parsed options (obfuscate, targetLID, makePrivate, makePublic,
            deleteEmails and dryrun are consulted)
    dbname: name of the index the bulk actions are addressed to

    Returns a list with one entry per hit: a bulk action dict, or an empty
    dict when args.dryrun is set (so the entry can be counted but never run).
    """
    changes = []
    if 'hits' in page and 'hits' in page['hits']:
        for hit in page['hits']['hits']:
            if args.dryrun:
                # Empty action for counting if dryrun, so we never accidentally run it.
                changes.append({})
                continue
            doc = hit['_id']
            body = {}
            if args.obfuscate:
                source = hit.get('_source') or {}
                for field in OBFUSCATED_FIELDS:
                    value = source.get(field)
                    # Skip fields that are absent or not text rather than
                    # aborting the whole run on one odd document.
                    if isinstance(value, str):
                        body[field] = value.replace(args.obfuscate, "...")
            if args.targetLID:
                body['list_raw'] = args.targetLID
                body['list'] = args.targetLID
            if args.makePrivate:
                body['private'] = True
            if args.makePublic:
                body['private'] = False
            changes.append({
                '_op_type': 'delete' if args.deleteEmails else 'update',
                '_index': dbname,
                '_type': 'mbox',
                '_id': doc,
                'doc': body
            })
    return changes


def _batches(items, size):
    """Yield consecutive slices of items, each holding at most size entries."""
    for start in range(0, len(items), size):
        yield items[start:start + size]


def main():
    """
    Entry point: parse the options, report the planned actions, then apply
    the description update and/or the per-document changes.
    """
    # get config and set up default database
    es = Elastic()
    # default database name
    dbname = es.getdbname()

    args = options()

    print("Beginning list edit:")
    if args.sourceLID:
        print(" - List ID: %s" % args.sourceLID)
    else:
        print(" - MID: %s" % args.mid)
    if args.targetLID:
        print(" - Target ID: %s" % args.targetLID)
    if args.makePublic:
        print(" - Action: Mark all emails public")
    if args.makePrivate:
        print(" - Action: Mark all emails private")
    if args.deleteEmails:
        print(" - Action: Delete emails (sources will be kept!)")
    if args.obfuscate:
        print(" - Action: Obfuscate parts of email containing: %s" % args.obfuscate)
    if args.desc:
        print(" - Action: add description: %s" % args.desc)
        if args.dryrun:
            print("DRY RUN - NO CHANGES WILL BE MADE")
        else:
            # The description is stored under the new list ID when the list
            # is being renamed in the same run, otherwise under the source ID.
            LID = args.sourceLID
            if args.targetLID:
                LID = args.targetLID
            es.index(
                doc_type="mailinglists",
                id=LID,
                body={
                    'list': LID,
                    'name': LID,
                    'description': args.desc
                }
            )
            print("All done, updated description.")

    if args.targetLID or args.makePrivate or args.makePublic or args.deleteEmails or args.mid or args.obfuscate:
        if args.dryrun:
            print("DRY RUN - NO CHANGES WILL BE MADE")
        print("Updating docs...")
        then = time.time()
        # Select either every document of the list or the single message.
        terms = {
            'wildcard' if args.wildcard else 'term': {
                'list_raw': args.sourceLID
            }
        }
        if args.mid:
            terms = {
                'term': {
                    'mid': args.mid
                }
            }
        query = {
            # The document contents are only needed when obfuscating.
            '_source': ['body', 'subject', 'from'] if args.obfuscate else False,
            'query': {
                'bool': {
                    'must': [
                        terms
                    ]
                }
            }
        }

        proposed_changes = []
        for page in es.scan_and_scroll(body=query):
            prop = process_hits(page, args, dbname)
            if prop:
                proposed_changes.extend(prop)

        count = len(proposed_changes)
        processed = 0
        # Handle proposed changes in batches of BATCH_SIZE; the last batch
        # holds whatever stragglers remain.
        for batch in _batches(proposed_changes, BATCH_SIZE):
            if not args.dryrun:
                es.bulk(batch)
            processed += len(batch)
            print("Processed %u documents..." % processed)

        print("All done, processed %u docs in %u seconds" % (count, time.time() - then))


if __name__ == '__main__':
    main()

tools/edit-list.py (186 lines of code) (raw):