tools/copy-list.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Copy lists
This utility can be used to:
- copy a list within a database
- copy a list to a new database
"""

import sys
import time
import argparse

# Number of processed emails between bulk flushes (and progress messages).
BATCH_SIZE = 50


def normalize_list_id(lid, tagged=True):
    """Return *lid* in the form used for the ``list_raw`` field.

    ``@`` is replaced by ``.`` and any leading/trailing ``<``/``>``
    characters are stripped. When *tagged* is true the result is wrapped
    in a single pair of angle brackets; otherwise it is returned bare.
    """
    bare = lid.replace("@", ".").strip("<>")
    return "<%s>" % bare if tagged else bare


def build_query(list_id, wildcard=False):
    """Return the search body selecting documents whose ``list_raw`` matches.

    A ``wildcard`` clause is used when *wildcard* is true, an exact
    ``term`` clause otherwise.
    """
    clause = 'wildcard' if wildcard else 'term'
    return {
        'query': {
            'bool': {
                'must': [
                    {clause: {'list_raw': list_id}}
                ]
            }
        }
    }


def index_action(index, doc_type, doc_id, source):
    """Return one bulk 'index' action dict for the given document."""
    return {
        '_op_type': 'index',
        '_index': index,
        '_type': doc_type,
        '_id': doc_id,
        '_source': source
    }


def main():
    """Parse the command line and copy the selected list's documents.

    Calls ``sys.exit(-1)`` when there is nothing to do or when the
    database named by ``--newdb`` does not exist.
    """
    # Imported here rather than at module level so that the helpers above
    # can be imported without the `elastic` module (which is not part of
    # the standard library) being available.
    from elastic import Elastic

    # get config and set up default database
    es = Elastic()
    # default database name
    dbname = es.getdbname()

    parser = argparse.ArgumentParser(description='Command line options.')
    parser.add_argument('--source', dest='source', type=str, required=True,
                        metavar='<list id>', help='Source list to edit')
    parser.add_argument('--target', dest='target', type=str,
                        metavar='<list id>', help='(optional) new list ID')
    parser.add_argument('--newdb', dest='newdb', type=str,
                        metavar='<index name>', help='(optional) new ES database name')
    parser.add_argument('--wildcard', dest='glob', action='store_true',
                        help='Allow wildcards in --source')
    parser.add_argument('--notag', dest='notag', action='store_true',
                        help='List IDs do not have <> in them')
    args = parser.parse_args()

    target_lid = args.target
    newdb = args.newdb
    wildcard = args.glob

    if not (target_lid or newdb):
        print("Nothing to do! No target list ID or DB name specified")
        parser.print_help()
        sys.exit(-1)

    source_lid = normalize_list_id(args.source, tagged=not args.notag)

    # Copying to another database without renaming: keep the list ID.
    if newdb and not target_lid:
        target_lid = source_lid
    if target_lid:
        # NOTE(review): the target ID is always wrapped in <>, even when
        # --notag is given; confirm whether that is intended.
        target_lid = normalize_list_id(target_lid)
        if target_lid == source_lid and not newdb:
            print("Nothing to do! Target same as source")
            parser.print_help()
            sys.exit(-1)

    print("Beginning list copy:")
    print(" - Source ID: %s" % source_lid)
    if target_lid:
        print(" - Target ID: %s" % target_lid)
    if newdb:
        print(" - Target DB: %s" % newdb)
        if not es.indices.exists(newdb):
            print("Target database does not exist!")
            sys.exit(-1)

    target_index = newdb if newdb else dbname
    rename = target_lid != source_lid

    count = 0
    print("Updating docs...")
    then = time.time()
    js_arr = []  # pending bulk actions
    for page in es.scan_and_scroll(body=build_query(source_lid, wildcard)):
        for hit in page['hits']['hits']:
            srcdoc = hit['_id']  # id the document is stored under today
            doc = srcdoc         # id it will be written under
            body = es.get(doc_type='mbox', id=srcdoc)
            if rename:
                doc = srcdoc.replace(source_lid, target_lid)
                body['_source']['mid'] = doc
                body['_source']['list_raw'] = target_lid
                body['_source']['list'] = target_lid
            js_arr.append(index_action(target_index, 'mbox', doc, body['_source']))

            # The raw source is looked up under the original id; a 404 is
            # tolerated and reported instead of raising.
            source = es.get(doc_type='mbox_source', id=srcdoc, ignore=404)
            if source['found']:
                js_arr.append(index_action(target_index, 'mbox_source', doc, source['_source']))
            else:
                # Report the id that was actually looked up.
                print("Source for %s not found, hmm..." % srcdoc)

            count += 1
            if count % BATCH_SIZE == 0:
                print("Processed %u emails..." % count)
                es.bulk(js_arr)
                js_arr = []

    # Flush whatever is left over from the last partial batch.
    if js_arr:
        es.bulk(js_arr)

    print("All done, processed %u docs in %u seconds" % (count, time.time() - then))


if __name__ == "__main__":
    main()

tools/copy-list.py (106 lines of code) (raw):