#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

import asyncio

from elasticsearch import AsyncElasticsearch, Elasticsearch, helpers
from elasticsearch.helpers import async_scan

if not __package__:
    from plugins import generators, textlib # pylint: disable=no-name-in-module
else:
    from .plugins import generators, textlib # pylint: disable=no-name-in-module

import argparse
import base64
import email.utils
import hashlib
import multiprocessing
import time
import sys
import typing
import archiver

# Increment this number whenever breaking changes happen in the migration workflow:
MIGRATION_MAGIC_NUMBER = "2"


# Max number of parallel conversions to perform before pushing. 75-ish percent of max cores.
cores = multiprocessing.cpu_count()
MAX_PARALLEL_OPS = max(min(int((cores + 1) * 0.75), cores - 1), 1)

class MultiDocProcessor:
    """MultiProcess document processor"""

    def __init__(self, old_es_url: str, new_es_url: str, target: typing.Callable, num_processes: int = 8, graceful: bool = False):
        self.processes = []
        self.queues = []
        self.target = target
        self.graceful = graceful
        self.manager = multiprocessing.Manager()
        self.lock = self.manager.Lock()
        self.processed = self.manager.Value("i", 0)
        self.processed_last_count = 0
        self.start_time = time.time()
        self.queue_pointer = 0
        self.num_processes = num_processes
        for _ in range(0, num_processes):
            q: multiprocessing.Queue = multiprocessing.Queue()
            p = multiprocessing.Process(
                target=self.start,
                args=(
                    q,
                    old_es_url,
                    new_es_url,
                ),
            )
            self.queues.append(q)
            self.processes.append(p)
            p.start()

    def feed(self, *params):
        """Feed arguments to the next available processor"""
        self.queues[self.queue_pointer].put(params)
        self.queue_pointer += 1
        self.queue_pointer %= self.num_processes

    def sighup(self):
        for queue in self.queues:
            queue.put("SIGHUP")

    def stop(self):
        for queue in self.queues:
            queue.put("DONE")
        for proc in self.processes:
            proc.join()

    def status(self, total):
        processed = self.processed.value
        if processed - self.processed_last_count >= 1000:
            self.processed_last_count = processed
            now = time.time()
            time_spent = now - self.start_time
            docs_per_second = (processed / time_spent) or 1
            time_left = (total - processed) / docs_per_second

            # stringify time left
            time_left_str = "%u seconds" % time_left
            if time_left > 60:
                time_left_str = "%u minute(s), %u second(s)" % (int(time_left / 60), time_left % 60)
            if time_left > 3600:
                time_left_str = "%u hour(s), %u minute(s), %u second(s)" % (
                    int(time_left / 3600),
                    int(time_left % 3600 / 60),
                    time_left % 60,
                )

            print(
                "Processed %u documents, %u remain. ETA: %s (at %u documents per second)"
                % (processed, (total - processed), time_left_str, docs_per_second)
            )

    def start(self, queue, old_es_url, new_es_url):
        old_es = Elasticsearch([old_es_url])
        new_es = Elasticsearch([new_es_url])
        bulk_array = []
        while True:
            params = queue.get()
            if params == "SIGHUP":  # Push stragglers
                if bulk_array:
                    bulk_push(bulk_array, new_es, self.graceful)
                    bulk_array[:] = []
            elif params == "DONE":  # Close up shop completely
                if bulk_array:
                    bulk_push(bulk_array, new_es, self.graceful)
                old_es.close()
                new_es.close()
                return
            else:
                as_list = list(params)
                as_list.insert(0, old_es)
                try:
                    ret_val = self.target(*as_list)
                except Exception: # TODO: narrow exception
                    if self.graceful:
                        print("Unexpected error:", sys.exc_info()[0])
                    else:
                        print("Unexpected error:", sys.exc_info()[0])
                        raise
                if ret_val:
                    bulk_array.extend(ret_val)
                with self.lock:
                    self.processed.value += 1
                if len(bulk_array) >= 200:
                    bulk_push(bulk_array, new_es, self.graceful)
                    bulk_array[:] = []


def bulk_push(json, es, graceful=False):
    """Pushes a bunch of objects to ES in a bulk operation"""
    js_arr = []
    for entry in json:
        bulk_op = {
            "_op_type": "index",
            "_index": entry["index"],
            "_id": entry["id"],
            "_source": entry["body"],
        }
        js_arr.append(bulk_op)
    try:
        helpers.bulk(es, js_arr)
    except helpers.errors.BulkIndexError as e:
        if graceful:
            print("Bulk index error: %s" % e)
        else:
            raise


def process_document(old_es, doc, old_dbname, dbname_source, dbname_mbox, do_dkim):
    now = time.time()
    list_id = textlib.normalize_lid(doc["_source"]["list_raw"])
    try:
        source = old_es.get(index=old_dbname, doc_type="mbox_source", id=doc["_id"])
        # If we hit a 404 on a source, we have to fake an empty document, as we don't know the source.
    except Exception: # TODO: narrow exception
        print("Source for %s was not found, faking it..." % doc["_id"])
        source = {"_source": {"source": ""}}
    source_text: str = source["_source"]["source"]
    if ":" not in source_text:  # Base64
        source_text = base64.b64decode(source_text)
    else:  # bytify
        source_text = source_text.encode("utf-8", "ignore")
    archive_as_id = doc["_id"]
    if do_dkim:
        dkim_id = generators.dkimid(None, None, list_id, None, source_text)
        old_id = doc["_id"]
        archive_as_id = dkim_id
        doc["_source"]["mid"] = dkim_id
        doc["_source"]["permalinks"] = [dkim_id, old_id]
    else:
        doc["_source"]["permalinks"] = [doc["_id"]]

    doc["_source"]["dbid"] = hashlib.sha3_256(source_text).hexdigest()

    # Add in shortened body for search aggs
    # We add +1 to know whether to use ellipsis in reports.
    doc["_source"]["body_short"] = doc["_source"]["body"][:archiver.SHORT_BODY_MAX_LEN+1]

    # Add in gravatar
    header_from = doc["_source"]["from"]
    mailaddr = email.utils.parseaddr(header_from)[1]
    ghash = hashlib.md5(mailaddr.encode("utf-8")).hexdigest()
    doc["_source"]["gravatar"] = ghash

    # Append migration details to notes field in doc
    notes = doc["_source"].get("_notes", [])
    # We want a list, not a single string
    if isinstance(notes, str):
        notes = list(notes)
    notes.append(
        "MIGRATE: Document migrated from Pony Mail to Pony Mail Foal at %u, "
        "using foal migrator v/%s" % (now, MIGRATION_MAGIC_NUMBER)
    )
    # If we re-indexed the document, make a note of that as well.
    if do_dkim:
        notes.append("REINDEX: Document re-indexed with DKIM_ID at %u, " "from %s to %s" % (now, dkim_id, old_id))
    doc["_source"]["_notes"] = notes

    # Copy to new DB
    return (
        {"index": dbname_mbox, "id": archive_as_id, "body": doc["_source"]},
        {"index": dbname_source, "id": doc["_source"]["dbid"], "body": source["_source"]},
    )


def process_attachment(_old_es, doc, dbname_attachment):
    return ({"index": dbname_attachment, "id": doc["_id"], "body": doc["_source"]},)


async def main(args):
    no_jobs = args.jobs
    graceful = args.graceful
    print("Welcome to the Apache Pony Mail -> Foal migrator.")
    print("This will copy your old database, adjust the structure, and insert the emails into your new foal database.")
    print("We will be utilizing %u cores for this operation." % no_jobs)
    print("------------------------------------")
    old_es_url = args.old_url or input("Enter the full URL (including http/https) of your old ES server: ") or "http://localhost:9200/"
    new_es_url = args.new_url or input("Enter the full URL (including http/https) of your NEW ES server: ") or "http://localhost:9200/"
    if old_es_url == new_es_url:
        print("Old and new DB should not be the same, assuming error in input and exiting!")
        return
    ols_es_async = AsyncElasticsearch([old_es_url])

    old_dbname = args.old_name or input("What is the database name for the old Pony Mail emails? [ponymail]: ") or "ponymail"
    new_dbprefix = args.new_prefix or input("What is the database prefix for the new Pony Mail emails? [ponymail]: ") or "ponymail"

    do_dkim = True
    dkim_txt = (
        input(
            "Do you wish to perform DKIM re-indexing of all emails? This will NOT preserve all old permalinks currently "
            "(y/n) [y]: "
        )
        or "y"
    )
    if dkim_txt.lower() == "n":
        do_dkim = False

    # Define index names for new ES
    dbname_mbox = new_dbprefix + "-mbox"
    dbname_source = new_dbprefix + "-source"
    dbname_attachment = new_dbprefix + "-attachment"

    # Let's get started..!
    # start_time = time.time()
    count = await ols_es_async.count(index=old_dbname, doc_type="mbox")
    no_emails = count["count"]

    print("------------------------------------")
    print("Starting migration of %u emails, this may take quite a while..." % no_emails)

    processes = MultiDocProcessor(old_es_url, new_es_url, process_document, no_jobs)

    docs_read = 0
    async for doc in async_scan(
        client=ols_es_async,
        query={"query": {"match_all": {}}},
        doc_type="mbox",
        index=old_dbname,
    ):
        docs_read += 1
        processes.feed(doc, old_dbname, dbname_source, dbname_mbox, do_dkim)
        # Don't speed too far ahead of processing...
        processed = processes.processed.value
        while docs_read - processed > 100 * no_jobs:
            await asyncio.sleep(0.01)
            processed = processes.processed.value + 0

        processes.status(no_emails)

    # There may be some docs left over to push
    processes.sighup()
    while processed < no_emails:  # Wait for all documents to have been processed.
        await asyncio.sleep(1)
        print(f"Waiting for bulk push to complete ({processed} out of {no_emails} done...)")
        processed = processes.processed.value

    processes.stop()

    # Process attachments
    # start_time = time.time()
    processes = MultiDocProcessor(old_es_url, new_es_url, process_attachment, no_jobs, graceful)
    docs_read = 0
    count = await ols_es_async.count(index=old_dbname, doc_type="attachment")
    no_att = count["count"]
    print("Transferring %u attachments..." % no_att)
    async for doc in async_scan(
        client=ols_es_async,
        query={"query": {"match_all": {}}},
        doc_type="attachment",
        index=old_dbname,
    ):
        processes.feed(doc, dbname_attachment)
        docs_read += 1

        # Don't speed ahead
        processed = processes.processed.value + 0
        while docs_read - processed > 10 * no_jobs:
            await asyncio.sleep(0.01)
            processed = processes.processed.value + 0

        processes.status(no_att)

    # There may be some docs left over to push
    processes.sighup()
    while processed < no_att:  # Wait for all attachments to have been processed.
        await asyncio.sleep(1)
        print(f"Waiting for bulk push to complete ({processed} out of {no_att} done...)")
        processed = processes.processed.value

    processes.stop()
    await ols_es_async.close()
    print("All done, enjoy!")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--jobs",
        "-j",
        help="Number of concurrent processing jobs to run. Default is %u." % MAX_PARALLEL_OPS,
        type=int,
        default=MAX_PARALLEL_OPS,
    )
    parser.add_argument(
        "--graceful",
        "-g",
        help="Fail gracefully and continue if a processing error occurs",
        action='store_true'
    )
    # the default on macOS is spawn, but this fails with:
    #  ForkingPickler(file, protocol).dump(obj)
    # TypeError: cannot pickle 'weakref' object
    # Work-round: allow override of start method
    parser.add_argument(
        "--start_method",
        help="Override start method (e.g. fork on macos)",
        type=str
    )
    parser.add_argument(
        "--old_url",
        help="Provide input database URL",
        type=str
    )
    parser.add_argument(
        "--old_name",
        help="Provide input database name",
        type=str
    )
    parser.add_argument(
        "--new_url",
        help="Provide output database URL",
        type=str
    )
    parser.add_argument(
        "--new_prefix",
        help="Provide output database prefix",
        type=str
    )
    mainargs = parser.parse_args()
    if mainargs.start_method:
        multiprocessing.set_start_method(mainargs.start_method)
    asyncio.run(main(mainargs))