#!/usr/bin/env python3
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at https://mozilla.org/MPL/2.0/.
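
# Example invocation (the script filename and local paths here are
# illustrative; only the flags are defined by this tool):
#   python3 generate_fenix_csv.py -r ~/src/fenix -t view -d p2 \
#       -c ./perfherder-cache -o ./csvs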

import argparse
import collections
import csv
import datetime
import git
import json
import urllib.request
import pathlib
from redo import retry
import requests
import statistics
import tempfile


RETRY_SLEEP = 10
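# ActiveData query matching every mozilla-central task whose run name contains
# both "fenix" and "perftest". build_csv() appends test-name and device
# filters (and optionally the try branch) before the query is sent.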
AD_QUERY = {
    "from": "task",
    "where": {
        "and": [
            {"in": {"repo.branch.name": ["mozilla-central"]}},
            {"regex": {"run.name": ".*fenix.*"}},
            {"regex": {"run.name": ".*perftest.*"}},
        ]
    },
    "select": ["task.artifacts", "action.start_time", "task.id"],
    "limit": 100000,
}

DEFAULT_TEST_NAME = "view"


def csv_generation_parser():
    """Parser for the CSV generation script."""
    parser = argparse.ArgumentParser(
        description="Run this tool to build CSVs containing Fenix data from "
        "tasks running with the multi-commit paradigm in mozilla-central "
        "(the tasks must have perfherder data)."
    )
    parser.add_argument(
        "-t",
        "--test-name",
        type=str,
        default=DEFAULT_TEST_NAME,
        help="The name of the test to get data from (must exist in the task name). "
        + "Defaults to `view`. To get view data before Jul. 31, 2020, use `applink`.",
    )
    parser.add_argument(
        "-d",
        "--device",
        type=str,
        choices=["p2", "g5"],
        default="p2",
        help="Device to get data from.",
    )
    parser.add_argument(
        "-c",
        "--cache-path",
        type=str,
        default=None,
        help="Path to a cache for perfherder artifacts (so you don't re-download them). "
        + "Disabled by default.",
    )
    parser.add_argument(
        "-r",
        "--fenix-repo",
        type=str,
        required=True,
        help="Path to a local Fenix github repo.",
    )
    parser.add_argument(
        "-o",
        "--output",
        type=str,
        default=None,
        help="Path to the output directory. Defaults to current working directory.",
    )
    parser.add_argument(
        "--try",
        action="store_true",
        dest="try_data",
        default=False,
        help="Include data from the try server.",
    )
    parser.add_argument(
        "--replicates",
        action="store_true",
        default=False,
        help="Gather the replicates instead of the medians.",
    )
    parser.add_argument(
        "--median-per-day",
        action="store_true",
        default=False,
        help="Returns a single result per day - the median - instead of per commit runs",
    )
    return parser


def query_activedata(query_json):
    """Used to run queries on active data."""
    active_data_url = "http://activedata.allizom.org/query"

    req = urllib.request.Request(active_data_url)
    req.add_header("Content-Type", "application/json")
    jsondata = json.dumps(query_json)

    jsondataasbytes = jsondata.encode("utf-8")
    req.add_header("Content-Length", len(jsondataasbytes))

    print("Querying Active-data...")
    response = urllib.request.urlopen(req, jsondataasbytes)
    print("Status:" + str(response.getcode()))

    data = json.loads(response.read().decode("utf8").replace("'", '"'))["data"]
    return data


def download_file(url, target, retry_sleep=RETRY_SLEEP, attempts=3):
    """Downloads a file, given an URL in the target path.

    The function will attempt several times on failures.
    """

    def _download_file(url, target):
        req = requests.get(url, stream=True, timeout=30)
        target_dir = target.parent.resolve()
        target_dir.mkdir(parents=True, exist_ok=True)

        with target.open("wb") as f:
            for chunk in req.iter_content(chunk_size=1024):
                if not chunk:
                    continue
                f.write(chunk)
                f.flush()
        return target

    return retry(
        _download_file,
        args=(url, target),
        attempts=attempts,
        sleeptime=retry_sleep,
        jitter=0,
    )


def build_csv(
    fenix_repo,
    test_name=DEFAULT_TEST_NAME,
    device_name="p2",
    output=None,
    cache_path=None,
    try_data=False,
    medians=False,
    median_per_day=False,
):
    """Generates a CSV file containing per-commit fenix data
    for a given test name.
    """
    if not medians and median_per_day:
        raise NotImplementedError(
            "Please specify either --replicates or --median-per-day. I didn't know\n"
            + "how these would work together so I didn't implement it."
        )

    if cache_path:
        cache_path = pathlib.Path(cache_path)
        cache_path.mkdir(parents=True, exist_ok=True)
    else:
        cache_path = pathlib.Path(tempfile.mkdtemp())

    if output:
        output = pathlib.Path(output)
        output.mkdir(parents=True, exist_ok=True)
    else:
        output = pathlib.Path(".")

    # Initialize the git directory now before the long steps below
    fenix = git.Repo(fenix_repo)

    print(f"Generating data for {test_name} on the {device_name} device...")

    ## Get the AD data
    AD_QUERY["where"]["and"].extend(
        [
            {"regex": {"run.name": ".*%s.*" % test_name}},
            {"regex": {"run.name": ".*-%s-.*" % device_name}},
        ]
    )
    if try_data:
        AD_QUERY["where"]["and"][0]["in"]["repo.branch.name"].append("try")
    data = query_activedata(AD_QUERY)

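    # The selected columns come back as parallel lists, so
    # data["task.artifacts"][c] belongs to the task with id data["task.id"][c].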
    allph = []
    for c, artifacts in enumerate(data["task.artifacts"]):
        if not artifacts:
            continue

        for artifact in artifacts:
            if not artifact:
                continue
            if not isinstance(artifact, dict):
                continue

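            # Keep only perfherder artifacts whose filename is prefixed with a
            # revision (e.g. "<rev>-perfherder-data.json"); names that start
            # with "perfherder" carry no revision and are skipped.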
            ph = artifact["url"].split("/")[-1]
            if "perfherder" not in ph or ph.startswith("perfherder"):
                continue

            allph.append((artifact["url"], ph, ph.split("-")[0], data["task.id"][c]))

    ## Download the perfherder data and get its commit date
    nallph = []
    for url, file, rev, taskid in allph:
        file = f"{taskid}-{test_name}-{device_name}-{file}"
        fp = pathlib.Path(cache_path, file)
        if not fp.exists():
            print(f"Downloading to {fp}")
            download_file(url, fp)
        with fp.open() as f:
            phd = json.load(f)

        # sanity checks
        if (
            "suites" not in phd
            or len(phd["suites"]) == 0
            or "value" not in phd["suites"][0]
        ):
            print("Bad data, skipping...")
            continue

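        # By default we keep the suite-level summary value (the median);
        # with --replicates we keep every individual replicate instead.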
        try:
            commitdate = fenix.commit(rev).committed_date
            vals = phd["suites"][0].get("subtests", [{}])[0].get("replicates", [])
            if medians:
                vals = [phd["suites"][0]["value"]]
            for val in vals:
                nallph.append((commitdate, val, rev))
        except ValueError:
            # Some revisions referenced by the task data can't be found in the
            # fenix repo (it's unclear how builds exist for them), so skip them.
            print(f"Failed to find an actual commit for {rev}")

    # Sort the data by time
    allphs = sorted(nallph, key=lambda x: x[0])

    if median_per_day:
        allphs = transform_to_median_per_day(allphs)

    ## Store as a CSV
    csvfile_human_readable = pathlib.Path(output, f"{test_name}-{device_name}.csv")
    csvfile_raw = pathlib.Path(output, f"{test_name}-{device_name}-raw.csv")
    write_csv(csvfile_human_readable, optimize_for_human_readability(allphs))
    write_csv(csvfile_raw, allphs)
    print(
        f"Finished generation. Data contained in {str(csvfile_human_readable)} & {str(csvfile_raw)}"
    )

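    # Plotting is optional: when matplotlib is available, show a quick
    # scatter of the values over time as a visual sanity check.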
    try:
        from matplotlib import pyplot as plt

        plot_allphs = optimize_for_plotting(allphs)
        plt.figure()
        plt.plot_date([v[0] for v in plot_allphs], [v[1] for v in plot_allphs])
        plt.show()
    except ImportError:
        print("Skipping print stage, cannot find matplotlib")
        return


def transform_to_median_per_day(data):
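    """Collapses the (timestamp, value, revision) rows into one row per
    calendar day, keeping the median value; the revision becomes "N/A".
    """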
    date_to_iterations = collections.defaultdict(list)
    for row in data:
        dt = datetime.datetime.fromtimestamp(row[0])
        ymd = datetime.datetime(dt.year, dt.month, dt.day)
        date_to_iterations[ymd].append(row[1])

    out_data = []
    for date, times in date_to_iterations.items():
        out_data.append([date.timestamp(), statistics.median(times), "N/A"])
    return out_data


def write_csv(csvfile, data):
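    """Writes the rows to `csvfile` under a "times", "data", "revision" header."""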
    with csvfile.open("w") as f:
        writer = csv.writer(f)
        writer.writerow(["times", "data", "revision"])
        writer.writerows(data)


def optimize_for_human_readability(data):
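    """Formats rows for human readers: readable date strings, rounded values,
    and abbreviated commit hashes.
    """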
    def transform_row(row):
        dt = datetime.datetime.fromtimestamp(row[0])
        date_str = dt.strftime("%Y-%m-%d %H:%M:%S")
        rounded_value = round(row[1])
        abbrev_commit = row[2][:9]
        return [date_str, rounded_value, abbrev_commit]

    return [transform_row(list(row)) for row in data]


def optimize_for_plotting(data):
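    """Converts epoch timestamps to matplotlib date numbers for plt.plot_date()."""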
    import matplotlib.dates

    def transform_row(row):
        matplot_date = matplotlib.dates.epoch2num(row[0])
        return [matplot_date] + row[1:]

    return [transform_row(list(row)) for row in data]


if __name__ == "__main__":
    args = csv_generation_parser().parse_args()
    build_csv(
        args.fenix_repo,
        test_name=args.test_name,
        device_name=args.device,
        output=args.output,
        cache_path=args.cache_path,
        try_data=args.try_data,
        medians=not args.replicates,
        median_per_day=args.median_per_day,
    )
