#!/usr/bin/env python3

import argparse
import base64
import datetime
import hashlib
import hmac
import json
import logging
import logging.handlers
import os
import re
import shutil
import subprocess
import sys
import time
import requests
from urllib.parse import urlparse

# User accounts running this script:
# _azbatch  - user account running scripts during Azure ML Compute Instance creation - can sudo
# azureuser - user account running scripts after Azure ML Compute Instance creation - can sudo on Compute Instances created with rootAccess = true (default)
# root      - does not need introduction...

# Module-wide logger; its level and handler are configured in the __main__ section of this file.
_logger = logging.getLogger("amlsecscan")
# Value reported in the "Computer" telemetry column.
# NOTE: read at import time, so a missing CI_NAME variable raises KeyError before anything else runs.
_computer = os.environ["CI_NAME"]
# The URI path starts with "/", so split("/", 3) yields ["", <segment 1>, <segment 2>, <rest>];
# the first two path segments are dropped and the rest is re-rooted with a leading "/".
# NOTE(review): presumably the dropped prefix is the MLflow API prefix - confirm against a real tracking URI.
_azure_ml_resource_id = (
    "/" + urlparse(os.environ["MLFLOW_TRACKING_URI"]).path.split("/", 3)[3]
)  # Get the ARM Resource ID of the Azure ML Workspace we are running on

# Configuration priority: 1) command-line parameters, 2) local config file, 3) global config file
# Folder holding the global config file (and, per _install, the generated run.sh script).
_config_folder_path = "/home/azureuser/.amlsecscan"
_global_config_path = _config_folder_path + "/config.json"
# Local config file: same location and base name as this script, with a ".json" extension.
_local_config_path = os.path.abspath(os.path.splitext(__file__)[0] + ".json")


# Replacement for azure.identity.DefaultAzureCredential().get_token since azure.identity is not available in the conda base environment and does not handle Azure ML's MSI
def _get_access_token(resource):
    """Return an AAD access token for ``resource`` obtained from the MSI endpoint.

    Args:
        resource: Audience of the requested token (e.g. "https://management.azure.com").

    Returns:
        The "access_token" field of the MSI endpoint's JSON response.

    Raises:
        requests.HTTPError: if the MSI endpoint answers with an error status.
        requests.Timeout: if the MSI endpoint does not answer in time.
        KeyError: if the MSI variables cannot be found or the response has no "access_token".
    """
    # Ensure the MSI environment variables are set (by default, they are set in shells when running in AML Studio Terminal but not when running in CRON)
    if "MSI_ENDPOINT" not in os.environ or "MSI_SECRET" not in os.environ:
        env_var = _get_auth_environment_variables()
        os.environ["MSI_ENDPOINT"] = env_var["MSI_ENDPOINT"]
        os.environ["MSI_SECRET"] = env_var["MSI_SECRET"]

    url = f"{os.environ['MSI_ENDPOINT']}?resource={resource}&api-version=2017-09-01"

    # Only forward the client ID when the whole value is a GUID. fullmatch() (instead of an
    # unanchored match()) prevents trailing text after a valid GUID from being appended to the URL.
    client_id = os.environ.get("DEFAULT_IDENTITY_CLIENT_ID", "").strip()
    if re.fullmatch(
        "[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}",
        client_id,
        re.IGNORECASE,
    ):
        url = f"{url}&clientid={client_id}"

    # A timeout keeps unattended (cron) runs from hanging forever on an unresponsive endpoint
    resp = requests.get(
        url, headers={"Secret": os.environ["MSI_SECRET"]}, timeout=30
    )
    resp.raise_for_status()
    return resp.json()["access_token"]


def _run(command, check=True):
    """Execute ``command`` through the shell and return the completed process.

    stdout and stderr are captured as text. When ``check`` is true and the command
    exits with a non-zero status, the failure (including both captured streams) is
    logged and the ``subprocess.CalledProcessError`` is re-raised.
    """
    # To be compatible with Python 3.6 (default python for root user), 'text' and 'capture_output' cannot be used
    run_options = {
        "shell": True,
        "check": check,
        "universal_newlines": True,
        "stdout": subprocess.PIPE,
        "stderr": subprocess.PIPE,
    }
    try:
        completed = subprocess.run(command, **run_options)
    except subprocess.CalledProcessError as error:
        _logger.exception(
            f"Error: {error}\n    stdout:\n{error.stdout}\n    stderr:\n{error.stderr}"
        )
        raise
    return completed


class StdOutTelemetry:
    """Telemetry sink that prints every batch of rows to stdout as a single JSON line."""

    def send(self, log_type, data):
        """Print ``data`` wrapped as ``{"table": log_type, "rows": data}``."""
        envelope = {"table": log_type, "rows": data}
        serialized = json.dumps(envelope)
        print(serialized)


class LogAnalyticsTelemetry:
    """Telemetry sink that uploads rows to a Log Analytics workspace.

    Constructing an instance resolves the workspace ARM Resource ID (constructor
    argument first, then the local config file, then the global config file) and
    queries ARM for the workspace customer ID and primary shared key. ``send``
    uses both to sign and post each batch of rows.
    """

    # Seconds to wait for any single HTTP request. Without a timeout, `requests`
    # can block indefinitely, which would pile up hung processes under cron.
    _REQUEST_TIMEOUT_S = 60

    def __init__(self, log_analytics_resource_id):
        """Resolve the target workspace and fetch its customer ID and shared key.

        Args:
            log_analytics_resource_id: ARM Resource ID of the Log Analytics workspace,
                or None to read "logAnalyticsResourceId" from a config file.

        Raises:
            ValueError: if no resource ID can be resolved or it has the wrong shape.
            requests.HTTPError: if ARM answers one of the lookups with an error status.
        """
        # Get the ARM Resource ID of the Log Analytics Workspace
        if log_analytics_resource_id is None:
            # The local config file takes precedence over the global one
            config_path = next(
                (
                    path
                    for path in (_local_config_path, _global_config_path)
                    if os.path.exists(path)
                ),
                None,
            )
            if config_path is not None:
                _logger.debug(f"Loading configuration from {config_path}")
                with open(config_path, "rt") as file:
                    config = json.load(file)
                # .get(): a config file lacking the key ends in the descriptive
                # "missing" ValueError below rather than in a bare KeyError
                log_analytics_resource_id = config.get("logAnalyticsResourceId")
        self.log_analytics_resource_id = _sanitize_log_analytics_resource_id(
            log_analytics_resource_id
        )

        # Get an AAD access token for ARM
        access_token = _get_access_token("https://management.azure.com")
        headers = {
            "Authorization": "Bearer " + access_token
        }  # [SuppressMessage("Microsoft.Security", "CS001:SecretInline", Justification="No secret")]

        # Get the Log Analytics Customer ID from ARM
        response = requests.get(
            "https://management.azure.com"
            + self.log_analytics_resource_id
            + "?api-version=2021-06-01",
            headers=headers,
            timeout=self._REQUEST_TIMEOUT_S,
        )
        response.raise_for_status()
        self.log_analytics_customer_id = response.json()["properties"]["customerId"]

        # Get the Log Analytics Shared Key from ARM
        response = requests.post(
            "https://management.azure.com"
            + self.log_analytics_resource_id
            + "/sharedKeys?api-version=2020-08-01",
            headers=headers,
            timeout=self._REQUEST_TIMEOUT_S,
        )
        response.raise_for_status()
        self.log_analytics_shared_key = response.json()["primarySharedKey"]

        _logger.debug(f"Azure ML Workspace ARM Resource ID: {_azure_ml_resource_id}")
        _logger.debug(
            f"Log Analytics Workspace ARM Resource ID: {self.log_analytics_resource_id}"
        )
        _logger.debug(f"Log Analytics Customer ID: {self.log_analytics_customer_id}")

    # From: https://docs.microsoft.com/en-us/azure/azure-monitor/logs/data-collector-api#python-sample
    def _build_signature(self, date, content_length, method, content_type, resource):
        """Return the "SharedKey <customer id>:<signature>" Authorization header value.

        The signature is the base64-encoded HMAC-SHA256 (keyed with the base64-decoded
        shared key) of the newline-joined method, content length, content type,
        "x-ms-date:<date>" header and resource path.
        """
        string_to_hash = "\n".join(
            [method, str(content_length), content_type, "x-ms-date:" + date, resource]
        )
        decoded_key = base64.b64decode(self.log_analytics_shared_key)
        digest = hmac.new(
            decoded_key, string_to_hash.encode("utf-8"), digestmod=hashlib.sha256
        ).digest()
        encoded_hash = base64.b64encode(digest).decode()
        return "SharedKey {}:{}".format(self.log_analytics_customer_id, encoded_hash)

    def send(self, log_type, data):
        """Upload ``data`` (a list of JSON-serializable rows) to the table ``log_type``.

        Raises:
            requests.HTTPError: if the ingestion endpoint answers with an error status.
        """
        # json.dumps escapes non-ASCII characters by default, so the character
        # count of `body` equals the byte count that is actually sent and signed
        body = json.dumps(data)
        method = "POST"
        content_type = "application/json"
        resource = "/api/logs"
        # Timezone-aware replacement for the deprecated datetime.utcnow(); same output
        rfc1123date = datetime.datetime.now(datetime.timezone.utc).strftime(
            "%a, %d %b %Y %H:%M:%S GMT"
        )
        signature = self._build_signature(
            rfc1123date, len(body), method, content_type, resource
        )

        headers = {
            "content-type": content_type,
            "Authorization": signature,
            "Log-Type": log_type,
            "x-ms-date": rfc1123date,
        }

        response = requests.post(
            "https://"
            + self.log_analytics_customer_id
            + ".ods.opinsights.azure.com"
            + resource
            + "?api-version=2016-04-01",
            data=body,
            headers=headers,
            timeout=self._REQUEST_TIMEOUT_S,
        )
        response.raise_for_status()
        _logger.info(
            f"Sent {len(data)} telemetry row(s) to table {log_type} in Log Analytics workspace {self.log_analytics_resource_id}"
        )
        _logger.debug(f"Telemetry rows: {data}")


def _send_health(telemetry, type_, status=None, details=None):
    """Send one health row to the "AmlSecurityComputeHealth" table.

    Args:
        telemetry: Sink exposing ``send(log_type, rows)``.
        type_: Kind of event; callers in this file pass "Heartbeat", "ScanMalware"
            and "ScanVulnerabilities".
        status: "Started", "Succeeded" or "Failed"; None is sent as an empty string.
        details: Optional JSON-serializable payload, sent as a JSON string
            (empty string when None).
    """
    row = {
        "WorkspaceId": _azure_ml_resource_id,
        "Computer": _computer,
        "Type": type_,
        "Status": "" if status is None else status,
        "Details": "" if details is None else json.dumps(details),
    }
    telemetry.send("AmlSecurityComputeHealth", [row])


def _send_assessment(telemetry, type_, findings, details=None):
    """Send one assessment row to the "AmlSecurityComputeAssessments" table.

    Args:
        telemetry: Sink exposing ``send(log_type, rows)``.
        type_: Kind of assessment; callers in this file pass "Malware",
            "OsVulnerabilities" and "PythonVulnerabilities".
        findings: Number of findings; zero is reported as "Healthy", anything else
            as "Unhealthy".
        details: Optional JSON-serializable payload, sent as a JSON string
            (empty string when None).
    """
    if findings == 0:
        verdict = "Healthy"
    else:
        verdict = "Unhealthy"

    serialized_details = "" if details is None else json.dumps(details)

    row = {
        "WorkspaceId": _azure_ml_resource_id,
        "Computer": _computer,
        "Type": type_,
        "Status": verdict,
        "Findings": findings,
        "Details": serialized_details,
    }
    telemetry.send("AmlSecurityComputeAssessments", [row])


def _get_log_analytics_from_diagnostic_settings():
    """Return the Log Analytics workspace referenced by the Azure ML workspace's diagnostic settings.

    Returns:
        The "workspaceId" property of the first diagnostic setting that has one,
        or None when no diagnostic setting targets a Log Analytics workspace.

    Raises:
        requests.HTTPError: if ARM answers with an error status.
        requests.Timeout: if ARM does not answer in time.
    """
    # Get an AAD access token for ARM
    access_token = _get_access_token("https://management.azure.com")
    headers = {
        "Authorization": "Bearer " + access_token
    }  # [SuppressMessage("Microsoft.Security", "CS001:SecretInline", Justification="No secret")]

    # List diagnostic settings on the Azure ML workspace
    # (the timeout prevents an unresponsive endpoint from blocking installation forever)
    response = requests.get(
        "https://management.azure.com"
        + _azure_ml_resource_id
        + "/providers/microsoft.insights/diagnosticSettings?api-version=2021-05-01-preview",
        headers=headers,
        timeout=60,
    )
    response.raise_for_status()

    # Select the first Log Analytics workspace
    for settings in response.json()["value"]:
        properties = settings["properties"]
        if "workspaceId" in properties:
            return properties["workspaceId"]
    return None


def _install(log_analytics_resource_id):
    """Install the scanner: write its configuration, install Trivy and schedule scans via cron.

    The Log Analytics workspace ARM Resource ID is resolved in this order:
    command-line parameter, local config file, Azure ML diagnostic settings.

    Args:
        log_analytics_resource_id: Resource ID passed on the command line, or None.

    Raises:
        Exception: if not run as root.
        ValueError: if no valid-looking Log Analytics resource ID can be resolved.
        subprocess.CalledProcessError: if one of the installation commands fails.
    """
    if os.geteuid() != 0:
        raise Exception(
            "Installation must be performed by the root user. Please run again using sudo."
        )

    _logger.debug(f"Creating folder {_config_folder_path}")
    os.makedirs(_config_folder_path, exist_ok=True)
    shutil.chown(_config_folder_path, "azureuser", "azureuser")

    config = {"logAnalyticsResourceId": None}

    # Load config file if present
    if os.path.exists(_local_config_path):
        _logger.debug(f"Loading configuration from {_local_config_path}")
        with open(_local_config_path, "rt") as file:
            config.update(json.load(file))
        _logger.debug(
            f"logAnalyticsResourceId after loading config file: {config['logAnalyticsResourceId']}"
        )

    # Set Log Analytics workspace ARM Resource ID if passed via command-line parameter
    if log_analytics_resource_id is not None:
        config["logAnalyticsResourceId"] = log_analytics_resource_id
        _logger.debug(
            f"logAnalyticsResourceId after setting command-line parameter: {config['logAnalyticsResourceId']}"
        )

    # Retrieve Log Analytics workspace ARM Resource ID from Azure ML diagnostic settings if
    # provided neither via local config file nor command-line parameter
    if config.get("logAnalyticsResourceId", None) is None:
        config["logAnalyticsResourceId"] = _get_log_analytics_from_diagnostic_settings()
        _logger.debug(
            f"logAnalyticsResourceId after querying Azure ML diagnostic settings: {config['logAnalyticsResourceId']}"
        )

    # Sanitize the Log Analytics workspace ARM Resource ID
    config["logAnalyticsResourceId"] = _sanitize_log_analytics_resource_id(
        config["logAnalyticsResourceId"]
    )

    _logger.debug(f"Configuration: {config}")

    _logger.info(f"Writing configuration file {_global_config_path}")
    with open(_global_config_path, "wt") as file:
        json.dump(config, file, indent=2)
    shutil.chown(_global_config_path, "azureuser", "azureuser")

    _logger.info("Installing Trivy")
    _run(
        "apt-get install -y --no-install-recommends --quiet wget apt-transport-https gnupg lsb-release"
    )
    _run(
        "wget -qO - https://aquasecurity.github.io/trivy-repo/deb/public.key | apt-key add -"
    )
    # Overwrite (no 'tee -a') so that re-running the installation does not
    # append a duplicate entry to the apt source list
    _run(
        "echo deb https://aquasecurity.github.io/trivy-repo/deb $(lsb_release -sc) main | tee /etc/apt/sources.list.d/trivy.list"
    )
    _run("apt-get update")
    _run("apt-get install -y --no-install-recommends --quiet trivy")

    # Wrapper script run by cron: forwards output to syslog, throttles CPU usage,
    # then invokes this Python script. "$@" forwards every argument unmodified
    # (a fixed $1..$5 list would silently drop any argument after the fifth).
    script_path = _config_folder_path + "/run.sh"
    _logger.info(f"Writing script file {script_path}")
    with open(script_path, "wt") as file:
        file.write(
            f"""#!/bin/bash
set -e
exec 1> >(logger -s -t AMLSECSCAN) 2>&1

# Limit CPU usage to 20% and reduce priority (note: the configuration is not persisted during reboot)
if [ ! -d /sys/fs/cgroup/cpu/amlsecscan ]
then
    mkdir -p /sys/fs/cgroup/cpu/amlsecscan
    echo 100000 | tee /sys/fs/cgroup/cpu/amlsecscan/cpu.cfs_period_us > /dev/null
    echo 20000 | tee /sys/fs/cgroup/cpu/amlsecscan/cpu.cfs_quota_us > /dev/null
    echo 5 | tee /sys/fs/cgroup/cpu/amlsecscan/cpu.shares > /dev/null
fi
echo $$ | tee /sys/fs/cgroup/cpu/amlsecscan/tasks > /dev/null

nice -n 19 python3 {os.path.abspath(__file__)} "$@"
"""
        )
    os.chmod(script_path, 0o0755)

    # Schedule: heartbeat every 10 minutes, full scan daily at 05:37 and 10 minutes after each reboot
    _logger.info("Writing crontab file /etc/cron.d/amlsecscan")
    with open("/etc/cron.d/amlsecscan", "wt") as file:
        file.write(
            f"""*/10 * * * * root {script_path} heartbeat
37 5 * * * root {script_path} scan all
@reboot root sleep 600 && {script_path} scan all
"""
        )
    os.chmod("/etc/cron.d/amlsecscan", 0o0644)


def _uninstall():
    """Remove the cron schedule and the scanner's configuration folder.

    Raises:
        Exception: if not run as root.
        subprocess.CalledProcessError: if deleting the crontab file fails.
    """
    running_as_root = os.geteuid() == 0
    if not running_as_root:
        raise Exception(
            "Uninstallation must be performed by the root user. Please run again using sudo."
        )

    crontab_path = "/etc/cron.d/amlsecscan"
    _logger.info(f"Deleting crontab file {crontab_path}")
    _run(f"rm -f {crontab_path}")

    # Best effort: errors while deleting the folder are ignored
    _logger.info(f"Deleting folder {_config_folder_path}")
    shutil.rmtree(_config_folder_path, ignore_errors=True)


def _sanitize_log_analytics_resource_id(log_analytics_resource_id):
    """Return the resource ID stripped of surrounding whitespace after a basic shape check.

    Raises:
        ValueError: if the ID is None, or if it does not consist of exactly nine
            "/"-separated segments (the leading empty segment included).
    """
    if log_analytics_resource_id is None:
        raise ValueError(
            "Log Analytics Workspace ARM Resource ID missing. Please provide it either via config file, command-line parameter, or Azure ML diagnostic settings."
        )

    trimmed = log_analytics_resource_id.strip()

    # Eight separators are equivalent to nine segments
    if trimmed.count("/") == 8:
        return trimmed

    expected_shape = "/subscriptions/{subscription}/resourceGroups/{resource_group}/providers/Microsoft.OperationalInsights/workspaces/{workspace}"
    raise ValueError(
        "Log Analytics Workspace ARM Resource ID format should be %s instead of '%s'"
        % (expected_shape, trimmed)
    )


def _get_auth_environment_variables(path="/etc/environment.sso"):
    """Parse the NAME=VALUE lines of ``path`` into a dict.

    Each line is split on its first "=" only, so values that themselves contain
    "=" (for example base64 padding in a secret) are kept intact. Lines without
    "=" (such as blank lines) are skipped. When a name appears more than once,
    the last occurrence wins.

    Args:
        path: File to read; defaults to the file holding the MSI variables.

    Returns:
        Dict mapping variable names to their (unmodified) string values.

    Raises:
        OSError: if the file cannot be read.
    """
    with open(path, "rt") as file:
        lines = file.read().splitlines()

    variables = {}
    for line in lines:
        name, separator, value = line.partition("=")
        if not separator:
            continue
        variables[name] = value
    return variables


def _parse_clamav_stdout(stdout):
    """Extract the infected files and summary figures from clamscan's output.

    Args:
        stdout: Text printed by clamscan.

    Returns:
        Tuple ``(findings, details)``: ``findings`` is the number from the
        "Infected files" line (0 if absent); ``details`` holds whichever of
        "knownViruses", "engineVersion", "scannedFiles" and "scannedDirectories"
        were present, plus "files" (path and malware type per "... FOUND" line)
        when at least one file was reported.

    Raises:
        Exception: if the "Infected files" count differs from the number of
            "... FOUND" lines.
    """
    # (pattern, key in details, conversion applied to the captured text)
    summary_fields = (
        (r"Known viruses:\s*(\d+)", "knownViruses", int),
        (r"Engine version:\s*(.+)", "engineVersion", str),
        (r"Scanned files:\s*(\d+)", "scannedFiles", int),
        (r"Scanned directories:\s*(\d+)", "scannedDirectories", int),
    )

    infected = []
    details = {}
    findings = 0

    for line in stdout.splitlines():
        # "<path>: <malware type> FOUND" lines take precedence over summary lines
        hit = re.match(r"^(.+?):\s*(.+?)\s+FOUND", line)
        if hit:
            path, malware_type = hit.groups()
            infected.append({"path": path, "malwareType": malware_type})
            continue

        hit = re.match(r"Infected files:\s*(\d+)", line)
        if hit:
            findings = int(hit.group(1))
            continue

        for pattern, key, convert in summary_fields:
            hit = re.match(pattern, line)
            if hit:
                details[key] = convert(hit.group(1))
                break

    reported = len(infected)
    if findings != reported:
        raise Exception(
            f"Failed to parse ClamAV stdout (findings: {findings}, files: {reported})"
        )

    if reported > 0:
        details["files"] = infected

    return (findings, details)


def _trivy_finding(vulnerability, target=None):
    """Convert one Trivy vulnerability entry into a telemetry finding (with "file" when ``target`` is given)."""
    finding = {
        # Trivy entries do not always carry a title; fall back to "<package> <CVE>"
        "title": vulnerability.get(
            "Title",
            vulnerability["PkgName"] + " " + vulnerability["VulnerabilityID"],
        ),
        "packageName": vulnerability["PkgName"],
        "packageVersion": vulnerability["InstalledVersion"],
    }
    if target is not None:
        finding["file"] = target
    finding["CVE"] = vulnerability["VulnerabilityID"]
    finding["severity"] = vulnerability["Severity"]
    return finding


def _parse_trivy_results(trivy_scan_path):
    """Read a Trivy JSON report and split its vulnerabilities into OS and Python findings.

    Results of class "os-pkgs" feed the OS list; results of class "lang-pkgs" and
    type "pip" feed the Python list (each finding also records the scanned file).
    Any other result is skipped with a warning. A report without "Results", or a
    result without "Vulnerabilities", contributes no findings.

    Args:
        trivy_scan_path: Path of the JSON report written by Trivy.

    Returns:
        Tuple ``(findings_os, findings_python)`` of lists of finding dicts.
    """
    with open(trivy_scan_path, "rt") as file:
        data = json.load(file)

    findings_os = []
    findings_python = []

    # "or []" tolerates a report in which "Results" is absent or null
    for result in data.get("Results") or []:
        result_class = result.get("Class")
        result_type = result.get("Type")
        vulnerabilities = result.get("Vulnerabilities") or []

        if result_class == "os-pkgs":
            findings_os.extend(_trivy_finding(v) for v in vulnerabilities)
        elif result_class == "lang-pkgs" and result_type == "pip":
            findings_python.extend(
                _trivy_finding(v, target=result["Target"]) for v in vulnerabilities
            )
        else:
            # .get(): the warning itself must not fail on a result lacking a field
            _logger.warning(
                f"Skipping unhandled vulnerability of class {result_class} and type {result_type} for file {result.get('Target')}. "
            )

    return (findings_os, findings_python)


# Limit the finding list to top 50 by severity so that the Log Analytics limit of 32K string length is not hit (which truncates JSON strings and makes them invalid)
def _filter_trivy_results(findings):
    """Return at most 50 findings, CRITICAL first, then HIGH, then everything else.

    The sort is stable, so findings of equal severity keep their input order.
    """
    severity_rank = {"CRITICAL": 0, "HIGH": 1}

    def rank(finding):
        # Any severity other than CRITICAL/HIGH sorts last
        return severity_rank.get(finding["severity"], 2)

    ordered = sorted(findings, key=rank)
    return ordered[:50]


def _scan_vulnerabilities(telemetry):
    """Run the Trivy vulnerability scan and report the outcome through ``telemetry``.

    A pip freeze of every conda environment under /anaconda/envs is first saved
    below the configuration folder, then Trivy scans the whole filesystem. One
    assessment is sent for OS findings and one for Python findings, framed by
    "Started" and "Succeeded"/"Failed" health events.

    Returns:
        True if the scan and its reporting completed, False if any step raised.
    """
    started_at = time.time()
    _send_health(telemetry, "ScanVulnerabilities", "Started")

    try:
        # Start from a clean folder so that deleted environments are not scanned again
        freeze_root = f"{_config_folder_path}/anaconda"
        shutil.rmtree(freeze_root, ignore_errors=True)

        for entry in os.scandir("/anaconda/envs"):
            if not entry.is_dir():
                continue
            env_name = entry.name
            env_folder = f"{freeze_root}/{env_name}"
            requirements_path = f"{env_folder}/requirements.txt"
            _logger.info(
                f"Saving pip freeze of conda environment {env_name} to {requirements_path}"
            )
            os.makedirs(env_folder, exist_ok=True)
            _run(
                f"/anaconda/envs/{env_name}/bin/python3 -m pip freeze > {requirements_path}"
            )

        report_path = f"{_config_folder_path}/trivy.json"
        _logger.info("Running Trivy scan")
        _run(
            f"/usr/local/bin/trivy filesystem --format json --output {report_path} --security-checks vuln --severity HIGH,CRITICAL --ignore-unfixed /"
        )

        findings_os, findings_python = _parse_trivy_results(report_path)

        # The count covers all findings; the details carry only the filtered top entries
        assessments = (
            ("OsVulnerabilities", findings_os),
            ("PythonVulnerabilities", findings_python),
        )
        for assessment_type, findings in assessments:
            if len(findings) > 0:
                details = {"findings": _filter_trivy_results(findings)}
            else:
                details = None
            _send_assessment(telemetry, assessment_type, len(findings), details)

        _send_health(
            telemetry,
            "ScanVulnerabilities",
            "Succeeded",
            {"elapsedTimeInS": time.time() - started_at},
        )
        return True

    except subprocess.CalledProcessError as error:
        # A failed command: forward its captured output along with the error
        failure = {
            "error": str(error),
            "stdout": error.stdout,
            "stderr": error.stderr,
            "elapsedTimeInS": time.time() - started_at,
        }
        _send_health(telemetry, "ScanVulnerabilities", "Failed", failure)
        return False
    except Exception as error:
        _logger.exception(f"Error: {error}")
        failure = {"error": str(error), "elapsedTimeInS": time.time() - started_at}
        _send_health(telemetry, "ScanVulnerabilities", "Failed", failure)
        return False


def _scan_malware(telemetry):
    """Run a ClamAV scan and report the outcome through ``telemetry``.

    A "Malware" assessment is sent with the number of infected files and the
    parsed scan summary, framed by "Started" and "Succeeded"/"Failed" health events.

    Returns:
        True if the scan and its reporting completed, False if any step raised.
    """
    start_time = time.time()
    _send_health(telemetry, "ScanMalware", "Started")

    try:

        # Run ClamAV (with AzSecPack malware definitions if present)
        database_option = (
            "-d /var/lib/azsec-clamav"
            if os.path.exists("/var/lib/azsec-clamav")
            else ""
        )
        # The command must be a plain string: a trailing comma here would turn it
        # into a one-element tuple, which would be logged as a tuple repr
        command = f"clamscan {database_option} -r -i --exclude-dir=^/sys/ /bin /boot /home /lib /lib64 /opt /root /sbin /anaconda"
        _logger.info(f"Running: {command}")
        # check=False: exit code 1 means "malware found", which is not a failure
        out = _run(command, check=False)

        # returncode:
        # == 0 -> clamscan completed scan without finding malware
        # == 1 -> clamscan completed scan with malware found
        # >= 2 -> clamscan failed to scan
        if out.returncode >= 2:
            raise Exception(f"Scan failed with exit code {out.returncode}")

        findings, details = _parse_clamav_stdout(out.stdout)

        # Exit code 1 with no parsed finding means the output was not understood
        if findings == 0 and out.returncode != 0:
            raise Exception(
                f"Failed to parse ClamAV stdout (findings: {findings}, exit code: {out.returncode})"
            )

        _send_assessment(telemetry, "Malware", findings, details)
        _send_health(
            telemetry,
            "ScanMalware",
            "Succeeded",
            {"elapsedTimeInS": time.time() - start_time},
        )
        return True

    except subprocess.CalledProcessError as e:
        # Defensive: _run is called with check=False above, so this handler is
        # not expected to trigger for the clamscan command itself
        _send_health(
            telemetry,
            "ScanMalware",
            "Failed",
            {
                "error": str(e),
                "stdout": e.stdout,
                "stderr": e.stderr,
                "elapsedTimeInS": time.time() - start_time,
            },
        )
        return False
    except Exception as e:
        _logger.exception(e)
        _send_health(
            telemetry,
            "ScanMalware",
            "Failed",
            {"error": str(e), "elapsedTimeInS": time.time() - start_time},
        )
        return False


def _add_common_arguments(parser):
    """Register on ``parser`` the options shared by the sub-commands.

    Adds -la/--log-analytics-resource-id, -ll/--log-level and -o/--output; all
    three are optional and parse to None when omitted.
    """
    shared_options = (
        (
            ("-la", "--log-analytics-resource-id"),
            {
                "help": "ARM Resource ID of the Log Analytics workspace to log telemetry to",
                "dest": "log_analytics_resource_id",
            },
        ),
        (
            ("-ll", "--log-level"),
            {
                "help": "level of log messages to display (default: INFO)",
                "dest": "log_level",
                "choices": ["CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"],
            },
        ),
        (
            ("-o", "--output"),
            {
                "help": "output (default: log-analytics)",
                "dest": "output",
                "choices": ["log-analytics", "stdout"],
            },
        ),
    )
    for flags, options in shared_options:
        parser.add_argument(*flags, **options)


if __name__ == "__main__":
    # Command-line entry point.
    # Exit codes: 0 = success, 1 = missing (sub-)command, 2 = at least one scan failed.
    # Any other error is logged as critical and re-raised.

    # Logging to stdout (forwarded to syslog in run.sh)
    _logger.setLevel(logging.INFO)
    _logger.addHandler(logging.StreamHandler(stream=sys.stdout))

    try:
        # Command-line parser
        parser = argparse.ArgumentParser(
            description="Azure ML Compute Security Scanner"
        )
        subparsers = parser.add_subparsers(dest="command")

        # Command: "install"
        parser_install = subparsers.add_parser(
            "install",
            help="Install dependencies and start scheduled scans. Must be run as root (use sudo).",
        )
        _add_common_arguments(parser_install)

        # Command: "uninstall"
        parser_uninstall = subparsers.add_parser(
            "uninstall", help="Remove scheduled scans. Must be run as root (use sudo)."
        )
        _add_common_arguments(parser_uninstall)

        # Command: "heartbeat"
        parser_heartbeat = subparsers.add_parser(
            "heartbeat", help="Emit a telemetry heartbeat"
        )
        _add_common_arguments(parser_heartbeat)

        # Command: "scan"
        parser_scan = subparsers.add_parser("scan", help="Run security scans")
        subparsers_scan = parser_scan.add_subparsers(dest="scan_type")

        # Command: "scan all"
        parser_scan_all = subparsers_scan.add_parser(
            "all", help="Run all security scans"
        )
        _add_common_arguments(parser_scan_all)

        # Command: "scan malware"
        parser_scan_malware = subparsers_scan.add_parser(
            "malware", help="Scan for malware"
        )
        _add_common_arguments(parser_scan_malware)

        # Command: "scan vulnerabilities"
        parser_scan_vulnerabilities = subparsers_scan.add_parser(
            "vulnerabilities", help="Scan for OS and Python vulnerabilities"
        )
        _add_common_arguments(parser_scan_vulnerabilities)

        args = parser.parse_args()

        # sys.exit() is used throughout: the bare exit() helper is injected by the
        # site module and is not guaranteed to exist. SystemExit is not an
        # Exception subclass, so it is not caught by the handler below.
        if args.command is None:
            parser.print_help()
            sys.exit(1)

        # "scan" without a scan type has no common arguments, hence the membership test
        if "log_level" in args and args.log_level is not None:
            _logger.setLevel(getattr(logging, args.log_level))

        if args.command == "install":
            _install(args.log_analytics_resource_id)
        elif args.command == "uninstall":
            _uninstall()
        elif args.command == "heartbeat":
            telemetry = (
                StdOutTelemetry()
                if args.output == "stdout"
                else LogAnalyticsTelemetry(args.log_analytics_resource_id)
            )
            _send_health(telemetry, "Heartbeat")
        elif args.command == "scan":
            if args.scan_type is None:
                # Show the help of the "scan" command, which lists the scan types
                parser_scan.print_help()
                sys.exit(1)
            telemetry = (
                StdOutTelemetry()
                if args.output == "stdout"
                else LogAnalyticsTelemetry(args.log_analytics_resource_id)
            )
            if args.scan_type == "all":
                # Both scans always run, even if the first one fails
                success0 = _scan_vulnerabilities(telemetry)
                success1 = _scan_malware(telemetry)
                sys.exit(0 if success0 and success1 else 2)
            elif args.scan_type == "vulnerabilities":
                success = _scan_vulnerabilities(telemetry)
                sys.exit(0 if success else 2)
            elif args.scan_type == "malware":
                success = _scan_malware(telemetry)
                sys.exit(0 if success else 2)
            else:
                raise ValueError(f"Unsupported scan type '{args.scan_type}'")
        else:
            raise ValueError(f"Unsupported command '{args.command}'")
    except Exception as e:
        _logger.critical(f"Unhandled exception: {e}")
        raise