"""EMR (Elastic Map Reduce) module."""

from __future__ import annotations

import logging
import pprint
import re
from typing import Any, Literal, cast

import boto3

from awswrangler import _utils, exceptions, sts

_logger: logging.Logger = logging.getLogger(__name__)


_ActionOnFailureLiteral = Literal["TERMINATE_JOB_FLOW", "TERMINATE_CLUSTER", "CANCEL_AND_WAIT", "CONTINUE"]


def _get_ecr_credentials_refresh_content(region: str) -> str:
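    # PySpark script staged to S3 by submit_ecr_credentials_refresh() and run as an EMR step:
    # it refreshes the cluster's ECR login and copies the Docker client config into HDFS
    # so that later Docker-based steps can pull images from ECR.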
    return f"""
import subprocess
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("ECR Setup Job").getOrCreate()

COMMANDS = [
    "sudo -s eval $(aws ecr get-login --region {region} --no-include-email)",
    "sudo hdfs dfs -put -f /root/.docker/config.json /user/hadoop/"
]

for command in COMMANDS:
    subprocess.run(command.split(" "), timeout=6.0, check=True)

print("done!")
    """


def _get_default_logging_path(
    subnet_id: str | None = None,
    account_id: str | None = None,
    region: str | None = None,
    boto3_session: boto3.Session | None = None,
) -> str:
    """Get EMR default logging path.

    E.g. "s3://aws-logs-{account_id}-{region}/elasticmapreduce/"

    Parameters
    ----------
    subnet_id : str, optional
        Subnet ID. If not provided, you must pass `region` explicitly.
    account_id: str, optional
        Account ID.
    region: str, optional
        Region e.g. 'us-east-1'
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    str
        Default logging path.
        E.g. "s3://aws-logs-{account_id}-{region}/elasticmapreduce/"

    Examples
    --------
    >>> import awswrangler as wr
    >>> path = wr.emr._get_default_logging_path("subnet-id")
    's3://aws-logs-{account_id}-{region}/elasticmapreduce/'

    """
    if account_id is None:
        _account_id: str = sts.get_account_id(boto3_session=boto3_session)
    else:
        _account_id = account_id
    if (region is None) and (subnet_id is not None):
        _region: str = _utils.get_region_from_session(boto3_session=boto3_session)
    elif (region is None) and (subnet_id is None):
        raise exceptions.InvalidArgumentCombination("You must pass region or subnet_id or both.")
    else:
        _region = cast(str, region)
    return f"s3://aws-logs-{_account_id}-{_region}/elasticmapreduce/"


def _get_emr_classification_lib(emr_version: str) -> str:
    """Parse emr release string.

    Parse emr release string and return its corresponding Classification
    configuration string. i.e. log4j or log4j2.

    Parameters
    ----------
        emr_version: emr release string

    Returns
    -------
        A string mentioning the appropriate classification lib based on the emr release.
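
    Examples
    --------
    Releases after "emr-6.7.0" resolve to the log4j2 classification:

    >>> _get_emr_classification_lib("emr-6.9.0")
    'spark-log4j2'
    >>> _get_emr_classification_lib("emr-5.28.0")
    'spark-log4j'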
    """
    match = re.search(r"(\d+)\.(\d+)\.(\d+)", emr_version)
    # Fall back to 6.7.0 (i.e. "spark-log4j") when the release string cannot be parsed.
    version = tuple(int(x) for x in match.groups()) if match else (6, 7, 0)

    return "spark-log4j2" if version > (6, 7, 0) else "spark-log4j"


def _build_cluster_args(**pars: Any) -> dict[str, Any]:  # noqa: PLR0912,PLR0915
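    """Translate ``create_cluster`` keyword arguments into an EMR ``run_job_flow`` request dictionary."""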
    account_id: str = sts.get_account_id(boto3_session=pars["boto3_session"])
    region: str = _utils.get_region_from_session(boto3_session=pars["boto3_session"])

    # S3 Logging path
    if pars.get("logging_s3_path") is None:
        pars["logging_s3_path"] = _get_default_logging_path(
            subnet_id=None, account_id=account_id, region=region, boto3_session=pars["boto3_session"]
        )

    spark_env: dict[str, str] | None = None
    yarn_env: dict[str, str] | None = None
    livy_env: dict[str, str] | None = None

    if pars["spark_pyarrow"] is True:
        if pars["spark_defaults"] is None:
            pars["spark_defaults"] = {"spark.sql.execution.arrow.enabled": "true"}
        else:
            pars["spark_defaults"]["spark.sql.execution.arrow.enabled"] = "true"
        spark_env = {"ARROW_PRE_0_15_IPC_FORMAT": "1"}
        yarn_env = {"ARROW_PRE_0_15_IPC_FORMAT": "1"}
        livy_env = {"ARROW_PRE_0_15_IPC_FORMAT": "1"}

    if pars["python3"] is True:
        if spark_env is None:
            spark_env = {"PYSPARK_PYTHON": "/usr/bin/python3"}
        else:
            spark_env["PYSPARK_PYTHON"] = "/usr/bin/python3"

    if pars["spark_jars_path"] is not None:
        paths: str = ",".join(pars["spark_jars_path"])
        if pars["spark_defaults"] is None:
            pars["spark_defaults"] = {"spark.jars": paths}
        else:
            pars["spark_defaults"]["spark.jars"] = paths

    args: dict[str, Any] = {
        "Name": pars["cluster_name"],
        "LogUri": pars["logging_s3_path"],
        "ReleaseLabel": pars["emr_release"],
        "VisibleToAllUsers": pars["visible_to_all_users"],
        "JobFlowRole": pars["emr_ec2_role"],
        "ServiceRole": pars["emr_role"],
        "Instances": {
            "KeepJobFlowAliveWhenNoSteps": pars["keep_cluster_alive_when_no_steps"],
            "TerminationProtected": pars["termination_protected"],
            "Ec2SubnetId": pars["subnet_id"],
            "InstanceFleets": [],
        },
        "StepConcurrencyLevel": pars["step_concurrency_level"],
    }

    # Auto Termination Policy
    if pars["auto_termination_policy"] is not None:
        args["AutoTerminationPolicy"] = pars["auto_termination_policy"]

    # Custom AMI
    if pars["custom_ami_id"] is not None:
        args["CustomAmiId"] = pars["custom_ami_id"]

    # EC2 Key Pair
    if pars["key_pair_name"] is not None:
        args["Instances"]["Ec2KeyName"] = pars["key_pair_name"]

    # Security groups
    if pars["security_group_master"] is not None:
        args["Instances"]["EmrManagedMasterSecurityGroup"] = pars["security_group_master"]
    if pars["security_groups_master_additional"] is not None:
        args["Instances"]["AdditionalMasterSecurityGroups"] = pars["security_groups_master_additional"]
    if pars["security_group_slave"] is not None:
        args["Instances"]["EmrManagedSlaveSecurityGroup"] = pars["security_group_slave"]
    if pars["security_groups_slave_additional"] is not None:
        args["Instances"]["AdditionalSlaveSecurityGroups"] = pars["security_groups_slave_additional"]
    if pars["security_group_service_access"] is not None:
        args["Instances"]["ServiceAccessSecurityGroup"] = pars["security_group_service_access"]

    # Configurations
    args["Configurations"] = (
        [
            {
                "Classification": _get_emr_classification_lib(pars["emr_release"]),
                "Properties": {"log4j.rootCategory": f"{pars['spark_log_level']}, console"},
            }
        ]
        if not pars["configurations"]
        else pars["configurations"]
    )
    if pars["docker"] is True:
        if pars.get("extra_public_registries") is None:
            extra_public_registries: list[str] = []
        else:
            extra_public_registries = pars["extra_public_registries"]
        registries: str = (
            f"local,centos,{account_id}.dkr.ecr.{region}.amazonaws.com,{','.join(extra_public_registries)}"
        )
        registries = registries[:-1] if registries.endswith(",") else registries
        args["Configurations"].append(
            {
                "Classification": "container-executor",
                "Properties": {},
                "Configurations": [
                    {
                        "Classification": "docker",
                        "Properties": {
                            "docker.privileged-containers.registries": registries,
                            "docker.trusted.registries": registries,
                        },
                        "Configurations": [],
                    }
                ],
            }
        )

    if spark_env is not None:
        args["Configurations"].append(
            {
                "Classification": "spark-env",
                "Properties": {},
                "Configurations": [{"Classification": "export", "Properties": spark_env, "Configurations": []}],
            }
        )
    if yarn_env is not None:
        args["Configurations"].append(
            {
                "Classification": "yarn-env",
                "Properties": {},
                "Configurations": [{"Classification": "export", "Properties": yarn_env, "Configurations": []}],
            }
        )
    if livy_env is not None:
        args["Configurations"].append(
            {
                "Classification": "livy-env",
                "Properties": {},
                "Configurations": [{"Classification": "export", "Properties": livy_env, "Configurations": []}],
            }
        )
    if pars["spark_glue_catalog"] is True:
        args["Configurations"].append(
            {
                "Classification": "spark-hive-site",
                "Properties": {
                    "hive.metastore.client.factory.class": "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory"
                },
                "Configurations": [],
            }
        )
    if pars["hive_glue_catalog"] is True:
        hive_conf: dict[str, Any] = {"Classification": "hive-site", "Properties": {}, "Configurations": []}
        hive_conf["Properties"]["hive.metastore.client.factory.class"] = (
            "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory"
        )
        args["Configurations"].append(hive_conf)
    if pars["presto_glue_catalog"] is True:
        args["Configurations"].append(
            {
                "Classification": "presto-connector-hive",
                "Properties": {"hive.metastore.glue.datacatalog.enabled": "true"},
                "Configurations": [],
            }
        )
    if pars["consistent_view"] is True:
        args["Configurations"].append(
            {
                "Classification": "emrfs-site",
                "Properties": {
                    "fs.s3.consistent.retryPeriodSeconds": str(pars.get("consistent_view_retry_seconds", "10")),
                    "fs.s3.consistent": "true",
                    "fs.s3.consistent.retryCount": str(pars.get("consistent_view_retry_count", "5")),
                    "fs.s3.consistent.metadata.tableName": pars.get("consistent_view_table_name", "EmrFSMetadata"),
                },
            }
        )
    if pars["maximize_resource_allocation"] is True:
        args["Configurations"].append({"Classification": "spark", "Properties": {"maximizeResourceAllocation": "true"}})
    if pars["spark_defaults"] is not None:
        spark_defaults: dict[str, str | dict[str, str]] = {
            "Classification": "spark-defaults",
            "Properties": pars["spark_defaults"],
        }
        args["Configurations"].append(spark_defaults)
    if pars.get("custom_classifications") is not None:
        for c in pars["custom_classifications"]:
            args["Configurations"].append(c)

    # Applications
    if pars["applications"]:
        args["Applications"] = [{"Name": x} for x in pars["applications"]]

    # Bootstraps
    if pars["bootstraps_paths"]:
        args["BootstrapActions"] = [{"Name": x, "ScriptBootstrapAction": {"Path": x}} for x in pars["bootstraps_paths"]]

    # Debugging and Steps
    if (pars["debugging"] is True) or (pars["steps"] is not None):
        args["Steps"] = []
        if pars["debugging"] is True:
            args["Steps"].append(
                {
                    "Name": "Setup Hadoop Debugging",
                    "ActionOnFailure": "TERMINATE_CLUSTER",
                    "HadoopJarStep": {"Jar": "command-runner.jar", "Args": ["state-pusher-script"]},
                }
            )
        if pars["steps"] is not None:
            args["Steps"] += pars["steps"]

    # Master Instance Fleet
    timeout_action_master: str = (
        "SWITCH_TO_ON_DEMAND" if pars["spot_timeout_to_on_demand_master"] else "TERMINATE_CLUSTER"
    )
    fleet_master: dict[str, Any] = {
        "Name": "MASTER",
        "InstanceFleetType": "MASTER",
        "TargetOnDemandCapacity": pars["instance_num_on_demand_master"],
        "TargetSpotCapacity": pars["instance_num_spot_master"],
        "InstanceTypeConfigs": [
            {
                "InstanceType": pars["instance_type_master"],
                "WeightedCapacity": 1,
                "BidPriceAsPercentageOfOnDemandPrice": pars["spot_bid_percentage_of_on_demand_master"],
                "EbsConfiguration": {
                    "EbsBlockDeviceConfigs": [
                        {
                            "VolumeSpecification": {"SizeInGB": pars["instance_ebs_size_master"], "VolumeType": "gp2"},
                            "VolumesPerInstance": 1,
                        }
                    ],
                    "EbsOptimized": True,
                },
            }
        ],
    }
    if pars["instance_num_spot_master"] > 0:
        fleet_master["LaunchSpecifications"] = {
            "SpotSpecification": {
                "TimeoutDurationMinutes": pars["spot_provisioning_timeout_master"],
                "TimeoutAction": timeout_action_master,
            }
        }
    args["Instances"]["InstanceFleets"].append(fleet_master)

    # Core Instance Fleet
    if (pars["instance_num_spot_core"] > 0) or pars["instance_num_on_demand_core"] > 0:
        timeout_action_core = "SWITCH_TO_ON_DEMAND" if pars["spot_timeout_to_on_demand_core"] else "TERMINATE_CLUSTER"
        fleet_core: dict[str, Any] = {
            "Name": "CORE",
            "InstanceFleetType": "CORE",
            "TargetOnDemandCapacity": pars["instance_num_on_demand_core"],
            "TargetSpotCapacity": pars["instance_num_spot_core"],
            "InstanceTypeConfigs": [
                {
                    "InstanceType": pars["instance_type_core"],
                    "WeightedCapacity": 1,
                    "BidPriceAsPercentageOfOnDemandPrice": pars["spot_bid_percentage_of_on_demand_core"],
                    "EbsConfiguration": {
                        "EbsBlockDeviceConfigs": [
                            {
                                "VolumeSpecification": {
                                    "SizeInGB": pars["instance_ebs_size_core"],
                                    "VolumeType": "gp2",
                                },
                                "VolumesPerInstance": 1,
                            }
                        ],
                        "EbsOptimized": True,
                    },
                }
            ],
        }
        if pars["instance_num_spot_core"] > 0:
            fleet_core["LaunchSpecifications"] = {
                "SpotSpecification": {
                    "TimeoutDurationMinutes": pars["spot_provisioning_timeout_core"],
                    "TimeoutAction": timeout_action_core,
                }
            }
        args["Instances"]["InstanceFleets"].append(fleet_core)

    # Task Instance Fleet
    if (pars["instance_num_spot_task"] > 0) or pars["instance_num_on_demand_task"] > 0:
        timeout_action_task: str = (
            "SWITCH_TO_ON_DEMAND" if pars["spot_timeout_to_on_demand_task"] else "TERMINATE_CLUSTER"
        )
        fleet_task: dict[str, Any] = {
            "Name": "TASK",
            "InstanceFleetType": "TASK",
            "TargetOnDemandCapacity": pars["instance_num_on_demand_task"],
            "TargetSpotCapacity": pars["instance_num_spot_task"],
            "InstanceTypeConfigs": [
                {
                    "InstanceType": pars["instance_type_task"],
                    "WeightedCapacity": 1,
                    "BidPriceAsPercentageOfOnDemandPrice": pars["spot_bid_percentage_of_on_demand_task"],
                    "EbsConfiguration": {
                        "EbsBlockDeviceConfigs": [
                            {
                                "VolumeSpecification": {
                                    "SizeInGB": pars["instance_ebs_size_task"],
                                    "VolumeType": "gp2",
                                },
                                "VolumesPerInstance": 1,
                            }
                        ],
                        "EbsOptimized": True,
                    },
                }
            ],
        }
        if pars["instance_num_spot_task"] > 0:
            fleet_task["LaunchSpecifications"] = {
                "SpotSpecification": {
                    "TimeoutDurationMinutes": pars["spot_provisioning_timeout_task"],
                    "TimeoutAction": timeout_action_task,
                }
            }
        args["Instances"]["InstanceFleets"].append(fleet_task)

    if pars["security_configuration"]:
        args["SecurityConfiguration"] = pars["security_configuration"]

    # Tags
    if pars["tags"] is not None:
        args["Tags"] = [{"Key": k, "Value": v} for k, v in pars["tags"].items()]

    _logger.debug("args: \n%s", pprint.pformat(args))
    return args


def create_cluster(  # noqa: PLR0913
    subnet_id: str,
    cluster_name: str = "my-emr-cluster",
    logging_s3_path: str | None = None,
    emr_release: str = "emr-6.7.0",
    emr_ec2_role: str = "EMR_EC2_DefaultRole",
    emr_role: str = "EMR_DefaultRole",
    instance_type_master: str = "r5.xlarge",
    instance_type_core: str = "r5.xlarge",
    instance_type_task: str = "r5.xlarge",
    instance_ebs_size_master: int = 64,
    instance_ebs_size_core: int = 64,
    instance_ebs_size_task: int = 64,
    instance_num_on_demand_master: int = 1,
    instance_num_on_demand_core: int = 0,
    instance_num_on_demand_task: int = 0,
    instance_num_spot_master: int = 0,
    instance_num_spot_core: int = 0,
    instance_num_spot_task: int = 0,
    spot_bid_percentage_of_on_demand_master: int = 100,
    spot_bid_percentage_of_on_demand_core: int = 100,
    spot_bid_percentage_of_on_demand_task: int = 100,
    spot_provisioning_timeout_master: int = 5,
    spot_provisioning_timeout_core: int = 5,
    spot_provisioning_timeout_task: int = 5,
    spot_timeout_to_on_demand_master: bool = True,
    spot_timeout_to_on_demand_core: bool = True,
    spot_timeout_to_on_demand_task: bool = True,
    python3: bool = True,
    spark_glue_catalog: bool = True,
    hive_glue_catalog: bool = True,
    presto_glue_catalog: bool = True,
    consistent_view: bool = False,
    consistent_view_retry_seconds: int = 10,
    consistent_view_retry_count: int = 5,
    consistent_view_table_name: str = "EmrFSMetadata",
    bootstraps_paths: list[str] | None = None,
    debugging: bool = True,
    applications: list[str] | None = None,
    visible_to_all_users: bool = True,
    key_pair_name: str | None = None,
    security_group_master: str | None = None,
    security_groups_master_additional: list[str] | None = None,
    security_group_slave: str | None = None,
    security_groups_slave_additional: list[str] | None = None,
    security_group_service_access: str | None = None,
    security_configuration: str | None = None,
    docker: bool = False,
    extra_public_registries: list[str] | None = None,
    spark_log_level: str = "WARN",
    spark_jars_path: list[str] | None = None,
    spark_defaults: dict[str, str] | None = None,
    spark_pyarrow: bool = False,
    custom_classifications: list[dict[str, Any]] | None = None,
    maximize_resource_allocation: bool = False,
    steps: list[dict[str, Any]] | None = None,
    custom_ami_id: str | None = None,
    step_concurrency_level: int = 1,
    keep_cluster_alive_when_no_steps: bool = True,
    termination_protected: bool = False,
    auto_termination_policy: dict[str, int] | None = None,
    tags: dict[str, str] | None = None,
    boto3_session: boto3.Session | None = None,
    configurations: list[dict[str, Any]] | None = None,
) -> str:
    """Create a EMR cluster with instance fleets configuration.

    https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-instance-fleet.html

    Parameters
    ----------
    subnet_id
        VPC subnet ID.
    cluster_name
        Cluster name.
    logging_s3_path
        Logging s3 path (e.g. s3://BUCKET_NAME/DIRECTORY_NAME/).
        If None, the default is `s3://aws-logs-{AccountId}-{RegionId}/elasticmapreduce/`
    emr_release
        EMR release (e.g. emr-5.28.0).
    emr_ec2_role
        IAM role name.
    emr_role
        IAM role name.
    instance_type_master
        EC2 instance type.
    instance_type_core
        EC2 instance type.
    instance_type_task
        EC2 instance type.
    instance_ebs_size_master
        Size of EBS in GB.
    instance_ebs_size_core
        Size of EBS in GB.
    instance_ebs_size_task
        Size of EBS in GB.
    instance_num_on_demand_master
        Number of on demand instances.
    instance_num_on_demand_core
        Number of on demand instances.
    instance_num_on_demand_task
        Number of on demand instances.
    instance_num_spot_master
        Number of spot instances.
    instance_num_spot_core
        Number of spot instances.
    instance_num_spot_task
        Number of spot instances.
    spot_bid_percentage_of_on_demand_master
        The bid price, as a percentage of On-Demand price.
    spot_bid_percentage_of_on_demand_core
        The bid price, as a percentage of On-Demand price.
    spot_bid_percentage_of_on_demand_task
        The bid price, as a percentage of On-Demand price.
    spot_provisioning_timeout_master
        The spot provisioning timeout period in minutes.
        If Spot instances are not provisioned within this time period,
        the TimeOutAction is taken. Minimum value is 5 and maximum value is 1440.
        The timeout applies only during initial provisioning,
        when the cluster is first created.
    spot_provisioning_timeout_core
        The spot provisioning timeout period in minutes.
        If Spot instances are not provisioned within this time period,
        the TimeOutAction is taken. Minimum value is 5 and maximum value is 1440.
        The timeout applies only during initial provisioning,
        when the cluster is first created.
    spot_provisioning_timeout_task
        The spot provisioning timeout period in minutes.
        If Spot instances are not provisioned within this time period,
        the TimeOutAction is taken. Minimum value is 5 and maximum value is 1440.
        The timeout applies only during initial provisioning,
        when the cluster is first created.
    spot_timeout_to_on_demand_master
        After a provisioning timeout, should the cluster switch to
        On-Demand instances or be terminated?
    spot_timeout_to_on_demand_core
        After a provisioning timeout, should the cluster switch to
        On-Demand instances or be terminated?
    spot_timeout_to_on_demand_task
        After a provisioning timeout, should the cluster switch to
        On-Demand instances or be terminated?
    python3
        Python 3 Enabled?
    spark_glue_catalog
        Spark integration with Glue Catalog?
    hive_glue_catalog
        Hive integration with Glue Catalog?
    presto_glue_catalog
        Presto integration with Glue Catalog?
    consistent_view
        Consistent view allows EMR clusters to check for
        list and read-after-write consistency for
        Amazon S3 objects written by or synced with EMRFS.
        https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-plan-consistent-view.html
    consistent_view_retry_seconds
        Delay between the tries (seconds).
    consistent_view_retry_count
        Number of tries.
    consistent_view_table_name
        Name of the DynamoDB table to store the consistent view data.
    bootstraps_paths
        Bootstrap action paths (e.g. ["s3://BUCKET_NAME/script.sh"]).
    debugging
        Debugging enabled?
    applications
        List of applications (e.g. ["Hadoop", "Spark", "Ganglia", "Hive"]).
        If None, ["Spark"] will be used.
    visible_to_all_users
        Indicates whether the cluster is visible to all IAM principals of the AWS account.
    key_pair_name
        Key pair name.
    security_group_master
        The identifier of the Amazon EC2 security group for the master node.
    security_groups_master_additional
        A list of additional Amazon EC2 security group IDs for the master node.
    security_group_slave
        The identifier of the Amazon EC2 security group for
        the core and task nodes.
    security_groups_slave_additional
        A list of additional Amazon EC2 security group IDs for
        the core and task nodes.
    security_group_service_access
        The identifier of the Amazon EC2 security group for the Amazon EMR
        service to access clusters in VPC private subnets.
    security_configuration
        The name of a security configuration to apply to the cluster.
    docker
        Enable Docker Hub and ECR registries access.
    extra_public_registries
        Additional docker registries.
    spark_log_level
        log4j.rootCategory log level (ALL, DEBUG, INFO, WARN, ERROR, FATAL, OFF, TRACE).
    spark_jars_path
        spark.jars e.g. [s3://.../foo.jar, s3://.../boo.jar]
        https://spark.apache.org/docs/latest/configuration.html
    spark_defaults
        https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-spark-configure.html#spark-defaults
    spark_pyarrow
        Enable PySpark to use PyArrow behind the scenes.
        Note: you must install pyarrow yourself via a bootstrap action.
    custom_classifications
        Extra classifications.
    maximize_resource_allocation
        Configure your executors to utilize the maximum resources possible
        https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-spark-configure.html#emr-spark-maximizeresourceallocation
    custom_ami_id
        The custom AMI ID to use for the provisioned instance group.
    steps
        Steps definitions (use ``wr.emr.build_step()`` to build them).
    step_concurrency_level
        The number of steps the cluster can execute concurrently (default 1).
    keep_cluster_alive_when_no_steps
        Specifies whether the cluster should
        remain available after completing all steps.
    termination_protected
        Specifies whether the Amazon EC2 instances in the cluster are
        protected from termination by API calls, user intervention,
        or in the event of a job-flow error.
    auto_termination_policy
        Specifies the auto-termination policy that is attached to an Amazon EMR cluster,
        e.g. auto_termination_policy = {'IdleTimeout': 123}.
        IdleTimeout specifies the amount of idle time in seconds after which the cluster automatically terminates.
        You can specify a minimum of 60 seconds and a maximum of 604800 seconds (seven days).
    tags
        Key/Value collection to put on the Cluster.
        e.g. {"foo": "boo", "bar": "xoo"}
    boto3_session
        The default boto3 session will be used if **boto3_session** is ``None``.
    configurations
        The list of configurations supplied for an EMR cluster instance group.

        By default, adds log4j config as follows:
        `{"Classification": "spark-log4j", "Properties": {"log4j.rootCategory": f"{pars['spark_log_level']}, console"}}`

    Returns
    -------
        Cluster ID.

    Examples
    --------
    Minimal Example

    >>> import awswrangler as wr
    >>> cluster_id = wr.emr.create_cluster("SUBNET_ID")

    Minimal Example With Custom Classification

    >>> import awswrangler as wr
    >>> cluster_id = wr.emr.create_cluster(
    ...     subnet_id="SUBNET_ID",
    ...     custom_classifications=[
    ...         {
    ...             "Classification": "livy-conf",
    ...             "Properties": {
    ...                 "livy.spark.master": "yarn",
    ...                 "livy.spark.deploy-mode": "cluster",
    ...                 "livy.server.session.timeout": "16h",
    ...             },
    ...         }
    ...     ],
    ... )
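
    Minimal Example With Auto Termination Policy (terminate the cluster after one hour of idle time)

    >>> import awswrangler as wr
    >>> cluster_id = wr.emr.create_cluster(
    ...     subnet_id="SUBNET_ID",
    ...     auto_termination_policy={"IdleTimeout": 3600},
    ... )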

    Full Example

    >>> import awswrangler as wr
    >>> cluster_id = wr.emr.create_cluster(
    ...     cluster_name="wrangler_cluster",
    ...     logging_s3_path="s3://BUCKET_NAME/emr-logs/",
    ...     emr_release="emr-5.28.0",
    ...     subnet_id="SUBNET_ID",
    ...     emr_ec2_role="EMR_EC2_DefaultRole",
    ...     emr_role="EMR_DefaultRole",
    ...     instance_type_master="m5.xlarge",
    ...     instance_type_core="m5.xlarge",
    ...     instance_type_task="m5.xlarge",
    ...     instance_ebs_size_master=50,
    ...     instance_ebs_size_core=50,
    ...     instance_ebs_size_task=50,
    ...     instance_num_on_demand_master=1,
    ...     instance_num_on_demand_core=1,
    ...     instance_num_on_demand_task=1,
    ...     instance_num_spot_master=0,
    ...     instance_num_spot_core=1,
    ...     instance_num_spot_task=1,
    ...     spot_bid_percentage_of_on_demand_master=100,
    ...     spot_bid_percentage_of_on_demand_core=100,
    ...     spot_bid_percentage_of_on_demand_task=100,
    ...     spot_provisioning_timeout_master=5,
    ...     spot_provisioning_timeout_core=5,
    ...     spot_provisioning_timeout_task=5,
    ...     spot_timeout_to_on_demand_master=True,
    ...     spot_timeout_to_on_demand_core=True,
    ...     spot_timeout_to_on_demand_task=True,
    ...     python3=True,
    ...     spark_glue_catalog=True,
    ...     hive_glue_catalog=True,
    ...     presto_glue_catalog=True,
    ...     bootstraps_paths=None,
    ...     debugging=True,
    ...     applications=["Hadoop", "Spark", "Ganglia", "Hive"],
    ...     visible_to_all_users=True,
    ...     key_pair_name=None,
    ...     spark_jars_path=["s3://...jar"],
    ...     maximize_resource_allocation=True,
    ...     keep_cluster_alive_when_no_steps=True,
    ...     termination_protected=False,
    ...     spark_pyarrow=True,
    ...     tags={
    ...         "foo": "boo"
    ...     })

    """
    applications = ["Spark"] if applications is None else applications
    args: dict[str, Any] = _build_cluster_args(**locals())
    client_emr = _utils.client(service_name="emr", session=boto3_session)
    response = client_emr.run_job_flow(**args)
    _logger.debug("response: \n%s", pprint.pformat(response))
    return response["JobFlowId"]


def get_cluster_state(cluster_id: str, boto3_session: boto3.Session | None = None) -> str:
    """Get the EMR cluster state.

    Possible states: 'STARTING', 'BOOTSTRAPPING', 'RUNNING',
    'WAITING', 'TERMINATING',
    'TERMINATED', 'TERMINATED_WITH_ERRORS'

    Parameters
    ----------
    cluster_id
        Cluster ID.
    boto3_session
        The default boto3 session will be used if **boto3_session** is ``None``.

    Returns
    -------
        State.

    Examples
    --------
    >>> import awswrangler as wr
    >>> state = wr.emr.get_cluster_state("cluster-id")

    """
    client_emr = _utils.client(service_name="emr", session=boto3_session)
    response = client_emr.describe_cluster(ClusterId=cluster_id)
    _logger.debug("response: \n%s", pprint.pformat(response))
    return response["Cluster"]["Status"]["State"]


def terminate_cluster(cluster_id: str, boto3_session: boto3.Session | None = None) -> None:
    """Terminate EMR cluster.

    Parameters
    ----------
    cluster_id
        Cluster ID.
    boto3_session
        The default boto3 session will be used if **boto3_session** is ``None``.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.emr.terminate_cluster("cluster-id")

    """
    client_emr = _utils.client(service_name="emr", session=boto3_session)
    response = client_emr.terminate_job_flows(JobFlowIds=[cluster_id])
    _logger.debug("response: \n%s", pprint.pformat(response))


def submit_steps(cluster_id: str, steps: list[dict[str, Any]], boto3_session: boto3.Session | None = None) -> list[str]:
    """Submit a list of steps.

    Parameters
    ----------
    cluster_id
        Cluster ID.
    steps
        Steps definitions (use ``wr.emr.build_step()`` to build them).
    boto3_session
        The default boto3 session will be used if **boto3_session** is ``None``.

    Returns
    -------
        List of step IDs.

    Examples
    --------
    >>> import awswrangler as wr
    >>> steps = []
    >>> for cmd in ['echo "Hello"', "ls -la"]:
    ...     steps.append(wr.emr.build_step(name=cmd, command=cmd))
    >>> wr.emr.submit_steps(cluster_id="cluster-id", steps=steps)

    """
    client_emr = _utils.client(service_name="emr", session=boto3_session)
    response = client_emr.add_job_flow_steps(JobFlowId=cluster_id, Steps=steps)  # type: ignore[arg-type]
    _logger.debug("response: \n%s", pprint.pformat(response))
    return response["StepIds"]


def submit_step(
    cluster_id: str,
    command: str,
    name: str = "my-step",
    action_on_failure: _ActionOnFailureLiteral = "CONTINUE",
    script: bool = False,
    boto3_session: boto3.Session | None = None,
) -> str:
    """Submit new job in the EMR Cluster.

    Parameters
    ----------
    cluster_id
        Cluster ID.
    command
        e.g. 'echo "Hello!"'
        e.g. for script 's3://.../script.sh arg1 arg2'
    name
        Step name.
    action_on_failure
        'TERMINATE_JOB_FLOW', 'TERMINATE_CLUSTER', 'CANCEL_AND_WAIT', 'CONTINUE'
    script
        False for raw command or True for script runner.
        https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-commandrunner.html
    boto3_session
        The default boto3 session will be used if **boto3_session** is ``None``.

    Returns
    -------
        Step ID.

    Examples
    --------
    >>> import awswrangler as wr
    >>> step_id = wr.emr.submit_step(
    ...     cluster_id=cluster_id,
    ...     name="step_test",
    ...     command="s3://...script.sh arg1 arg2",
    ...     script=True,
    ... )
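
    A plain shell command can also be submitted; with the default ``script=False``
    it is executed through EMR's command-runner (``cluster_id`` is assumed to
    reference a running cluster and the step name is illustrative):

    >>> step_id = wr.emr.submit_step(
    ...     cluster_id=cluster_id,
    ...     name="list_home",
    ...     command="ls -la /home/hadoop",
    ... )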

    """
    step: dict[str, Any] = build_step(
        name=name, command=command, action_on_failure=action_on_failure, script=script, boto3_session=boto3_session
    )
    client_emr = _utils.client(service_name="emr", session=boto3_session)
    response = client_emr.add_job_flow_steps(JobFlowId=cluster_id, Steps=[step])  # type: ignore[list-item]
    _logger.debug("response: \n%s", pprint.pformat(response))
    return response["StepIds"][0]


def build_step(
    command: str,
    name: str = "my-step",
    action_on_failure: _ActionOnFailureLiteral = "CONTINUE",
    script: bool = False,
    region: str | None = None,
    boto3_session: boto3.Session | None = None,
) -> dict[str, Any]:
    """Build the Step structure (dictionary).

    Parameters
    ----------
    command
        e.g. 'echo "Hello!"'
        e.g. for script 's3://.../script.sh arg1 arg2'
    name
        Step name.
    action_on_failure
        'TERMINATE_JOB_FLOW', 'TERMINATE_CLUSTER', 'CANCEL_AND_WAIT', 'CONTINUE'
    script
        False for raw command or True for script runner.
        https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-commandrunner.html
    region
        Region name (e.g. `us-east-1`). If provided, it is used instead of
        the region resolved from the boto3.Session.
    boto3_session
        The default boto3 session will be used if **boto3_session** is ``None``.

    Returns
    -------
        Step structure.

    Examples
    --------
    >>> import awswrangler as wr
    >>> steps = []
    >>> for cmd in ['echo "Hello"', "ls -la"]:
    ...     steps.append(wr.emr.build_step(name=cmd, command=cmd))
    >>> wr.emr.submit_steps(cluster_id="cluster-id", steps=steps)
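
    A script stored on Amazon S3 can be wrapped with EMR's script-runner by passing
    ``script=True`` (the S3 path below is illustrative):

    >>> step = wr.emr.build_step(command="s3://bucket/script.sh arg1 arg2", script=True)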

    """
    jar: str = "command-runner.jar"
    if script is True:
        if region is not None:
            _region: str = region
        else:
            _region = _utils.get_region_from_session(boto3_session=boto3_session, default_region="us-east-1")
        jar = f"s3://{_region}.elasticmapreduce/libs/script-runner/script-runner.jar"
    step: dict[str, Any] = {
        "Name": name,
        "ActionOnFailure": action_on_failure,
        "HadoopJarStep": {"Jar": jar, "Args": command.split(" ")},
    }
    return step


def get_step_state(cluster_id: str, step_id: str, boto3_session: boto3.Session | None = None) -> str:
    """Get EMR step state.

    Possible states: 'PENDING', 'CANCEL_PENDING', 'RUNNING',
    'COMPLETED', 'CANCELLED', 'FAILED', 'INTERRUPTED'

    Parameters
    ----------
    cluster_id
        Cluster ID.
    step_id
        Step ID.
    boto3_session
        The default boto3 session will be used if **boto3_session** is ``None``.

    Returns
    -------
        State.

    Examples
    --------
    >>> import awswrangler as wr
    >>> state = wr.emr.get_step_state("cluster-id", "step-id")

    """
    client_emr = _utils.client(service_name="emr", session=boto3_session)
    response = client_emr.describe_step(ClusterId=cluster_id, StepId=step_id)
    _logger.debug("response: \n%s", pprint.pformat(response))
    return response["Step"]["Status"]["State"]


def submit_ecr_credentials_refresh(
    cluster_id: str,
    path: str,
    action_on_failure: _ActionOnFailureLiteral = "CONTINUE",
    boto3_session: boto3.Session | None = None,
) -> str:
    """Update internal ECR credentials.

    Parameters
    ----------
    cluster_id
        Cluster ID.
    path
        Amazon S3 path where awswrangler will stage the script ecr_credentials_refresh.py (e.g. s3://bucket/emr/)
    action_on_failure
        'TERMINATE_JOB_FLOW', 'TERMINATE_CLUSTER', 'CANCEL_AND_WAIT', 'CONTINUE'
    boto3_session
        The default boto3 session will be used if **boto3_session** is ``None``.

    Returns
    -------
        Step ID.

    Examples
    --------
    >>> import awswrangler as wr
    >>> step_id = wr.emr.submit_ecr_credentials_refresh("cluster_id", "s3://bucket/emr/")
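
    The refreshed Docker client configuration is published to ``hdfs:///user/hadoop/config.json``,
    which is where ``wr.emr.submit_spark_step()`` expects to find it when ``docker_image`` is set,
    so a typical follow-up looks like (image URI below is illustrative):

    >>> step_id = wr.emr.submit_spark_step(
    ...     cluster_id="cluster_id",
    ...     path="s3://bucket/emr/app.py",
    ...     docker_image="ACCOUNT_ID.dkr.ecr.REGION.amazonaws.com/IMAGE:TAG",
    ... )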

    """
    path = path[:-1] if path.endswith("/") else path
    path_script: str = f"{path}/ecr_credentials_refresh.py"
    client_s3 = _utils.client(service_name="s3", session=boto3_session)
    bucket, key = _utils.parse_path(path=path_script)
    region: str = _utils.get_region_from_session(boto3_session=boto3_session)
    client_s3.put_object(
        Body=_get_ecr_credentials_refresh_content(region=region).encode(encoding="utf-8"), Bucket=bucket, Key=key
    )
    command: str = f"spark-submit --deploy-mode cluster {path_script}"
    name: str = "ECR Credentials Refresh"
    step: dict[str, Any] = build_step(
        name=name, command=command, action_on_failure=action_on_failure, script=False, boto3_session=boto3_session
    )
    client_emr = _utils.client(service_name="emr", session=boto3_session)
    response = client_emr.add_job_flow_steps(JobFlowId=cluster_id, Steps=[step])  # type: ignore[list-item]
    _logger.debug("response: \n%s", pprint.pformat(response))
    return response["StepIds"][0]


def build_spark_step(
    path: str,
    args: list[str] | None = None,
    deploy_mode: Literal["cluster", "client"] = "cluster",
    docker_image: str | None = None,
    name: str = "my-step",
    action_on_failure: _ActionOnFailureLiteral = "CONTINUE",
    region: str | None = None,
    boto3_session: boto3.Session | None = None,
) -> dict[str, Any]:
    """Build the Step structure (dictionary).

    Parameters
    ----------
    path
        Script path. (e.g. s3://bucket/app.py)
    args
        CLI args to use with script
    deploy_mode
        "cluster" | "client"
    docker_image
        e.g. "{ACCOUNT_ID}.dkr.ecr.{REGION}.amazonaws.com/{IMAGE_NAME}:{TAG}"
    name
        Step name.
    action_on_failure
        'TERMINATE_JOB_FLOW', 'TERMINATE_CLUSTER', 'CANCEL_AND_WAIT', 'CONTINUE'
    region
        Region name (e.g. `us-east-1`). If provided, it is used instead of
        the region resolved from the boto3.Session.
    boto3_session
        The default boto3 session will be used if **boto3_session** is ``None``.

    Returns
    -------
        Step structure.

    Examples
    --------
    >>> import awswrangler as wr
    >>> step_ids = wr.emr.submit_steps(
    ...     cluster_id="cluster-id",
    ...     steps=[
    ...         wr.emr.build_spark_step(path="s3://bucket/app.py")
    ...     ],
    ... )
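
    A sketch of a step that runs the application inside a Docker image pulled from ECR,
    assuming ECR credentials were already made available on the cluster
    (e.g. via ``wr.emr.submit_ecr_credentials_refresh()``); the image URI is illustrative:

    >>> step = wr.emr.build_spark_step(
    ...     path="s3://bucket/app.py",
    ...     docker_image="ACCOUNT_ID.dkr.ecr.REGION.amazonaws.com/IMAGE:TAG",
    ... )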

    """
    script_args = " ".join(args) if args else ""
    if docker_image is None:
        cmd: str = f"spark-submit --deploy-mode {deploy_mode} {path} {script_args}"
    else:
        config: str = "hdfs:///user/hadoop/config.json"
        cmd = (
            f"spark-submit --deploy-mode {deploy_mode} "
            f"--conf spark.executorEnv.YARN_CONTAINER_RUNTIME_TYPE=docker "
            f"--conf spark.executorEnv.YARN_CONTAINER_RUNTIME_DOCKER_IMAGE={docker_image} "
            f"--conf spark.executorEnv.YARN_CONTAINER_RUNTIME_DOCKER_CLIENT_CONFIG={config} "
            f"--conf spark.executorEnv.YARN_CONTAINER_RUNTIME_DOCKER_MOUNTS=/etc/passwd:/etc/passwd:ro "
            f"--conf spark.yarn.appMasterEnv.YARN_CONTAINER_RUNTIME_TYPE=docker "
            f"--conf spark.yarn.appMasterEnv.YARN_CONTAINER_RUNTIME_DOCKER_IMAGE={docker_image} "
            f"--conf spark.yarn.appMasterEnv.YARN_CONTAINER_RUNTIME_DOCKER_CLIENT_CONFIG={config} "
            f"--conf spark.yarn.appMasterEnv.YARN_CONTAINER_RUNTIME_DOCKER_MOUNTS=/etc/passwd:/etc/passwd:ro "
            f"{path} {script_args}"
        )
    return build_step(
        command=cmd,
        name=name,
        action_on_failure=action_on_failure,
        script=False,
        region=region,
        boto3_session=boto3_session,
    )


def submit_spark_step(
    cluster_id: str,
    path: str,
    args: list[str] | None = None,
    deploy_mode: Literal["cluster", "client"] = "cluster",
    docker_image: str | None = None,
    name: str = "my-step",
    action_on_failure: _ActionOnFailureLiteral = "CONTINUE",
    region: str | None = None,
    boto3_session: boto3.Session | None = None,
) -> str:
    """Submit Spark Step.

    Parameters
    ----------
    cluster_id
        Cluster ID.
    path
        Script path. (e.g. s3://bucket/app.py)
    args
        CLI args to use with the script,
        e.g. args = ["--name", "hello-world"].
    deploy_mode
        "cluster" | "client"
    docker_image
        e.g. "{ACCOUNT_ID}.dkr.ecr.{REGION}.amazonaws.com/{IMAGE_NAME}:{TAG}"
    name
        Step name.
    action_on_failure
        'TERMINATE_JOB_FLOW', 'TERMINATE_CLUSTER', 'CANCEL_AND_WAIT', 'CONTINUE'
    region
        Region name (e.g. `us-east-1`). If provided, it is used instead of
        the region resolved from the boto3.Session.
    boto3_session
        The default boto3 session will be used if **boto3_session** is ``None``.

    Returns
    -------
        Step ID.

    Examples
    --------
    >>> import awswrangler as wr
    >>> step_id = wr.emr.submit_spark_step(
    ...     cluster_id="cluster-id",
    ...     path="s3://bucket/emr/app.py",
    ... )
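
    Additional CLI arguments are appended to the ``spark-submit`` call after the script path:

    >>> step_id = wr.emr.submit_spark_step(
    ...     cluster_id="cluster-id",
    ...     path="s3://bucket/emr/app.py",
    ...     args=["--name", "hello-world"],
    ... )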

    """
    step = build_spark_step(
        path=path,
        args=args,
        deploy_mode=deploy_mode,
        docker_image=docker_image,
        name=name,
        action_on_failure=action_on_failure,
        region=region,
        boto3_session=boto3_session,
    )
    return submit_steps(cluster_id=cluster_id, steps=[step], boto3_session=boto3_session)[0]
