# awswrangler/athena/_utils.py
"""Utilities Module for Amazon Athena."""
from __future__ import annotations
import base64
import csv
import json
import logging
import pprint
import uuid
import warnings
from decimal import Decimal
from typing import (
TYPE_CHECKING,
Any,
Dict,
Generator,
NamedTuple,
TypedDict,
Union,
cast,
)
import boto3
import botocore.exceptions
import pandas as pd
from typing_extensions import Literal
from awswrangler import _data_types, _utils, catalog, exceptions, s3, sts, typing
from awswrangler._config import apply_configs
from awswrangler._sql_formatter import _process_sql_params
from awswrangler.catalog._utils import _catalog_id
from . import _executions
from ._cache import _cache_manager, _LocalMetadataCacheManager
if TYPE_CHECKING:
from mypy_boto3_athena.type_defs import QueryExecutionTypeDef
from mypy_boto3_glue.type_defs import ColumnOutputTypeDef
_QUERY_FINAL_STATES: list[str] = ["FAILED", "SUCCEEDED", "CANCELLED"]
_QUERY_WAIT_POLLING_DELAY: float = 1.0 # SECONDS
_logger: logging.Logger = logging.getLogger(__name__)
class _QueryMetadata(NamedTuple):
execution_id: str
dtype: dict[str, str]
parse_timestamps: list[str]
parse_dates: list[str]
parse_geometry: list[str]
converters: dict[str, Any]
binaries: list[str]
output_location: str | None
manifest_location: str | None
raw_payload: "QueryExecutionTypeDef"
class _WorkGroupConfig(NamedTuple):
enforced: bool
s3_output: str | None
encryption: str | None
kms_key: str | None
def _get_s3_output(
s3_output: str | None, wg_config: _WorkGroupConfig, boto3_session: boto3.Session | None = None
) -> str:
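    # Resolution order: an enforced workgroup location wins, then the caller-supplied
    # s3_output, then the workgroup default, and finally the default Athena bucket.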
if wg_config.enforced and wg_config.s3_output is not None:
return wg_config.s3_output
if s3_output is not None:
return s3_output
if wg_config.s3_output is not None:
return wg_config.s3_output
return create_athena_bucket(boto3_session=boto3_session)
def _start_query_execution(
sql: str,
wg_config: _WorkGroupConfig,
database: str | None = None,
data_source: str | None = None,
s3_output: str | None = None,
workgroup: str | None = None,
encryption: str | None = None,
kms_key: str | None = None,
execution_params: list[str] | None = None,
client_request_token: str | None = None,
boto3_session: boto3.Session | None = None,
) -> str:
args: dict[str, Any] = {"QueryString": sql}
# s3_output
args["ResultConfiguration"] = {
"OutputLocation": _get_s3_output(s3_output=s3_output, wg_config=wg_config, boto3_session=boto3_session)
}
# encryption
if wg_config.enforced is True:
if wg_config.encryption is not None:
args["ResultConfiguration"]["EncryptionConfiguration"] = {"EncryptionOption": wg_config.encryption}
if wg_config.kms_key is not None:
args["ResultConfiguration"]["EncryptionConfiguration"]["KmsKey"] = wg_config.kms_key
elif encryption is not None:
args["ResultConfiguration"]["EncryptionConfiguration"] = {"EncryptionOption": encryption}
if kms_key is not None:
args["ResultConfiguration"]["EncryptionConfiguration"]["KmsKey"] = kms_key
# database
if database is not None:
args["QueryExecutionContext"] = {"Database": database}
if data_source is not None:
args["QueryExecutionContext"]["Catalog"] = data_source
# workgroup
if workgroup is not None:
args["WorkGroup"] = workgroup
if client_request_token:
args["ClientRequestToken"] = client_request_token
if execution_params:
args["ExecutionParameters"] = execution_params
client_athena = _utils.client(service_name="athena", session=boto3_session)
_logger.debug("Starting query execution with args: \n%s", pprint.pformat(args))
response = _utils.try_it(
f=client_athena.start_query_execution,
ex=botocore.exceptions.ClientError,
ex_code="ThrottlingException",
max_num_tries=5,
**args,
)
_logger.debug("Query response:\n%s", response)
return response["QueryExecutionId"]
def _get_workgroup_config(session: boto3.Session | None = None, workgroup: str = "primary") -> _WorkGroupConfig:
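    # When the workgroup enforces its configuration (enforced=True), its output
    # location and encryption settings take precedence over caller-supplied values.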
enforced: bool
wg_s3_output: str | None
wg_encryption: str | None
wg_kms_key: str | None
enforced, wg_s3_output, wg_encryption, wg_kms_key = False, None, None, None
if workgroup is not None:
res = get_work_group(workgroup=workgroup, boto3_session=session)
enforced = res["WorkGroup"]["Configuration"]["EnforceWorkGroupConfiguration"]
        config: dict[str, Any] | None = res["WorkGroup"]["Configuration"].get("ResultConfiguration")
if config is not None:
wg_s3_output = config.get("OutputLocation")
encrypt_config: dict[str, str] | None = config.get("EncryptionConfiguration")
wg_encryption = None if encrypt_config is None else encrypt_config.get("EncryptionOption")
wg_kms_key = None if encrypt_config is None else encrypt_config.get("KmsKey")
wg_config: _WorkGroupConfig = _WorkGroupConfig(
enforced=enforced, s3_output=wg_s3_output, encryption=wg_encryption, kms_key=wg_kms_key
)
_logger.debug("Workgroup config:\n%s", wg_config)
return wg_config
def _fetch_txt_result(
query_metadata: _QueryMetadata,
keep_files: bool,
boto3_session: boto3.Session | None,
s3_additional_kwargs: dict[str, str] | None,
) -> pd.DataFrame:
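    # DDL/utility statements (e.g. DESCRIBE, SHOW) write a tab-separated ".txt"
    # result object; any other output location yields an empty DataFrame here.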
if query_metadata.output_location is None or query_metadata.output_location.endswith(".txt") is False:
return pd.DataFrame()
path: str = query_metadata.output_location
_logger.debug("Reading TXT result from %s", path)
df = s3.read_csv(
path=[path],
dtype=query_metadata.dtype,
parse_dates=query_metadata.parse_timestamps,
converters=query_metadata.converters,
quoting=csv.QUOTE_ALL,
keep_default_na=False,
skip_blank_lines=True,
na_values=[],
use_threads=False,
boto3_session=boto3_session,
names=list(query_metadata.dtype.keys()),
sep="\t",
)
if keep_files is False:
s3.delete_objects(
path=[path, f"{path}.metadata"],
use_threads=False,
boto3_session=boto3_session,
s3_additional_kwargs=s3_additional_kwargs,
)
return df
def _parse_describe_table(df: pd.DataFrame) -> pd.DataFrame:
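    # DESCRIBE output lists every column once, then repeats the partition columns
    # under a "# Partition Information" header, so a repeated name marks a partition key.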
origin_df_dict = df.to_dict()
target_df_dict: dict[str, list[str | bool]] = {"Column Name": [], "Type": [], "Partition": [], "Comment": []}
for index, col_name in origin_df_dict["col_name"].items():
col_name = col_name.strip() # noqa: PLW2901
if col_name.startswith("#") or not col_name:
pass
elif col_name in target_df_dict["Column Name"]:
index_col_name = target_df_dict["Column Name"].index(col_name)
target_df_dict["Partition"][index_col_name] = True
else:
target_df_dict["Column Name"].append(col_name)
target_df_dict["Type"].append(origin_df_dict["data_type"][index].strip())
target_df_dict["Partition"].append(False)
target_df_dict["Comment"].append(origin_df_dict["comment"][index].strip())
return pd.DataFrame(data=target_df_dict)
def _get_query_metadata(
query_execution_id: str,
boto3_session: boto3.Session | None = None,
categories: list[str] | None = None,
query_execution_payload: "QueryExecutionTypeDef" | None = None,
metadata_cache_manager: _LocalMetadataCacheManager | None = None,
athena_query_wait_polling_delay: float = _QUERY_WAIT_POLLING_DELAY,
execution_params: list[str] | None = None,
dtype_backend: Literal["numpy_nullable", "pyarrow"] = "numpy_nullable",
) -> _QueryMetadata:
"""Get query metadata."""
if (query_execution_payload is not None) and (query_execution_payload["Status"]["State"] in _QUERY_FINAL_STATES):
if query_execution_payload["Status"]["State"] != "SUCCEEDED":
reason: str = query_execution_payload["Status"]["StateChangeReason"]
raise exceptions.QueryFailed(f"Query error: {reason}")
_query_execution_payload = query_execution_payload
else:
_query_execution_payload = cast(
"QueryExecutionTypeDef",
_executions.wait_query(
query_execution_id=query_execution_id,
boto3_session=boto3_session,
athena_query_wait_polling_delay=athena_query_wait_polling_delay,
),
)
cols_types: dict[str, str] = get_query_columns_types(
query_execution_id=query_execution_id, boto3_session=boto3_session
)
_logger.debug("Casting query column types: %s", cols_types)
dtype: dict[str, str] = {}
parse_timestamps: list[str] = []
parse_dates: list[str] = []
parse_geometry: list[str] = []
converters: dict[str, Any] = {}
binaries: list[str] = []
col_name: str
col_type: str
for col_name, col_type in cols_types.items():
pandas_type: str = _data_types.athena2pandas(dtype=col_type, dtype_backend=dtype_backend)
if (categories is not None) and (col_name in categories):
dtype[col_name] = "category"
elif pandas_type in ["datetime64", "date"]:
parse_timestamps.append(col_name)
if pandas_type == "date":
parse_dates.append(col_name)
elif pandas_type == "bytes":
dtype[col_name] = "string"
binaries.append(col_name)
elif pandas_type == "decimal":
converters[col_name] = lambda x: Decimal(str(x)) if str(x) not in ("", "none", " ", "<NA>") else None
elif col_type == "geometry" and pandas_type == "string":
parse_geometry.append(col_name)
else:
dtype[col_name] = pandas_type
output_location: str | None = None
if "ResultConfiguration" in _query_execution_payload:
output_location = _query_execution_payload["ResultConfiguration"].get("OutputLocation")
athena_statistics = _query_execution_payload.get("Statistics", {})
manifest_location: str | None = athena_statistics.get("DataManifestLocation")
if metadata_cache_manager is not None and query_execution_id not in metadata_cache_manager:
metadata_cache_manager.update_cache(items=[_query_execution_payload])
query_metadata: _QueryMetadata = _QueryMetadata(
execution_id=query_execution_id,
dtype=dtype,
parse_timestamps=parse_timestamps,
parse_dates=parse_dates,
parse_geometry=parse_geometry,
converters=converters,
binaries=binaries,
output_location=output_location,
manifest_location=manifest_location,
raw_payload=_query_execution_payload,
)
_logger.debug("Query metadata:\n%s", query_metadata)
return query_metadata
def _empty_dataframe_response(
chunked: bool, query_metadata: _QueryMetadata
) -> pd.DataFrame | Generator[None, None, None]:
"""Generate an empty DataFrame response."""
if chunked is False:
df = pd.DataFrame()
df = _apply_query_metadata(df=df, query_metadata=query_metadata)
return df
return _utils.empty_generator()
def _apply_query_metadata(df: pd.DataFrame, query_metadata: _QueryMetadata) -> pd.DataFrame:
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=UserWarning)
df.query_metadata = query_metadata.raw_payload
return df
class _FormatterTypeQMark(TypedDict):
params: list[str]
paramstyle: Literal["qmark"]
class _FormatterTypeNamed(TypedDict):
params: dict[str, Any]
paramstyle: Literal["named"]
_FormatterType = Union[_FormatterTypeQMark, _FormatterTypeNamed, None]
def _verify_formatter(
params: dict[str, Any] | list[str] | None,
paramstyle: Literal["qmark", "named"],
) -> _FormatterType:
if params is None:
return None
if paramstyle == "named":
if not isinstance(params, dict):
raise exceptions.InvalidArgumentCombination(
f"`params` must be a dict when paramstyle is `named`. Instead, found type {type(params)}."
)
return {
"paramstyle": "named",
"params": params,
}
if paramstyle == "qmark":
if not isinstance(params, list):
raise exceptions.InvalidArgumentCombination(
f"`params` must be a list when paramstyle is `qmark`. Instead, found type {type(params)}."
)
return {
"paramstyle": "qmark",
"params": params,
}
raise exceptions.InvalidArgumentValue(f"`paramstyle` must be either `qmark` or `named`. Found: {paramstyle}.")
def _apply_formatter(
sql: str,
params: dict[str, Any] | list[str] | None,
paramstyle: Literal["qmark", "named"],
) -> tuple[str, list[str] | None]:
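    # Illustrative behaviour (placeholder values): "named" substitutes client-side,
    # e.g. ("SELECT :v", {"v": 1}, "named") -> ("SELECT 1", None); "qmark" leaves the
    # SQL untouched and returns the params for server-side execution,
    # e.g. ("SELECT ?", ["1"], "qmark") -> ("SELECT ?", ["1"]).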
formatter_settings = _verify_formatter(params, paramstyle)
if formatter_settings is None:
return sql, None
if formatter_settings["paramstyle"] == "named":
        # Substitute query parameters
sql = _process_sql_params(sql, formatter_settings["params"])
return sql, None
return sql, formatter_settings["params"]
def get_named_query_statement(
named_query_id: str,
boto3_session: boto3.Session | None = None,
) -> str:
"""
Get the named query statement string from a query ID.
Parameters
----------
named_query_id
The unique ID of the query. Used to get the query statement from a saved query.
Requires access to the workgroup where the query is saved.
boto3_session
        The default boto3 session will be used if **boto3_session** is ``None``.
Returns
-------
The named query statement string
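    Examples
    --------
    >>> import awswrangler as wr
    >>> statement = wr.athena.get_named_query_statement(named_query_id='named-query-id')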
"""
client_athena = _utils.client(service_name="athena", session=boto3_session)
return client_athena.get_named_query(NamedQueryId=named_query_id)["NamedQuery"]["QueryString"]
def get_query_columns_types(query_execution_id: str, boto3_session: boto3.Session | None = None) -> dict[str, str]:
"""Get the data type of all columns queried.
https://docs.aws.amazon.com/athena/latest/ug/data-types.html
Parameters
----------
query_execution_id
Athena query execution ID.
boto3_session
        The default boto3 session will be used if **boto3_session** is ``None``.
Returns
-------
Dictionary with all data types.
Examples
--------
>>> import awswrangler as wr
>>> wr.athena.get_query_columns_types('query-execution-id')
{'col0': 'int', 'col1': 'double'}
"""
client_athena = _utils.client(service_name="athena", session=boto3_session)
response = client_athena.get_query_results(QueryExecutionId=query_execution_id, MaxResults=1)
col_info = response["ResultSet"]["ResultSetMetadata"]["ColumnInfo"]
return dict(
(c["Name"], f"{c['Type']}({c['Precision']},{c.get('Scale', 0)})")
if c["Type"] in ["decimal"]
else (c["Name"], c["Type"])
for c in col_info
)
def create_athena_bucket(boto3_session: boto3.Session | None = None) -> str:
"""Create the default Athena bucket if it doesn't exist.
Parameters
----------
boto3_session
        The default boto3 session will be used if **boto3_session** is ``None``.
Returns
-------
Bucket s3 path (E.g. s3://aws-athena-query-results-ACCOUNT-REGION/)
Examples
--------
>>> import awswrangler as wr
>>> wr.athena.create_athena_bucket()
's3://aws-athena-query-results-ACCOUNT-REGION/'
"""
account_id: str = sts.get_account_id(boto3_session=boto3_session)
region_name: str = _utils.get_region_from_session(boto3_session=boto3_session).lower()
bucket_name = f"aws-athena-query-results-{account_id}-{region_name}"
path = f"s3://{bucket_name}/"
client_s3 = _utils.client(service_name="s3", session=boto3_session)
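    # us-east-1 is the default region and rejects an explicit LocationConstraint.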
args = {} if region_name == "us-east-1" else {"CreateBucketConfiguration": {"LocationConstraint": region_name}}
try:
client_s3.create_bucket(Bucket=bucket_name, **args) # type: ignore[arg-type]
except (client_s3.exceptions.BucketAlreadyExists, client_s3.exceptions.BucketAlreadyOwnedByYou):
_logger.debug("Bucket %s already exists.", bucket_name)
except botocore.exceptions.ClientError as err:
if err.response["Error"]["Code"] == "OperationAborted":
_logger.debug("A conflicting conditional operation is currently in progress against this resource.")
client_s3.get_waiter("bucket_exists").wait(Bucket=bucket_name)
return path
@apply_configs
def repair_table(
table: str,
database: str | None = None,
data_source: str | None = None,
s3_output: str | None = None,
workgroup: str = "primary",
encryption: str | None = None,
kms_key: str | None = None,
athena_query_wait_polling_delay: float = _QUERY_WAIT_POLLING_DELAY,
boto3_session: boto3.Session | None = None,
) -> str:
"""Run the Hive's metastore consistency check: 'MSCK REPAIR TABLE table;'.
Recovers partitions and data associated with partitions.
Use this statement when you add partitions to the catalog.
It is possible it will take some time to add all partitions.
If this operation times out, it will be in an incomplete state
where only a few partitions are added to the catalog.
Note
----
    Creates the default Athena bucket if it doesn't exist and s3_output is None.
(E.g. s3://aws-athena-query-results-ACCOUNT-REGION/)
Parameters
----------
table
Table name.
database
AWS Glue/Athena database name.
data_source
Data Source / Catalog name. If None, 'AwsDataCatalog' is used.
s3_output
AWS S3 path.
workgroup
Athena workgroup. Primary by default.
encryption
None, 'SSE_S3', 'SSE_KMS', 'CSE_KMS'.
kms_key
        For SSE-KMS and CSE-KMS, this is the KMS key ARN or ID.
athena_query_wait_polling_delay
Interval in seconds for how often the function will check if the Athena query has completed.
boto3_session
        The default boto3 session will be used if **boto3_session** is ``None``.
Returns
-------
Query final state ('SUCCEEDED', 'FAILED', 'CANCELLED').
Examples
--------
>>> import awswrangler as wr
>>> query_final_state = wr.athena.repair_table(table='...', database='...')
"""
query = f"MSCK REPAIR TABLE `{table}`;"
if (database is not None) and (not database.startswith("`")):
database = f"`{database}`"
query_id = _executions.start_query_execution(
sql=query,
database=database,
data_source=data_source,
s3_output=s3_output,
workgroup=workgroup,
encryption=encryption,
kms_key=kms_key,
boto3_session=boto3_session,
)
response: dict[str, Any] = _executions.wait_query(
query_execution_id=query_id,
boto3_session=boto3_session,
athena_query_wait_polling_delay=athena_query_wait_polling_delay,
)
return cast(str, response["Status"]["State"])
@apply_configs
@_utils.validate_distributed_kwargs(
unsupported_kwargs=["boto3_session"],
)
def describe_table(
table: str,
database: str | None = None,
s3_output: str | None = None,
workgroup: str = "primary",
encryption: str | None = None,
kms_key: str | None = None,
athena_query_wait_polling_delay: float = _QUERY_WAIT_POLLING_DELAY,
s3_additional_kwargs: dict[str, Any] | None = None,
boto3_session: boto3.Session | None = None,
) -> pd.DataFrame:
"""Show the list of columns, including partition columns: 'DESCRIBE table;'.
    Shows the list of columns, including partition columns, for the named table.
The result of this function will be equal to `wr.catalog.table`.
Note
----
    Creates the default Athena bucket if it doesn't exist and s3_output is None.
(E.g. s3://aws-athena-query-results-ACCOUNT-REGION/)
Parameters
----------
table
Table name.
database
AWS Glue/Athena database name.
s3_output
AWS S3 path.
workgroup
Athena workgroup. Primary by default.
encryption
None, 'SSE_S3', 'SSE_KMS', 'CSE_KMS'.
kms_key
        For SSE-KMS and CSE-KMS, this is the KMS key ARN or ID.
athena_query_wait_polling_delay
Interval in seconds for how often the function will check if the Athena query has completed.
s3_additional_kwargs
Forwarded to botocore requests.
e.g. s3_additional_kwargs={'RequestPayer': 'requester'}
boto3_session
        The default boto3 session will be used if **boto3_session** is ``None``.
Returns
-------
    Pandas DataFrame with the formatted table information.
Examples
--------
>>> import awswrangler as wr
>>> df_table = wr.athena.describe_table(table='my_table', database='default')
"""
query = f"DESCRIBE `{table}`;"
if (database is not None) and (not database.startswith("`")):
database = f"`{database}`"
query_id = _executions.start_query_execution(
sql=query,
database=database,
s3_output=s3_output,
workgroup=workgroup,
encryption=encryption,
kms_key=kms_key,
boto3_session=boto3_session,
)
query_metadata: _QueryMetadata = _get_query_metadata(
query_execution_id=query_id,
athena_query_wait_polling_delay=athena_query_wait_polling_delay,
boto3_session=boto3_session,
)
raw_result = _fetch_txt_result(
query_metadata=query_metadata,
keep_files=True,
boto3_session=boto3_session,
s3_additional_kwargs=s3_additional_kwargs,
)
return _parse_describe_table(raw_result)
@apply_configs
def create_ctas_table(
sql: str,
database: str | None = None,
ctas_table: str | None = None,
ctas_database: str | None = None,
s3_output: str | None = None,
storage_format: str | None = None,
write_compression: str | None = None,
partitioning_info: list[str] | None = None,
bucketing_info: typing.BucketingInfoTuple | None = None,
field_delimiter: str | None = None,
schema_only: bool = False,
workgroup: str = "primary",
data_source: str | None = None,
encryption: str | None = None,
kms_key: str | None = None,
categories: list[str] | None = None,
wait: bool = False,
athena_query_wait_polling_delay: float = _QUERY_WAIT_POLLING_DELAY,
execution_params: list[str] | None = None,
params: dict[str, Any] | list[str] | None = None,
paramstyle: Literal["qmark", "named"] = "named",
boto3_session: boto3.Session | None = None,
) -> dict[str, str | _QueryMetadata]:
"""Create a new table populated with the results of a SELECT query.
https://docs.aws.amazon.com/athena/latest/ug/create-table-as.html
Parameters
----------
sql
SELECT SQL query.
database
The name of the database where the original table is stored.
ctas_table
The name of the CTAS table.
If None, a name with a random string is used.
ctas_database
The name of the alternative database where the CTAS table should be stored.
        If None, `database` is used; that is, the CTAS table is stored in the same database as the original table.
s3_output
The output Amazon S3 path.
If None, either the Athena workgroup or client-side location setting is used.
If a workgroup enforces a query results location, then it overrides this argument.
storage_format
The storage format for the CTAS query results, such as ORC, PARQUET, AVRO, JSON, or TEXTFILE.
PARQUET by default.
write_compression
The compression type to use for any storage format that allows compression to be specified.
partitioning_info
A list of columns by which the CTAS table will be partitioned.
bucketing_info
Tuple consisting of the column names used for bucketing as the first element and the number of buckets as the
second element.
Only `str`, `int` and `bool` are supported as column data types for bucketing.
field_delimiter
The single-character field delimiter for files in CSV, TSV, and text files.
schema_only
        If True, run the CTAS query with ``WITH NO DATA`` to create only the table schema, without populating it. False by default.
workgroup
Athena workgroup. Primary by default.
data_source
Data Source / Catalog name. If None, 'AwsDataCatalog' is used.
encryption
Valid values: [None, 'SSE_S3', 'SSE_KMS']. Note: 'CSE_KMS' is not supported.
kms_key
For SSE-KMS, this is the KMS key ARN or ID.
categories
        List of column names that should be returned as pandas.Categorical.
Recommended for memory restricted environments.
wait
Whether to wait for the query to finish and return a dictionary with the Query metadata.
athena_query_wait_polling_delay
Interval in seconds for how often the function will check if the Athena query has completed.
execution_params
[**DEPRECATED**]
A list of values for the parameters that are used in the SQL query.
        This parameter is on a deprecation path.
        Use ``params`` and ``paramstyle`` instead.
params
Dictionary or list of parameters to pass to execute method.
The syntax used to pass parameters depends on the configuration of ``paramstyle``.
paramstyle
The syntax style to use for the parameters.
Supported values are ``named`` and ``qmark``.
The default is ``named``.
boto3_session
        The default boto3 session will be used if **boto3_session** is ``None``.
Returns
-------
    A dictionary with the CTAS database and table names.
If `wait` is `False`, the query ID is included, otherwise a Query metadata object is added instead.
Examples
--------
Select all into a new table and encrypt the results
>>> import awswrangler as wr
>>> wr.athena.create_ctas_table(
... sql="select * from table",
... database="default",
... encryption="SSE_KMS",
... kms_key="1234abcd-12ab-34cd-56ef-1234567890ab",
... )
{'ctas_database': 'default', 'ctas_table': 'temp_table_5669340090094....', 'ctas_query_id': 'cc7dfa81-831d-...'}
Create a table with schema only
>>> wr.athena.create_ctas_table(
... sql="select col1, col2 from table",
... database="default",
... ctas_table="my_ctas_table",
... schema_only=True,
... wait=True,
... )
Partition data and save to alternative CTAS database
>>> wr.athena.create_ctas_table(
... sql="select * from table",
... database="default",
... ctas_database="my_ctas_db",
... storage_format="avro",
... write_compression="snappy",
... partitioning_info=["par0", "par1"],
... wait=True,
... )
"""
ctas_table = catalog.sanitize_table_name(ctas_table) if ctas_table else f"temp_table_{uuid.uuid4().hex}"
ctas_database = ctas_database if ctas_database else database
if ctas_database is None:
raise exceptions.InvalidArgumentCombination("Either ctas_database or database must be defined.")
# Substitute execution_params with params
if execution_params:
if params:
raise exceptions.InvalidArgumentCombination("`execution_params` and `params` are mutually exclusive.")
params = execution_params
paramstyle = "qmark"
        warnings.warn(
            '`execution_params` is being deprecated. Use `params` and `paramstyle="qmark"` instead.',
            DeprecationWarning,
        )
# Substitute query parameters if applicable
sql, execution_params = _apply_formatter(sql, params, paramstyle)
fully_qualified_name = f'"{ctas_database}"."{ctas_table}"'
wg_config: _WorkGroupConfig = _get_workgroup_config(session=boto3_session, workgroup=workgroup)
s3_output = _get_s3_output(s3_output=s3_output, wg_config=wg_config, boto3_session=boto3_session)
s3_output = s3_output[:-1] if s3_output[-1] == "/" else s3_output
# If the workgroup enforces an external location, then it overrides the user supplied argument
external_location_str: str = (
f" external_location = '{s3_output}/{ctas_table}',\n" if (not wg_config.enforced) and (s3_output) else ""
)
# At least one property must be specified within `WITH()` in the query. We default to `PARQUET` for `storage_format`
storage_format_str: str = f""" format = '{storage_format.upper() if storage_format else "PARQUET"}'"""
write_compression_str: str = (
f" write_compression = '{write_compression.upper()}',\n" if write_compression else ""
)
partitioning_str: str = f" partitioned_by = ARRAY{partitioning_info},\n" if partitioning_info else ""
bucketing_str: str = (
f" bucketed_by = ARRAY{bucketing_info[0]},\n bucket_count = {bucketing_info[1]},\n"
if bucketing_info
else ""
)
field_delimiter_str: str = f" field_delimiter = '{field_delimiter}',\n" if field_delimiter else ""
schema_only_str: str = "\nWITH NO DATA" if schema_only else ""
ctas_sql = (
f"CREATE TABLE {fully_qualified_name}\n"
f"WITH(\n"
f"{external_location_str}"
f"{partitioning_str}"
f"{bucketing_str}"
f"{field_delimiter_str}"
f"{write_compression_str}"
f"{storage_format_str}"
f")\n"
f"AS {sql}"
f"{schema_only_str}"
)
_logger.debug("ctas sql: %s", ctas_sql)
try:
query_execution_id: str = _start_query_execution(
sql=ctas_sql,
wg_config=wg_config,
database=database,
data_source=data_source,
s3_output=s3_output,
workgroup=workgroup,
encryption=encryption,
kms_key=kms_key,
boto3_session=boto3_session,
execution_params=execution_params,
)
except botocore.exceptions.ClientError as ex:
error = ex.response["Error"]
if error["Code"] == "InvalidRequestException" and "Exception parsing query" in error["Message"]:
raise exceptions.InvalidCtasApproachQuery(
f"It is not possible to wrap this query into a CTAS statement. Root error message: {error['Message']}"
)
if error["Code"] == "InvalidRequestException" and "extraneous input" in error["Message"]:
raise exceptions.InvalidCtasApproachQuery(
f"It is not possible to wrap this query into a CTAS statement. Root error message: {error['Message']}"
)
raise ex
response: dict[str, str | _QueryMetadata] = {"ctas_database": ctas_database, "ctas_table": ctas_table}
if wait:
try:
response["ctas_query_metadata"] = _get_query_metadata(
query_execution_id=query_execution_id,
boto3_session=boto3_session,
categories=categories,
metadata_cache_manager=_cache_manager,
athena_query_wait_polling_delay=athena_query_wait_polling_delay,
)
except exceptions.QueryFailed as ex:
msg: str = str(ex)
if "Column name" in msg and "specified more than once" in msg:
raise exceptions.InvalidCtasApproachQuery(
f"Please, define distinct names for your columns. Root error message: {msg}"
)
if "Column name not specified" in msg:
raise exceptions.InvalidArgumentValue(
"Please, define all columns names in your query. (E.g. 'SELECT MAX(col1) AS max_col1, ...')"
)
if "Column type is unknown" in msg:
raise exceptions.InvalidArgumentValue(
"Please, don't leave undefined columns types in your query. You can cast to ensure it. "
"(E.g. 'SELECT CAST(NULL AS INTEGER) AS MY_COL, ...')"
)
raise ex
else:
response["ctas_query_id"] = query_execution_id
_logger.info("Created CTAS table %s", fully_qualified_name)
return response
@apply_configs
@_utils.validate_distributed_kwargs(
unsupported_kwargs=["boto3_session"],
)
def show_create_table(
table: str,
database: str | None = None,
s3_output: str | None = None,
workgroup: str = "primary",
encryption: str | None = None,
kms_key: str | None = None,
athena_query_wait_polling_delay: float = _QUERY_WAIT_POLLING_DELAY,
s3_additional_kwargs: dict[str, Any] | None = None,
boto3_session: boto3.Session | None = None,
) -> str:
"""Generate the query that created it: 'SHOW CREATE TABLE table;'.
Analyzes an existing table named table_name to generate the query that created it.
Note
----
    Creates the default Athena bucket if it doesn't exist and s3_output is None.
(E.g. s3://aws-athena-query-results-ACCOUNT-REGION/)
Parameters
----------
table
Table name.
database
AWS Glue/Athena database name.
s3_output
AWS S3 path.
workgroup
Athena workgroup. Primary by default.
encryption
None, 'SSE_S3', 'SSE_KMS', 'CSE_KMS'.
kms_key
        For SSE-KMS and CSE-KMS, this is the KMS key ARN or ID.
athena_query_wait_polling_delay
Interval in seconds for how often the function will check if the Athena query has completed.
s3_additional_kwargs
Forwarded to botocore requests.
e.g. s3_additional_kwargs={'RequestPayer': 'requester'}
boto3_session
        The default boto3 session will be used if **boto3_session** is ``None``.
Returns
-------
The query that created the table.
Examples
--------
>>> import awswrangler as wr
    >>> create_stmt = wr.athena.show_create_table(table='my_table', database='default')
"""
query = f"SHOW CREATE TABLE `{table}`;"
if (database is not None) and (not database.startswith("`")):
database = f"`{database}`"
query_id = _executions.start_query_execution(
sql=query,
database=database,
s3_output=s3_output,
workgroup=workgroup,
encryption=encryption,
kms_key=kms_key,
boto3_session=boto3_session,
)
query_metadata: _QueryMetadata = _get_query_metadata(
query_execution_id=query_id,
athena_query_wait_polling_delay=athena_query_wait_polling_delay,
boto3_session=boto3_session,
)
raw_result = _fetch_txt_result(
query_metadata=query_metadata,
keep_files=True,
boto3_session=boto3_session,
s3_additional_kwargs=s3_additional_kwargs,
)
return cast(str, raw_result.createtab_stmt.str.strip().str.cat(sep=" "))
@apply_configs
def generate_create_query(
table: str,
database: str | None = None,
catalog_id: str | None = None,
boto3_session: boto3.Session | None = None,
) -> str:
"""Generate the query that created a table(EXTERNAL_TABLE) or a view(VIRTUAL_TABLE).
Analyzes an existing table named table_name to generate the query that created it.
Parameters
----------
table
Table name.
database
Database name.
catalog_id
The ID of the Data Catalog from which to retrieve Databases.
If ``None`` is provided, the AWS account ID is used by default.
boto3_session
        The default boto3 session will be used if **boto3_session** is ``None``.
Returns
-------
The query that created the table or view.
Examples
--------
>>> import awswrangler as wr
>>> view_create_query: str = wr.athena.generate_create_query(table='my_view', database='default')
"""
def parse_columns(columns_description: list["ColumnOutputTypeDef"]) -> str:
columns_str: list[str] = []
for column in columns_description:
column_str = f" `{column['Name']}` {column['Type']}"
if "Comment" in column:
column_str += f" COMMENT '{column['Comment']}'"
columns_str.append(column_str)
return ", \n".join(columns_str)
def parse_properties(parameters: dict[str, str]) -> str:
properties_str: list[str] = []
for key, value in parameters.items():
if key == "EXTERNAL":
continue
property_key_value = f" '{key}'='{value}'"
properties_str.append(property_key_value)
return ", \n".join(properties_str)
client_glue = _utils.client(service_name="glue", session=boto3_session)
table_detail = client_glue.get_table(**_catalog_id(catalog_id=catalog_id, DatabaseName=database, Name=table))[
"Table"
]
if table_detail["TableType"] == "VIRTUAL_VIEW":
glue_base64_query: str = table_detail["ViewOriginalText"].replace("/* Presto View: ", "").replace(" */", "")
glue_query: str = json.loads(base64.b64decode(glue_base64_query))["originalSql"]
return f"""CREATE OR REPLACE VIEW "{table}" AS \n{glue_query}"""
if table_detail["TableType"] == "EXTERNAL_TABLE":
columns: str = parse_columns(columns_description=table_detail["StorageDescriptor"]["Columns"])
query_parts: list[str] = [f"""CREATE EXTERNAL TABLE `{table}`(\n{columns})"""]
partitioned_columns: str = parse_columns(columns_description=table_detail["PartitionKeys"])
if partitioned_columns:
query_parts.append(f"""PARTITIONED BY ( \n{partitioned_columns})""")
tblproperties: str = parse_properties(parameters=table_detail["Parameters"])
query_parts += [
"""ROW FORMAT SERDE """,
f""" '{table_detail["StorageDescriptor"]["SerdeInfo"]["SerializationLibrary"]}' """,
"""STORED AS INPUTFORMAT """,
f""" '{table_detail["StorageDescriptor"]["InputFormat"]}' """,
"""OUTPUTFORMAT """,
f""" '{table_detail["StorageDescriptor"]["OutputFormat"]}'""",
"""LOCATION""",
f""" '{table_detail["StorageDescriptor"]["Location"]}'""",
f"""TBLPROPERTIES (\n{tblproperties})""",
]
sql = "\n".join(query_parts)
_logger.debug("Generated create query:\n%s", sql)
return sql
    raise NotImplementedError(f"Table type {table_detail['TableType']} is not supported.")
def get_work_group(workgroup: str, boto3_session: boto3.Session | None = None) -> dict[str, Any]:
"""Return information about the workgroup with the specified name.
Parameters
----------
workgroup
Work Group name.
boto3_session
The default boto3 session will be used if **boto3_session** is ``None``.
Returns
-------
    Dictionary as returned by the boto3 ``get_work_group`` call. See:
    https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/athena.html#Athena.Client.get_work_group
Examples
--------
>>> import awswrangler as wr
>>> res = wr.athena.get_work_group(workgroup='workgroup_name')
"""
client_athena = _utils.client(service_name="athena", session=boto3_session)
return cast(
Dict[str, Any],
_utils.try_it(
f=client_athena.get_work_group,
ex=botocore.exceptions.ClientError,
ex_code="ThrottlingException",
max_num_tries=5,
WorkGroup=workgroup,
),
)
def get_query_executions(
query_execution_ids: list[str], return_unprocessed: bool = False, boto3_session: boto3.Session | None = None
) -> tuple[pd.DataFrame, pd.DataFrame] | pd.DataFrame:
"""From specified query execution IDs, return a DataFrame of query execution details.
https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/athena.html#Athena.Client.batch_get_query_execution
Parameters
----------
query_execution_ids
Athena query execution IDs.
return_unprocessed
        True to also return the query execution IDs that could not be processed.
        False to return only the DataFrame of query execution details.
        Default is False.
boto3_session
The default boto3 session will be used if **boto3_session** is ``None``.
Returns
-------
    DataFrame containing the query execution details.
Optionally, another DataFrame containing unprocessed query execution IDs.
Examples
--------
>>> import awswrangler as wr
    >>> query_executions_df, unprocessed_query_executions_df = wr.athena.get_query_executions(
    ...     query_execution_ids=['query-execution-id','query-execution-id1']
    ... )
"""
chunked_size: int = 50
query_executions = []
unprocessed_query_execution = []
client_athena = _utils.client(service_name="athena", session=boto3_session)
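    # batch_get_query_execution accepts at most 50 IDs per call, so process in chunks.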
for i in range(0, len(query_execution_ids), chunked_size):
response = client_athena.batch_get_query_execution(QueryExecutionIds=query_execution_ids[i : i + chunked_size])
query_executions += response["QueryExecutions"]
unprocessed_query_execution += response["UnprocessedQueryExecutionIds"]
if unprocessed_query_execution and not return_unprocessed:
_logger.warning(
"Some of query execution ids are unable to be processed."
"Set return_unprocessed to True to get unprocessed query execution ids"
)
if return_unprocessed:
return pd.json_normalize(query_executions), pd.json_normalize(unprocessed_query_execution)
return pd.json_normalize(query_executions)
def list_query_executions(
workgroup: str | None = None,
max_results: int | None = None,
boto3_session: boto3.Session | None = None,
) -> list[str]:
"""Fetch list query execution IDs ran in specified workgroup or primary work group if not specified.
https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/athena.html#Athena.Client.list_query_executions
Parameters
----------
workgroup
        The name of the workgroup from which query execution IDs are returned.
If not specified, a list of available query execution IDs for the queries in the primary workgroup is returned.
max_results
The maximum number of query execution IDs to return in this request.
If not present, all execution IDs will be returned.
boto3_session
The default boto3 session will be used if **boto3_session** is ``None``.
Returns
-------
List of query execution IDs.
Examples
--------
>>> import awswrangler as wr
>>> res = wr.athena.list_query_executions(workgroup='workgroup-name')
"""
client_athena = _utils.client(service_name="athena", session=boto3_session)
kwargs: dict[str, Any] = {}
if workgroup:
kwargs["WorkGroup"] = workgroup
if max_results is not None:
kwargs["MaxResults"] = min(max_results, 50)
query_list: list[str] = []
response = _utils.try_it(
f=client_athena.list_query_executions,
ex=botocore.exceptions.ClientError,
ex_code="ThrottlingException",
max_num_tries=5,
**kwargs,
)
query_list += response["QueryExecutionIds"]
while "NextToken" in response:
kwargs["NextToken"] = response["NextToken"]
if max_results is not None:
if len(query_list) >= max_results:
break
kwargs["MaxResults"] = min(max_results - len(query_list), 50)
response = _utils.try_it(
f=client_athena.list_query_executions,
ex=botocore.exceptions.ClientError,
ex_code="ThrottlingException",
max_num_tries=5,
**kwargs,
)
query_list += response["QueryExecutionIds"]
_logger.debug("Running %d query executions", len(query_list))
return query_list