"""Internal (private) Utilities Module."""
from __future__ import annotations
import importlib
import inspect
import itertools
import logging
import math
import os
import random
import time
from concurrent.futures import FIRST_COMPLETED, Future, wait
from functools import partial, wraps
from types import ModuleType
from typing import (
TYPE_CHECKING,
Any,
Callable,
Dict,
Generator,
Optional,
Sequence,
TypeVar,
cast,
overload,
)
import boto3
import botocore.credentials
import numpy as np
import pyarrow as pa
from botocore.config import Config
from packaging import version
import awswrangler.pandas as pd
from awswrangler import _config, exceptions
from awswrangler.__metadata__ import __version__
from awswrangler._arrow import _table_to_df
from awswrangler._config import _insert_str, apply_configs
from awswrangler._distributed import EngineEnum, engine
if TYPE_CHECKING:
from boto3.resources.base import ServiceResource
from botocore.client import BaseClient
from mypy_boto3_athena import AthenaClient
from mypy_boto3_cleanrooms import CleanRoomsServiceClient
from mypy_boto3_dynamodb import DynamoDBClient, DynamoDBServiceResource
from mypy_boto3_ec2 import EC2Client
from mypy_boto3_emr.client import EMRClient
from mypy_boto3_emr_serverless import EMRServerlessClient
from mypy_boto3_glue import GlueClient
from mypy_boto3_kms.client import KMSClient
from mypy_boto3_logs.client import CloudWatchLogsClient
from mypy_boto3_opensearch.client import OpenSearchServiceClient
from mypy_boto3_opensearchserverless.client import OpenSearchServiceServerlessClient
from mypy_boto3_quicksight.client import QuickSightClient
from mypy_boto3_rds_data.client import RDSDataServiceClient
from mypy_boto3_redshift.client import RedshiftClient
from mypy_boto3_redshift_data.client import RedshiftDataAPIServiceClient
from mypy_boto3_s3 import S3Client, S3ServiceResource
from mypy_boto3_secretsmanager import SecretsManagerClient
from mypy_boto3_sts.client import STSClient
from mypy_boto3_timestream_query.client import TimestreamQueryClient
from mypy_boto3_timestream_write.client import TimestreamWriteClient
from typing_extensions import Literal
ServiceName = Literal[
"athena",
"cleanrooms",
"dynamodb",
"ec2",
"emr",
"emr-serverless",
"glue",
"kms",
"logs",
"opensearch",
"opensearchserverless",
"quicksight",
"rds-data",
"redshift-data",
"redshift",
"s3",
"secretsmanager",
"sts",
"timestream-query",
"timestream-write",
]
_logger: logging.Logger = logging.getLogger(__name__)
Boto3PrimitivesType = Dict[str, Optional[str]]
FunctionType = TypeVar("FunctionType", bound=Callable[..., Any])
# A mapping from import name to the package/extras name (as found in pyproject.toml)
# for packages where the two names differ,
# e.g. `pip install awswrangler[oracle]` installs `oracledb`.
INSTALL_MAPPING = {
"redshift_connector": "redshift",
"pymysql": "mysql",
"pg8000": "postgres",
"pyodbc": "sqlserver",
"gremlin_python": "gremlin",
"opensearchpy": "opensearch",
"jsonpath_ng": "opensearch",
"oracledb": "oracle",
}
def check_optional_dependency(
    module: ModuleType | None,
    name: str,
) -> Callable[[FunctionType], FunctionType]:
    """Decorate a function to raise ModuleNotFoundError when the optional ``module`` is missing."""

    def decorator(func: FunctionType) -> FunctionType:
@wraps(func)
def inner(*args: Any, **kwargs: Any) -> Any:
if not module:
package_name = INSTALL_MAPPING.get(name)
install_name = package_name if package_name is not None else name
raise ModuleNotFoundError(
f"Missing optional dependency '{name}'. Use pip install awswrangler[{install_name}] to install it."
)
return func(*args, **kwargs)
return cast(FunctionType, inner)
return decorator
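# Illustrative sketch (the decorated function is hypothetical): pairing the
# decorator with import_optional_dependency below makes the import failure
# surface only when the feature is used, with a pip hint for the matching extra:
#
#     _oracledb = import_optional_dependency("oracledb")
#
#     @check_optional_dependency(_oracledb, "oracledb")
#     def read_sql_query(...):  # raises with a "pip install awswrangler[oracle]" hint
#         ...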
def import_optional_dependency(name: str) -> ModuleType | None:
    """Import an optional dependency.

    Parameters
    ----------
    name : str
        The module name.

    Returns
    -------
    maybe_module : Optional[ModuleType]
        The imported module, or None if it is not installed.
    """
    try:
        module = importlib.import_module(name)
    except ImportError:
        return None
    return module
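# Example (a sketch; `pg8000` is one of the optional database drivers):
#
#     _pg8000 = import_optional_dependency("pg8000")
#     if _pg8000 is None:
#         ...  # postgres support is unavailable in this environment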
def validate_kwargs(
    condition_fn: Callable[..., bool] = lambda: True,
    unsupported_kwargs: list[str] | None = None,
    message: str = "Arguments not supported:",
) -> Callable[[FunctionType], FunctionType]:
    """Decorate a function to reject the listed kwargs whenever ``condition_fn()`` holds."""
    unsupported_kwargs = unsupported_kwargs or []

    def decorator(func: FunctionType) -> FunctionType:
        signature = inspect.signature(func)

        @wraps(func)
        def inner(*args: Any, **kwargs: Any) -> Any:
            passed_unsupported_kwargs = set(unsupported_kwargs).intersection(
                {key for key, value in kwargs.items() if value is not None}
            )
            # Allow kwargs that didn't modify the default value
            passed_unsupported_kwargs = {
                key for key in passed_unsupported_kwargs if kwargs[key] != signature.parameters[key].default
            }
if condition_fn() and len(passed_unsupported_kwargs) > 0:
raise exceptions.InvalidArgument(f"{message} `{', '.join(passed_unsupported_kwargs)}`.")
return func(*args, **kwargs)
inner.__doc__ = _inject_kwargs_validation_doc(
doc=func.__doc__,
unsupported_kwargs=unsupported_kwargs,
message=message,
)
inner.__name__ = func.__name__
inner.__setattr__("__signature__", signature)
return cast(FunctionType, inner)
return decorator
def _inject_kwargs_validation_doc(
doc: str | None,
unsupported_kwargs: list[str] | None,
message: str,
) -> str | None:
if not doc or "\n Parameters" not in doc or not unsupported_kwargs:
return doc
header: str = f"\n\n Note\n ----\n {message}\n\n"
kwargs_block: str = "\n".join(tuple(f" - {x}\n" for x in unsupported_kwargs))
insertion: str = header + kwargs_block + "\n\n"
return _insert_str(text=doc, token="\n Parameters", insert=insertion)
validate_distributed_kwargs = partial(
    validate_kwargs,
    condition_fn=lambda: engine.get() == EngineEnum.RAY,
    message=f"The following arguments are not supported in distributed mode with engine `{EngineEnum.RAY}`:",
)
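# Usage sketch (the decorated function is illustrative): I/O functions can opt
# out of arguments that have no effect on the Ray engine, e.g.
#
#     @validate_distributed_kwargs(unsupported_kwargs=["boto3_session"])
#     def to_parquet(df, path, boto3_session=None): ...
#
# Passing a non-default `boto3_session` while running on Ray then raises
# exceptions.InvalidArgument.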
def ensure_session(session: boto3.Session | None = None) -> boto3.Session:
    """Ensure that a valid boto3.Session will be returned."""
    if session is not None:
        return session
    # Ensure boto3's default session is used so that its parameters can be
    # set via boto3.setup_default_session()
if boto3.DEFAULT_SESSION is not None:
return boto3.DEFAULT_SESSION
return boto3.Session()
def boto3_to_primitives(boto3_session: boto3.Session | None = None) -> Boto3PrimitivesType:
"""Convert Boto3 Session to Python primitives."""
_boto3_session: boto3.Session = ensure_session(session=boto3_session)
credentials = _boto3_session.get_credentials()
return {
"aws_access_key_id": getattr(credentials, "access_key", None),
"aws_secret_access_key": getattr(credentials, "secret_key", None),
"aws_session_token": getattr(credentials, "token", None),
"region_name": _boto3_session.region_name,
"profile_name": _boto3_session.profile_name,
}
def default_botocore_config() -> Config:
    """Return the default botocore configuration."""
retries_config: dict[str, str | int] = {
"max_attempts": int(os.getenv("AWS_MAX_ATTEMPTS", "5")),
}
mode = os.getenv("AWS_RETRY_MODE")
if mode:
retries_config["mode"] = mode
return Config(
retries=retries_config, # type: ignore[arg-type]
connect_timeout=10,
max_pool_connections=10,
user_agent_extra=f"awswrangler/{__version__}",
)
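# Illustrative check (AWS_MAX_ATTEMPTS / AWS_RETRY_MODE are the standard AWS SDK
# environment variables honored above):
#
#     >>> import os
#     >>> os.environ["AWS_MAX_ATTEMPTS"] = "10"
#     >>> default_botocore_config().retries
#     {'max_attempts': 10}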
def _get_endpoint_url(service_name: str) -> str | None:
endpoint_url: str | None = None
if service_name == "s3" and _config.config.s3_endpoint_url is not None:
endpoint_url = _config.config.s3_endpoint_url
elif service_name == "athena" and _config.config.athena_endpoint_url is not None:
endpoint_url = _config.config.athena_endpoint_url
elif service_name == "sts" and _config.config.sts_endpoint_url is not None:
endpoint_url = _config.config.sts_endpoint_url
elif service_name == "glue" and _config.config.glue_endpoint_url is not None:
endpoint_url = _config.config.glue_endpoint_url
elif service_name == "redshift" and _config.config.redshift_endpoint_url is not None:
endpoint_url = _config.config.redshift_endpoint_url
elif service_name == "kms" and _config.config.kms_endpoint_url is not None:
endpoint_url = _config.config.kms_endpoint_url
elif service_name == "emr" and _config.config.emr_endpoint_url is not None:
endpoint_url = _config.config.emr_endpoint_url
elif service_name == "dynamodb" and _config.config.dynamodb_endpoint_url is not None:
endpoint_url = _config.config.dynamodb_endpoint_url
elif service_name == "secretsmanager" and _config.config.secretsmanager_endpoint_url is not None:
endpoint_url = _config.config.secretsmanager_endpoint_url
elif service_name == "timestream-write" and _config.config.timestream_write_endpoint_url is not None:
endpoint_url = _config.config.timestream_write_endpoint_url
elif service_name == "timestream-query" and _config.config.timestream_query_endpoint_url is not None:
endpoint_url = _config.config.timestream_query_endpoint_url
return endpoint_url
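# Example (the endpoint URL is illustrative): routing S3 calls to a local
# emulator via the awswrangler config:
#
#     >>> import awswrangler as wr
#     >>> wr.config.s3_endpoint_url = "http://localhost:4566"
#     >>> # subsequent calls that create an S3 client will use this endpoint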
@overload
def client(
service_name: 'Literal["athena"]',
session: boto3.Session | None = None,
botocore_config: Config | None = None,
verify: str | bool | None = None,
) -> "AthenaClient": ...
@overload
def client(
service_name: 'Literal["cleanrooms"]',
session: boto3.Session | None = None,
botocore_config: Config | None = None,
verify: str | bool | None = None,
) -> "CleanRoomsServiceClient": ...
@overload
def client(
service_name: 'Literal["logs"]',
session: boto3.Session | None = None,
botocore_config: Config | None = None,
verify: str | bool | None = None,
) -> "CloudWatchLogsClient": ...
@overload
def client(
service_name: 'Literal["dynamodb"]',
session: boto3.Session | None = None,
botocore_config: Config | None = None,
verify: str | bool | None = None,
) -> "DynamoDBClient": ...
@overload
def client(
service_name: 'Literal["ec2"]',
session: boto3.Session | None = None,
botocore_config: Config | None = None,
verify: str | bool | None = None,
) -> "EC2Client": ...
@overload
def client(
service_name: 'Literal["emr"]',
session: boto3.Session | None = None,
botocore_config: Config | None = None,
verify: str | bool | None = None,
) -> "EMRClient": ...
@overload
def client(
service_name: 'Literal["emr-serverless"]',
session: boto3.Session | None = None,
botocore_config: Config | None = None,
verify: str | bool | None = None,
) -> "EMRServerlessClient": ...
@overload
def client(
service_name: 'Literal["glue"]',
session: boto3.Session | None = None,
botocore_config: Config | None = None,
verify: str | bool | None = None,
) -> "GlueClient": ...
@overload
def client(
service_name: 'Literal["kms"]',
session: boto3.Session | None = None,
botocore_config: Config | None = None,
verify: str | bool | None = None,
) -> "KMSClient": ...
@overload
def client(
service_name: 'Literal["opensearch"]',
session: boto3.Session | None = None,
botocore_config: Config | None = None,
verify: str | bool | None = None,
) -> "OpenSearchServiceClient": ...
@overload
def client(
service_name: 'Literal["opensearchserverless"]',
session: boto3.Session | None = None,
botocore_config: Config | None = None,
verify: str | bool | None = None,
) -> "OpenSearchServiceServerlessClient": ...
@overload
def client(
service_name: 'Literal["quicksight"]',
session: boto3.Session | None = None,
botocore_config: Config | None = None,
verify: str | bool | None = None,
) -> "QuickSightClient": ...
@overload
def client(
service_name: 'Literal["rds-data"]',
session: boto3.Session | None = None,
botocore_config: Config | None = None,
verify: str | bool | None = None,
) -> "RDSDataServiceClient": ...
@overload
def client(
service_name: 'Literal["redshift"]',
session: boto3.Session | None = None,
botocore_config: Config | None = None,
verify: str | bool | None = None,
) -> "RedshiftClient": ...
@overload
def client(
service_name: 'Literal["redshift-data"]',
session: boto3.Session | None = None,
botocore_config: Config | None = None,
verify: str | bool | None = None,
) -> "RedshiftDataAPIServiceClient": ...
@overload
def client(
service_name: 'Literal["s3"]',
session: boto3.Session | None = None,
botocore_config: Config | None = None,
verify: str | bool | None = None,
) -> "S3Client": ...
@overload
def client(
service_name: 'Literal["secretsmanager"]',
session: boto3.Session | None = None,
botocore_config: Config | None = None,
verify: str | bool | None = None,
) -> "SecretsManagerClient": ...
@overload
def client(
service_name: 'Literal["sts"]',
session: boto3.Session | None = None,
botocore_config: Config | None = None,
verify: str | bool | None = None,
) -> "STSClient": ...
@overload
def client(
service_name: 'Literal["timestream-query"]',
session: boto3.Session | None = None,
botocore_config: Config | None = None,
verify: str | bool | None = None,
) -> "TimestreamQueryClient": ...
@overload
def client(
service_name: 'Literal["timestream-write"]',
session: boto3.Session | None = None,
botocore_config: Config | None = None,
verify: str | bool | None = None,
) -> "TimestreamWriteClient": ...
@overload
def client(
service_name: "ServiceName",
session: boto3.Session | None = None,
botocore_config: Config | None = None,
verify: str | bool | None = None,
) -> "BaseClient": ...
@apply_configs
def client(
service_name: "ServiceName",
session: boto3.Session | None = None,
botocore_config: Config | None = None,
verify: str | bool | None = None,
) -> "BaseClient":
"""Create a valid boto3.client."""
endpoint_url: str | None = _get_endpoint_url(service_name=service_name)
return ensure_session(session=session).client(
service_name=service_name,
endpoint_url=endpoint_url,
use_ssl=True,
config=botocore_config or default_botocore_config(),
verify=verify or _config.config.verify,
)
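# Typed usage example: the overloads above narrow the return type for static
# checkers, e.g.
#
#     >>> s3_client = client("s3")  # typed as "S3Client" under the mypy_boto3 stubs
#     >>> glue_client = client("glue", botocore_config=Config(read_timeout=900))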
@overload
def resource(
service_name: 'Literal["dynamodb"]',
session: boto3.Session | None = None,
botocore_config: Config | None = None,
verify: str | bool | None = None,
) -> "DynamoDBServiceResource": ...
@overload
def resource(
service_name: 'Literal["s3"]',
session: boto3.Session | None = None,
botocore_config: Config | None = None,
verify: str | bool | None = None,
) -> "S3ServiceResource": ...
@apply_configs
def resource(
service_name: 'Literal["dynamodb"]' | 'Literal["s3"]',
session: boto3.Session | None = None,
botocore_config: Config | None = None,
verify: str | bool | None = None,
) -> "ServiceResource":
"""Create a valid boto3.resource."""
endpoint_url: str | None = _get_endpoint_url(service_name=service_name)
return ensure_session(session=session).resource(
service_name=service_name,
endpoint_url=endpoint_url,
use_ssl=True,
verify=verify,
config=default_botocore_config() if botocore_config is None else botocore_config,
)
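# Example (the table name is illustrative):
#
#     >>> dynamodb = resource("dynamodb")
#     >>> table = dynamodb.Table("my-table")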
def parse_path(path: str) -> tuple[str, str]:
"""Split a full S3 path in bucket and key strings.
's3://bucket/key' -> ('bucket', 'key')
Parameters
----------
path : str
S3 path (e.g. s3://bucket/key).
Returns
-------
Tuple[str, str]
Tuple of bucket and key strings
Examples
--------
>>> from awswrangler._utils import parse_path
>>> bucket, key = parse_path('s3://bucket/key')
>>> from awswrangler._utils import parse_path
>>> bucket, key = parse_path('s3://arn:aws:s3:<awsregion>:<awsaccount>:accesspoint/<ap_name>/<key>')
"""
if path.startswith("s3://") is False:
raise exceptions.InvalidArgumentValue(f"'{path}' is not a valid path. It MUST start with 's3://'")
parts = path.replace("s3://", "").replace(":accesspoint/", ":accesspoint:").split("/", 1)
bucket: str = parts[0]
if "/" in bucket:
raise exceptions.InvalidArgumentValue(f"'{bucket}' is not a valid bucket name.")
key: str = ""
if len(parts) == 2:
key = key if parts[1] is None else parts[1]
return bucket, key
def ensure_cpu_count(use_threads: bool | int = True) -> int:
"""Get the number of cpu cores to be used.
Note
----
In case of `use_threads=True` the number of threads that could be spawned will be get from os.cpu_count().
Parameters
----------
use_threads : Union[bool, int]
True to enable multi-core utilization, False to disable.
If given an int will simply return the input value.
Returns
-------
int
Number of cpu cores to be used.
Examples
--------
>>> from awswrangler._utils import ensure_cpu_count
>>> ensure_cpu_count(use_threads=True)
4
>>> ensure_cpu_count(use_threads=False)
1
"""
if type(use_threads) == int: # noqa: E721
if use_threads < 1:
return 1
return use_threads
cpus: int = 1
if use_threads is True:
cpu_cnt: int | None = os.cpu_count()
if cpu_cnt is not None:
cpus = cpu_cnt if cpu_cnt > cpus else cpus
return cpus
@engine.dispatch_on_engine
def ensure_worker_or_thread_count(use_threads: bool | int = True) -> int:
"""Get the number of CPU cores or Ray workers to be used.
Note
----
In case of `use_threads=True` the number of threads that could be spawned will be spawned from the OS
or the Ray cluster configuration.
Parameters
----------
use_threads : Union[bool, int]
True to enable multi-core utilization, False to disable.
If given an int will simply return the input value.
Returns
-------
int
Number of workers of threads to be used.
Examples
--------
>>> from awswrangler._utils import ensure_worker_or_thread_count
>>> ensure_worker_or_thread_count(use_threads=True)
4
>>> ensure_worker_or_thread_count(use_threads=False)
1
"""
return ensure_cpu_count(use_threads=use_threads)
ChunkifyItemType = TypeVar("ChunkifyItemType")
def chunkify(
lst: list[ChunkifyItemType], num_chunks: int = 1, max_length: int | None = None
) -> list[list[ChunkifyItemType]]:
"""Split a list in a List of List (chunks) with even sizes.
Parameters
----------
lst: List
List of anything to be split up.
num_chunks: int, optional
Maximum number of chunks.
max_length: int, optional
Max length of each chunk. Has priority over num_chunks.
Returns
-------
List[List[Any]]
List of List (chunks) with even sizes.
Examples
--------
>>> from awswrangler._utils import chunkify
>>> chunkify(list(range(13)), num_chunks=3)
[[0, 1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]
>>> chunkify(list(range(13)), max_length=4)
[[0, 1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]]
"""
if not lst:
return []
n: int = num_chunks if max_length is None else int(math.ceil(float(len(lst)) / float(max_length)))
np_chunks = np.array_split(lst, n) # type: ignore[arg-type,var-annotated]
return [arr.tolist() for arr in np_chunks if len(arr) > 0]
def empty_generator() -> Generator[None, None, None]:
"""Empty Generator."""
yield from ()
def get_directory(path: str) -> str:
"""Extract directory path."""
return path.rsplit(sep="/", maxsplit=1)[0] + "/"
def get_region_from_subnet(subnet_id: str, boto3_session: boto3.Session | None = None) -> str:
"""Extract region from Subnet ID."""
session: boto3.Session = ensure_session(session=boto3_session)
client_ec2 = client(service_name="ec2", session=session)
    # Strip the trailing AZ letter, e.g. "us-east-1a" -> "us-east-1"
    return client_ec2.describe_subnets(SubnetIds=[subnet_id])["Subnets"][0]["AvailabilityZone"][:-1]
def get_region_from_session(boto3_session: boto3.Session | None = None, default_region: str | None = None) -> str:
"""Extract region from session."""
session: boto3.Session = ensure_session(session=boto3_session)
region: str | None = session.region_name
if region is not None:
return region
if default_region is not None:
return default_region
raise exceptions.InvalidArgument("There is no region_name defined on boto3, please configure it.")
def get_credentials_from_session(
boto3_session: boto3.Session | None = None,
) -> botocore.credentials.ReadOnlyCredentials:
"""Get AWS credentials from boto3 session."""
session: boto3.Session = ensure_session(session=boto3_session)
credentials: botocore.credentials.Credentials = session.get_credentials() # type: ignore[assignment]
frozen_credentials: botocore.credentials.ReadOnlyCredentials = credentials.get_frozen_credentials()
return frozen_credentials
def list_sampling(lst: list[Any], sampling: float) -> list[Any]:
"""Random List sampling."""
if sampling == 1.0:
return lst
if sampling > 1.0 or sampling <= 0.0:
raise exceptions.InvalidArgumentValue(f"Argument <sampling> must be [0.0 < value <= 1.0]. {sampling} received.")
_len: int = len(lst)
if _len == 0:
return []
num_samples: int = int(round(_len * sampling))
num_samples = _len if num_samples > _len else num_samples
num_samples = 1 if num_samples < 1 else num_samples
_logger.debug("_len: %s", _len)
_logger.debug("sampling: %s", sampling)
_logger.debug("num_samples: %s", num_samples)
random_lst: list[Any] = random.sample(population=lst, k=num_samples)
random_lst.sort()
return random_lst
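# Example (output is random; shown for illustration only):
#
#     >>> list_sampling(list(range(10)), sampling=0.5)  # doctest: +SKIP
#     [0, 2, 3, 7, 9]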
def check_duplicated_columns(df: pd.DataFrame) -> Any:
"""Raise an exception if there are duplicated columns names."""
duplicated: list[str] = df.loc[:, df.columns.duplicated()].columns.to_list()
if duplicated:
raise exceptions.InvalidDataFrame(
f"There are duplicated column names in your DataFrame: {duplicated}. "
f"Note that your columns may have been sanitized and it can be the cause of "
f"the duplicity."
)
def retry(
ex: type[Exception],
ex_code: str | None = None,
base: float = 1.0,
max_num_tries: int = 3,
) -> Callable[..., Any]:
"""
Decorate function with decorrelated Jitter retries.
Parameters
----------
ex: Exception
Exception to retry on
ex_code: str | None
Response error code
base: float
Base delay
max_num_tries: int
Maximum number of retries
Returns
-------
Callable[..., Any]
Function
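
    Examples
    --------
    A sketch; the exception type and error code below are illustrative:

    >>> import botocore.exceptions
    >>> @retry(ex=botocore.exceptions.ClientError, ex_code="ThrottlingException", max_num_tries=5)
    ... def describe_resource():
    ...     ...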
"""
def wrapper(f: Callable[..., Any]) -> Any:
return wraps(f)(partial(try_it, f, ex, ex_code=ex_code, base=base, max_num_tries=max_num_tries))
return wrapper
TryItOutputType = TypeVar("TryItOutputType")
def try_it(
f: Callable[..., TryItOutputType],
ex: Any,
*args: Any,
ex_code: str | None = None,
base: float = 1.0,
max_num_tries: int = 3,
**kwargs: Any,
) -> TryItOutputType:
"""Run function with decorrelated Jitter.
Reference: https://aws.amazon.com/blogs/architecture/exponential-backoff-and-jitter/
"""
delay: float = base
for i in range(max_num_tries):
try:
return f(*args, **kwargs)
except ex as exception:
if ex_code is not None and hasattr(exception, "response"):
if exception.response["Error"]["Code"] != ex_code:
raise
if i == (max_num_tries - 1):
raise
delay = random.uniform(base, delay * 3)
_logger.error("Retrying %s | Fail number %s/%s | Exception: %s", f, i + 1, max_num_tries, exception)
time.sleep(delay)
    raise RuntimeError()  # unreachable: the loop above always returns or re-raises
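# Direct-call sketch (the client, bucket and key are illustrative): positional
# and keyword arguments after `ex` are forwarded to the wrapped callable.
#
#     try_it(
#         s3_client.head_object,
#         botocore.exceptions.ClientError,
#         ex_code="SlowDown",
#         max_num_tries=5,
#         Bucket="bucket",
#         Key="key",
#     )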
def get_even_chunks_sizes(total_size: int, chunk_size: int, upper_bound: bool) -> tuple[int, ...]:
"""Calculate even chunks sizes (Best effort)."""
round_func: Callable[[float], float] = math.ceil if upper_bound is True else math.floor
num_chunks: int = int(round_func(float(total_size) / float(chunk_size)))
num_chunks = 1 if num_chunks < 1 else num_chunks
base_size: int = int(total_size / num_chunks)
rest: int = total_size % num_chunks
sizes: list[int] = list(itertools.repeat(base_size, num_chunks))
for i in range(rest):
i_cycled: int = i % len(sizes)
sizes[i_cycled] += 1
return tuple(sizes)
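# e.g.
#     >>> get_even_chunks_sizes(total_size=10, chunk_size=3, upper_bound=True)
#     (3, 3, 2, 2)
#     >>> get_even_chunks_sizes(total_size=10, chunk_size=3, upper_bound=False)
#     (4, 3, 3)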
def get_running_futures(seq: Sequence[Future]) -> tuple[Future, ...]: # type: ignore[type-arg]
"""Filter only running futures."""
return tuple(f for f in seq if f.running())
def wait_any_future_available(seq: Sequence[Future]) -> None: # type: ignore[type-arg]
"""Wait until any future became available."""
wait(fs=seq, timeout=None, return_when=FIRST_COMPLETED)
def block_waiting_available_thread(seq: Sequence[Future], max_workers: int) -> None: # type: ignore[type-arg]
"""Block until any thread became available."""
running: tuple[Future, ...] = get_running_futures(seq=seq) # type: ignore[type-arg]
while len(running) >= max_workers:
wait_any_future_available(seq=running)
running = get_running_futures(seq=running)
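# Usage sketch (`executor` and `tasks` are hypothetical): throttle submissions
# so that at most `max_workers` futures run at once.
#
#     futures = []
#     for task in tasks:
#         block_waiting_available_thread(seq=futures, max_workers=4)
#         futures.append(executor.submit(task))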
def check_schema_changes(columns_types: dict[str, str], table_input: dict[str, Any] | None, mode: str) -> None:
"""Check schema changes."""
if (table_input is not None) and (mode in ("append", "overwrite_partitions")):
catalog_cols: dict[str, str] = {x["Name"]: x["Type"] for x in table_input["StorageDescriptor"]["Columns"]}
for c, t in columns_types.items():
            if c not in catalog_cols:
                raise exceptions.InvalidArgumentValue(
                    f"Schema change detected: New column {c} with type {t}. "
                    "Please pass schema_evolution=True to allow new columns."
                )
            if t != catalog_cols[c]:  # Data type change detected!
                raise exceptions.InvalidArgumentValue(
                    f"Schema change detected: Data type change on column {c} "
                    f"(Old type: {catalog_cols[c]} / New type: {t})."
                )
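# `table_input` is expected to carry the Glue table structure read back from
# the catalog, e.g. (trimmed):
#
#     {"StorageDescriptor": {"Columns": [{"Name": "col_a", "Type": "string"}]}}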
@engine.dispatch_on_engine
def split_pandas_frame(df: pd.DataFrame, splits: int) -> list[pd.DataFrame]:
"""Split a DataFrame into n chunks."""
# Logic obtained from np.array_split definition
total = len(df)
each_section, extras = divmod(total, splits)
section_sizes = [0] + extras * [each_section + 1] + (splits - extras) * [each_section]
    div_points = np.array(section_sizes, dtype=np.intp).cumsum()
sub_dfs = []
for i in range(splits):
sub_df = df.iloc[div_points[i] : div_points[i + 1]]
if not sub_df.empty:
sub_dfs.append(sub_df)
return sub_dfs
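# Example: on a 5-row frame,
#
#     >>> [len(part) for part in split_pandas_frame(pd.DataFrame({"a": range(5)}), splits=2)]
#     [3, 2]
#
# Empty chunks are dropped, so fewer than `splits` frames may be returned.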
@engine.dispatch_on_engine
def table_refs_to_df(tables: list[pa.Table], kwargs: dict[str, Any]) -> pd.DataFrame:
"""Build Pandas DataFrame from list of PyArrow tables."""
    # pyarrow 14.0.0 renamed concat_tables(promote=True) to promote_options="default"
    promote_kwargs: dict[str, bool | str] = {"promote": True}
    if version.parse(pa.__version__) >= version.parse("14.0.0"):
        promote_kwargs = {"promote_options": "default"}
return _table_to_df(pa.concat_tables(tables, **promote_kwargs), kwargs=kwargs)
@engine.dispatch_on_engine
def is_pandas_frame(obj: Any) -> bool:
"""Check if the passed objected is a Pandas DataFrame."""
return isinstance(obj, pd.DataFrame)
@engine.dispatch_on_engine
def copy_df_shallow(df: pd.DataFrame) -> pd.DataFrame:
"""Create a shallow copy of the Pandas DataFrame."""
return df.copy(deep=False)
def list_to_arrow_table(
mapping: list[dict[str, Any]],
schema: pa.Schema | None = None,
metadata: dict[str, Any] | None = None,
) -> pa.Table:
"""Construct a PyArrow Table from list of dictionaries."""
arrays = []
if not schema:
names = []
if mapping:
names = list(mapping[0].keys())
for row in mapping:
for k in row.keys():
if k not in names:
names.append(k)
        for n in names:
            v = [row.get(n) for row in mapping]
            arrays.append(v)
return pa.Table.from_arrays(arrays, names, metadata=metadata)
    for n in schema.names:
        v = [row.get(n) for row in mapping]
        arrays.append(v)
    # pa.Table.from_arrays raises if both schema and metadata are passed
    return pa.Table.from_arrays(arrays, schema=schema, metadata=metadata)
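# Example (column names are illustrative): missing keys are filled with None.
#
#     >>> tbl = list_to_arrow_table([{"a": 1, "b": "x"}, {"a": 2}])
#     >>> tbl.column_names
#     ['a', 'b']
#     >>> tbl["b"].to_pylist()
#     ['x', None]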