awswrangler/s3/_read_parquet.pyi
import datetime
from collections.abc import Callable, Iterator
from typing import TYPE_CHECKING, Any, Literal, overload
import boto3
import pandas as pd
import pyarrow as pa
import pyarrow.parquet  # imported so the pa.parquet.ParquetFile reference below resolves
from awswrangler.typing import ArrowDecryptionConfiguration, RayReadParquetSettings
if TYPE_CHECKING:
from mypy_boto3_s3 import S3Client
def _pyarrow_parquet_file_wrapper(
source: Any, coerce_int96_timestamp_unit: str | None = ...
) -> pa.parquet.ParquetFile: ...
def _read_parquet_metadata_file(
    s3_client: S3Client | None,
path: str,
s3_additional_kwargs: dict[str, str] | None,
use_threads: bool | int,
version_id: str | None = ...,
coerce_int96_timestamp_unit: str | None = ...,
) -> pa.Schema: ...
def _read_parquet(
paths: list[str],
path_root: str | None,
    schema: pa.Schema | None,
columns: list[str] | None,
coerce_int96_timestamp_unit: str | None,
use_threads: bool | int,
override_num_blocks: int,
version_ids: dict[str, str] | None,
    s3_client: S3Client | None,
s3_additional_kwargs: dict[str, Any] | None,
arrow_kwargs: dict[str, Any],
bulk_read: bool,
) -> pd.DataFrame | Iterator[pd.DataFrame]: ...
def _read_parquet_metadata(
path: str | list[str],
path_suffix: str | None,
path_ignore_suffix: str | list[str] | None,
ignore_empty: bool,
ignore_null: bool,
dtype: dict[str, str] | None,
sampling: float,
dataset: bool,
use_threads: bool | int,
boto3_session: boto3.Session | None,
s3_additional_kwargs: dict[str, str] | None,
version_id: str | dict[str, str] | None = ...,
coerce_int96_timestamp_unit: str | None = ...,
) -> tuple[dict[str, str], dict[str, str] | None, dict[str, list[str]] | None]: ...
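# The underscore-prefixed stubs above cover internal helpers; the public API
# (read_parquet_metadata, read_parquet, read_parquet_table) follows.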
def read_parquet_metadata(
path: str | list[str],
dataset: bool = ...,
version_id: str | dict[str, str] | None = ...,
path_suffix: str | None = ...,
path_ignore_suffix: str | list[str] | None = ...,
ignore_empty: bool = ...,
ignore_null: bool = ...,
dtype: dict[str, str] | None = ...,
sampling: float = ...,
coerce_int96_timestamp_unit: str | None = ...,
use_threads: bool | int = ...,
boto3_session: boto3.Session | None = ...,
s3_additional_kwargs: dict[str, Any] | None = ...,
) -> tuple[dict[str, str], dict[str, str] | None]: ...
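# Usage sketch for read_parquet_metadata (the bucket and prefix are hypothetical);
# it returns a (columns_types, partitions_types) pair, each mapping column names
# to data types:
#
#   import awswrangler as wr
#
#   columns_types, partitions_types = wr.s3.read_parquet_metadata(
#       path="s3://my-bucket/my-prefix/",
#       dataset=True,
#   )
#
# The read_parquet overloads below narrow the return type on `chunked`:
# Literal[False] yields pd.DataFrame, Literal[True] or int yields
# Iterator[pd.DataFrame], and a plain bool falls back to the union of both.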
@overload
def read_parquet(
path: str | list[str],
path_root: str | None = ...,
dataset: bool = ...,
path_suffix: str | list[str] | None = ...,
path_ignore_suffix: str | list[str] | None = ...,
ignore_empty: bool = ...,
partition_filter: Callable[[dict[str, str]], bool] | None = ...,
columns: list[str] | None = ...,
validate_schema: bool = ...,
coerce_int96_timestamp_unit: str | None = ...,
schema: pa.Schema | None = ...,
last_modified_begin: datetime.datetime | None = ...,
last_modified_end: datetime.datetime | None = ...,
version_id: str | dict[str, str] | None = ...,
dtype_backend: Literal["numpy_nullable", "pyarrow"] = ...,
chunked: Literal[False] = ...,
use_threads: bool | int = ...,
ray_args: RayReadParquetSettings | None = ...,
boto3_session: boto3.Session | None = ...,
s3_additional_kwargs: dict[str, Any] | None = ...,
pyarrow_additional_kwargs: dict[str, Any] | None = ...,
decryption_configuration: ArrowDecryptionConfiguration | None = ...,
) -> pd.DataFrame: ...
@overload
def read_parquet(
path: str | list[str],
*,
path_root: str | None = ...,
dataset: bool = ...,
path_suffix: str | list[str] | None = ...,
path_ignore_suffix: str | list[str] | None = ...,
ignore_empty: bool = ...,
partition_filter: Callable[[dict[str, str]], bool] | None = ...,
columns: list[str] | None = ...,
validate_schema: bool = ...,
coerce_int96_timestamp_unit: str | None = ...,
schema: pa.Schema | None = ...,
last_modified_begin: datetime.datetime | None = ...,
last_modified_end: datetime.datetime | None = ...,
version_id: str | dict[str, str] | None = ...,
dtype_backend: Literal["numpy_nullable", "pyarrow"] = ...,
chunked: Literal[True],
use_threads: bool | int = ...,
ray_args: RayReadParquetSettings | None = ...,
boto3_session: boto3.Session | None = ...,
s3_additional_kwargs: dict[str, Any] | None = ...,
pyarrow_additional_kwargs: dict[str, Any] | None = ...,
decryption_configuration: ArrowDecryptionConfiguration | None = ...,
) -> Iterator[pd.DataFrame]: ...
@overload
def read_parquet(
path: str | list[str],
*,
path_root: str | None = ...,
dataset: bool = ...,
path_suffix: str | list[str] | None = ...,
path_ignore_suffix: str | list[str] | None = ...,
ignore_empty: bool = ...,
partition_filter: Callable[[dict[str, str]], bool] | None = ...,
columns: list[str] | None = ...,
validate_schema: bool = ...,
coerce_int96_timestamp_unit: str | None = ...,
schema: pa.Schema | None = ...,
last_modified_begin: datetime.datetime | None = ...,
last_modified_end: datetime.datetime | None = ...,
version_id: str | dict[str, str] | None = ...,
dtype_backend: Literal["numpy_nullable", "pyarrow"] = ...,
chunked: bool,
use_threads: bool | int = ...,
ray_args: RayReadParquetSettings | None = ...,
boto3_session: boto3.Session | None = ...,
s3_additional_kwargs: dict[str, Any] | None = ...,
pyarrow_additional_kwargs: dict[str, Any] | None = ...,
decryption_configuration: ArrowDecryptionConfiguration | None = ...,
) -> pd.DataFrame | Iterator[pd.DataFrame]: ...
@overload
def read_parquet(
path: str | list[str],
*,
path_root: str | None = ...,
dataset: bool = ...,
path_suffix: str | list[str] | None = ...,
path_ignore_suffix: str | list[str] | None = ...,
ignore_empty: bool = ...,
partition_filter: Callable[[dict[str, str]], bool] | None = ...,
columns: list[str] | None = ...,
validate_schema: bool = ...,
coerce_int96_timestamp_unit: str | None = ...,
schema: pa.Schema | None = ...,
last_modified_begin: datetime.datetime | None = ...,
last_modified_end: datetime.datetime | None = ...,
version_id: str | dict[str, str] | None = ...,
dtype_backend: Literal["numpy_nullable", "pyarrow"] = ...,
chunked: int,
use_threads: bool | int = ...,
ray_args: RayReadParquetSettings | None = ...,
boto3_session: boto3.Session | None = ...,
s3_additional_kwargs: dict[str, Any] | None = ...,
pyarrow_additional_kwargs: dict[str, Any] | None = ...,
decryption_configuration: ArrowDecryptionConfiguration | None = ...,
) -> Iterator[pd.DataFrame]: ...
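# Usage sketch for read_parquet (paths are hypothetical). Per the overloads above,
# the default chunked=False returns a single DataFrame, while chunked=True or an
# integer chunk size returns an iterator of DataFrames:
#
#   import awswrangler as wr
#
#   df = wr.s3.read_parquet(path="s3://my-bucket/my-prefix/", dataset=True)
#
#   for chunk in wr.s3.read_parquet(path="s3://my-bucket/my-prefix/", chunked=100_000):
#       ...  # process each chunk without holding the full dataset in memory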
@overload
def read_parquet_table(
table: str,
database: str,
*,
filename_suffix: str | list[str] | None = ...,
filename_ignore_suffix: str | list[str] | None = ...,
catalog_id: str | None = ...,
partition_filter: Callable[[dict[str, str]], bool] | None = ...,
columns: list[str] | None = ...,
validate_schema: bool = ...,
coerce_int96_timestamp_unit: str | None = ...,
dtype_backend: Literal["numpy_nullable", "pyarrow"] = ...,
chunked: Literal[False] = ...,
use_threads: bool | int = ...,
ray_args: RayReadParquetSettings | None = ...,
boto3_session: boto3.Session | None = ...,
s3_additional_kwargs: dict[str, Any] | None = ...,
pyarrow_additional_kwargs: dict[str, Any] | None = ...,
decryption_configuration: ArrowDecryptionConfiguration | None = ...,
) -> pd.DataFrame: ...
@overload
def read_parquet_table(
table: str,
database: str,
*,
filename_suffix: str | list[str] | None = ...,
filename_ignore_suffix: str | list[str] | None = ...,
catalog_id: str | None = ...,
partition_filter: Callable[[dict[str, str]], bool] | None = ...,
columns: list[str] | None = ...,
validate_schema: bool = ...,
coerce_int96_timestamp_unit: str | None = ...,
dtype_backend: Literal["numpy_nullable", "pyarrow"] = ...,
chunked: Literal[True],
use_threads: bool | int = ...,
ray_args: RayReadParquetSettings | None = ...,
boto3_session: boto3.Session | None = ...,
s3_additional_kwargs: dict[str, Any] | None = ...,
pyarrow_additional_kwargs: dict[str, Any] | None = ...,
decryption_configuration: ArrowDecryptionConfiguration | None = ...,
) -> Iterator[pd.DataFrame]: ...
@overload
def read_parquet_table(
table: str,
database: str,
*,
filename_suffix: str | list[str] | None = ...,
filename_ignore_suffix: str | list[str] | None = ...,
catalog_id: str | None = ...,
partition_filter: Callable[[dict[str, str]], bool] | None = ...,
columns: list[str] | None = ...,
validate_schema: bool = ...,
coerce_int96_timestamp_unit: str | None = ...,
dtype_backend: Literal["numpy_nullable", "pyarrow"] = ...,
chunked: bool,
use_threads: bool | int = ...,
ray_args: RayReadParquetSettings | None = ...,
boto3_session: boto3.Session | None = ...,
s3_additional_kwargs: dict[str, Any] | None = ...,
pyarrow_additional_kwargs: dict[str, Any] | None = ...,
decryption_configuration: ArrowDecryptionConfiguration | None = ...,
) -> pd.DataFrame | Iterator[pd.DataFrame]: ...
@overload
def read_parquet_table(
table: str,
database: str,
*,
filename_suffix: str | list[str] | None = ...,
filename_ignore_suffix: str | list[str] | None = ...,
catalog_id: str | None = ...,
partition_filter: Callable[[dict[str, str]], bool] | None = ...,
columns: list[str] | None = ...,
validate_schema: bool = ...,
coerce_int96_timestamp_unit: str | None = ...,
dtype_backend: Literal["numpy_nullable", "pyarrow"] = ...,
chunked: int,
use_threads: bool | int = ...,
ray_args: RayReadParquetSettings | None = ...,
boto3_session: boto3.Session | None = ...,
s3_additional_kwargs: dict[str, Any] | None = ...,
pyarrow_additional_kwargs: dict[str, Any] | None = ...,
decryption_configuration: ArrowDecryptionConfiguration | None = ...,
) -> Iterator[pd.DataFrame]: ...
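# Usage sketch for read_parquet_table (table and database names are hypothetical);
# it reads the Parquet data behind a Glue catalog table, with the same
# chunked-based narrowing as read_parquet:
#
#   import awswrangler as wr
#
#   df = wr.s3.read_parquet_table(table="my_table", database="my_database")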