# python/hudi/_internal.pyi
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple
import pyarrow # type: ignore
__version__: str
@dataclass(init=False)
class HudiFileGroupReader:
"""
The reader that handles all read operations against a file group.
Attributes:
base_uri (str): The base URI of the file group's residing table.
options (Optional[Dict[str, str]]): Additional options for the reader.
"""
def __init__(self, base_uri: str, options: Optional[Dict[str, str]] = None):
"""
Initializes the HudiFileGroupReader.
Parameters:
base_uri (str): The base URI of the Hudi table.
options (Optional[Dict[str, str]]): Additional configuration options (optional).
"""
...
def read_file_slice_by_base_file_path(
self, relative_path: str
) -> "pyarrow.RecordBatch":
"""
Reads the data from the base file at the given relative path.
Parameters:
relative_path (str): The relative path to the base file.
Returns:
pyarrow.RecordBatch: A record batch read from the base file.
"""
...
def read_file_slice(self, file_slice: HudiFileSlice) -> "pyarrow.RecordBatch":
"""
Reads the data from the given file slice.
Parameters:
file_slice (HudiFileSlice): The file slice to read from.
Returns:
pyarrow.RecordBatch: A record batch read from the file slice.
"""
...
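# Usage sketch (illustrative only; the table URI and base-file path below are
# hypothetical): construct a reader against a table's base URI, then read one
# base file of a file slice into a pyarrow.RecordBatch.
#
#     reader = HudiFileGroupReader("s3://bucket/hudi_trips")
#     batch = reader.read_file_slice_by_base_file_path(
#         "city=san_francisco/a1b2c3d4-0_0-7-24_20240402123035233.parquet"
#     )
#     print(batch.num_rows)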
@dataclass(init=False)
class HudiFileSlice:
"""
Represents a file slice in a Hudi table. A file slice includes information about the base file,
the partition it belongs to, and associated metadata.
Attributes:
file_id (str): The id of the file group this file slice belongs to.
partition_path (str): The path of the partition containing this file slice.
creation_instant_time (str): The creation instant time of this file slice.
base_file_name (str): The name of the base file.
base_file_size (int): The on-disk size of the base file in bytes.
base_file_byte_size (int): The in-memory size of the base file in bytes.
log_file_names (List[str]): The names of the ordered log files.
num_records (int): The number of records in the file slice.
"""
file_id: str
partition_path: str
creation_instant_time: str
base_file_name: str
base_file_size: int
base_file_byte_size: int
log_file_names: List[str]
num_records: int
def base_file_relative_path(self) -> str:
"""
Returns the relative path of the base file for this file slice.
Returns:
str: The relative path of the base file.
"""
...
def log_files_relative_paths(self) -> List[str]:
"""
Returns the relative paths of the log files for this file slice.
Returns:
List[str]: A list of relative paths of the log files.
"""
...
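# Sketch combining HudiFileSlice with HudiFileGroupReader (the table URI is
# hypothetical): list the table's file slices, inspect their metadata, and read
# each slice into a record batch.
#
#     table = HudiTable("s3://bucket/hudi_trips")
#     reader = table.create_file_group_reader_with_options()
#     for fs in table.get_file_slices(None):
#         print(fs.file_id, fs.partition_path, fs.num_records)
#         print(fs.base_file_relative_path(), fs.log_files_relative_paths())
#         batch = reader.read_file_slice(fs)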
@dataclass(init=False)
class HudiInstant:
@property
def timestamp(self) -> str: ...
@property
def action(self) -> str: ...
@property
def state(self) -> str: ...
@property
def epoch_mills(self) -> int: ...
@dataclass(init=False)
class HudiTable:
"""
Represents a Hudi table and provides methods to interact with it.
Attributes:
base_uri (str): The base URI of the Hudi table.
options (Optional[Dict[str, str]]): Additional options for table operations.
"""
def __init__(
self,
base_uri: str,
options: Optional[Dict[str, str]] = None,
):
"""
Initializes the HudiTable.
Parameters:
base_uri (str): The base URI of the Hudi table.
options (Optional[Dict[str, str]]): Additional configuration options (optional).
"""
...
def hudi_options(self) -> Dict[str, str]:
"""
Get the Hudi options for the table.
Returns:
Dict[str, str]: A dictionary of Hudi options.
"""
...
def storage_options(self) -> Dict[str, str]:
"""
Get the storage options set for this table instance.
Returns:
Dict[str, str]: A dictionary of storage options.
"""
...
@property
def table_name(self) -> str:
"""
Get table name.
Returns:
str: The name of the table.
"""
...
@property
def table_type(self) -> str:
"""
Get table type.
Returns:
str: The type of the table.
"""
...
@property
def is_mor(self) -> bool:
"""
Get whether the table is a MOR (merge-on-read) table.
Returns:
bool: True if the table is a MOR table, False otherwise.
"""
...
@property
def timezone(self) -> str:
"""
Get timezone.
Returns:
str: The timezone of the table.
"""
...
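# Usage sketch for the constructor, option accessors, and metadata properties
# (the URI and option key below are hypothetical/illustrative):
#
#     table = HudiTable("s3://bucket/hudi_trips", options={"aws_region": "us-west-2"})
#     print(table.table_name, table.table_type, table.is_mor, table.timezone)
#     print(table.hudi_options())
#     print(table.storage_options())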
def get_avro_schema(self) -> str:
"""
Returns the Avro schema of the Hudi table.
Returns:
str: The Avro schema of the table.
"""
...
def get_schema(self) -> "pyarrow.Schema":
"""
Returns the schema of the Hudi table.
Returns:
pyarrow.Schema: The schema of the table.
"""
...
def get_partition_schema(self) -> "pyarrow.Schema":
"""
Returns the partition schema of the Hudi table.
Returns:
pyarrow.Schema: The schema used for partitioning the table.
"""
...
def get_timeline(self) -> HudiTimeline:
"""
Returns the timeline of the Hudi table.
"""
...
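# Sketch of schema and timeline access (table URI hypothetical):
#
#     table = HudiTable("s3://bucket/hudi_trips")
#     arrow_schema = table.get_schema()
#     partition_schema = table.get_partition_schema()
#     avro_schema_str = table.get_avro_schema()
#     timeline = table.get_timeline()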
def get_file_slices_splits(
self, n: int, filters: Optional[List[Tuple[str, str, str]]]
) -> List[List[HudiFileSlice]]:
"""
Retrieves all file slices in the Hudi table in 'n' splits, optionally filtered by given filters.
Parameters:
n (int): The number of parts to split the file slices into.
filters (Optional[List[Tuple[str, str, str]]]): Optional filters for selecting file slices.
Returns:
List[List[HudiFileSlice]]: A list of file slice groups, each group being a list of HudiFileSlice objects.
"""
...
def get_file_slices_splits_as_of(
self, n: int, timestamp: str, filters: Optional[List[Tuple[str, str, str]]]
) -> List[List[HudiFileSlice]]:
"""
Retrieves all file slices in the Hudi table as of a timestamp in 'n' splits, optionally filtered by given filters.
"""
...
def get_file_slices(
self, filters: Optional[List[Tuple[str, str, str]]]
) -> List[HudiFileSlice]:
"""
Retrieves all file slices in the Hudi table, optionally filtered by the provided filters.
Parameters:
filters (Optional[List[Tuple[str, str, str]]]): Optional filters for selecting file slices.
Returns:
List[HudiFileSlice]: A list of file slices matching the filters.
"""
...
def get_file_slices_as_of(
self, timestamp: str, filters: Optional[List[Tuple[str, str, str]]]
) -> List[HudiFileSlice]:
"""
Retrieves all file slices in the Hudi table as of a timestamp, optionally filtered by the provided filters.
"""
...
def get_file_slices_between(
self,
start_timestamp: Optional[str],
end_timestamp: Optional[str],
) -> List[HudiFileSlice]:
"""
Retrieves all changed file slices in the Hudi table between the given timestamps.
"""
...
def create_file_group_reader_with_options(
self, options: Optional[Dict[str, str]] = None
) -> HudiFileGroupReader:
"""
Creates a HudiFileGroupReader for reading records from file groups in the Hudi table.
Parameters:
options (Optional[Dict[str, str]]): Additional configuration options for the reader (optional).
Returns:
HudiFileGroupReader: A reader object for reading file groups.
"""
...
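# Sketch of file-slice listing and reading, continuing from a `table` constructed
# as in the earlier examples (timestamps and filter values are hypothetical; each
# filter is assumed to be a (field, operator, value) tuple of strings):
#
#     filters = [("city", "=", "san_francisco")]
#     slices = table.get_file_slices(filters)
#     splits = table.get_file_slices_splits(4, filters)  # 4 groups for parallel reads
#     older = table.get_file_slices_as_of("20240402123035233", filters)
#     changed = table.get_file_slices_between("20240101000000000", "20240402123035233")
#     reader = table.create_file_group_reader_with_options()
#     batches = [reader.read_file_slice(fs) for fs in slices]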
def read_snapshot(
self, filters: Optional[List[Tuple[str, str, str]]]
) -> List["pyarrow.RecordBatch"]:
"""
Reads the latest snapshot of the Hudi table, optionally filtered by the provided filters.
Parameters:
filters (Optional[List[Tuple[str, str, str]]]): Optional filters for selecting file slices.
Returns:
List[pyarrow.RecordBatch]: A list of record batches from the snapshot of the table.
"""
...
def read_snapshot_as_of(
self, timestamp: str, filters: Optional[List[Tuple[str, str, str]]]
) -> List["pyarrow.RecordBatch"]:
"""
Reads the snapshot of the Hudi table as of a timestamp, optionally filtered by the provided filters.
"""
...
def read_incremental_records(
self, start_timestamp: str, end_timestamp: Optional[str]
) -> List["pyarrow.RecordBatch"]:
"""
Reads incremental records from the Hudi table between the given timestamps.
Parameters:
start_timestamp (str): The start timestamp (exclusive).
end_timestamp (Optional[str]): The end timestamp (inclusive).
Returns:
List[pyarrow.RecordBatch]: A list of record batches containing incremental records.
"""
...
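# Sketch of table-level reads (URI and timestamps hypothetical): the returned
# record batches can be combined into a single pyarrow.Table.
#
#     table = HudiTable("s3://bucket/hudi_trips")
#     batches = table.read_snapshot(None)
#     arrow_table = pyarrow.Table.from_batches(batches)
#     older = table.read_snapshot_as_of("20240402123035233", None)
#     changes = table.read_incremental_records("20240101000000000", "20240402123035233")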
@dataclass(init=False)
class HudiTimeline:
def get_completed_commits(self, desc: bool = False) -> List[HudiInstant]: ...
def get_completed_deltacommits(self, desc: bool = False) -> List[HudiInstant]: ...
def get_completed_replacecommits(self, desc: bool = False) -> List[HudiInstant]: ...
def get_completed_clustering_commits(
self, desc: bool = False
) -> List[HudiInstant]: ...
def get_instant_metadata_in_json(self, instant: HudiInstant) -> str: ...
def get_latest_commit_timestamp(self) -> str: ...
def get_latest_avro_schema(self) -> str: ...
def get_latest_schema(self) -> "pyarrow.Schema": ...
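# Sketch of timeline inspection (table URI hypothetical): walk completed commits
# newest-first and fetch per-instant metadata as JSON.
#
#     timeline = HudiTable("s3://bucket/hudi_trips").get_timeline()
#     for instant in timeline.get_completed_commits(desc=True):
#         print(instant.timestamp, instant.action, instant.state, instant.epoch_mills)
#         meta_json = timeline.get_instant_metadata_in_json(instant)
#     print(timeline.get_latest_commit_timestamp())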
def build_hudi_table(
base_uri: str,
hudi_options: Optional[Dict[str, str]] = None,
storage_options: Optional[Dict[str, str]] = None,
options: Optional[Dict[str, str]] = None,
) -> HudiTable:
"""
Builds a HudiTable from the given base URI and options.
Parameters:
base_uri (str): The base URI of the Hudi table.
hudi_options (Optional[Dict[str, str]]): Hudi configuration options.
storage_options (Optional[Dict[str, str]]): Storage (filesystem) options.
options (Optional[Dict[str, str]]): Hudi or storage options.
Returns:
HudiTable: An instance of HudiTable.
"""
...
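# Sketch (the URI and option keys below are hypothetical): build a table with Hudi
# options and storage options supplied separately, or merged through `options`.
#
#     table = build_hudi_table(
#         "s3://bucket/hudi_trips",
#         hudi_options={"hoodie.read.as.of.timestamp": "20240402123035233"},
#         storage_options={"aws_region": "us-west-2"},
#     )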