def parquet_files_to_data_files()

in pyiceberg/io/pyarrow.py

Reads the Parquet footer of each input path and yields one DataFile entry per file, without reading any column data. This is the helper behind the `add_files` code path.

def parquet_files_to_data_files(io: FileIO, table_metadata: TableMetadata, file_paths: Iterator[str]) -> Iterator[DataFile]:
    for file_path in file_paths:
        input_file = io.new_input(file_path)
        with input_file.open() as input_stream:
            # Only the Parquet footer (metadata) is read; column data is never loaded.
            parquet_metadata = pq.read_metadata(input_stream)

        # Files that already embed Iceberg field IDs are rejected: `add_files`
        # only supports files without field IDs, whose columns are matched to
        # the table schema by name.
        arrow_schema = parquet_metadata.schema.to_arrow_schema()
        if visit_pyarrow(arrow_schema, _HasIds()):
            raise NotImplementedError(
                f"Cannot add file {file_path} because it has field IDs. `add_files` only supports addition of files without field_ids"
            )
        # Verify the file's schema is compatible with the table schema before
        # computing statistics.
        schema = table_metadata.schema()
        _check_pyarrow_schema_compatible(schema, arrow_schema)

        # Derive per-column metrics (value counts, null counts, bounds, ...) from
        # the Parquet footer, following the table's metrics configuration.
        statistics = data_file_statistics_from_parquet_metadata(
            parquet_metadata=parquet_metadata,
            stats_columns=compute_statistics_plan(schema, table_metadata.properties),
            parquet_column_mapping=parquet_path_to_id_mapping(schema),
        )
        data_file = DataFile(
            content=DataFileContent.DATA,
            file_path=file_path,
            file_format=FileFormat.PARQUET,
            # Partition values are derived from the column statistics against the
            # table's current partition spec.
            partition=statistics.partition(table_metadata.spec(), table_metadata.schema()),
            file_size_in_bytes=len(input_file),
            sort_order_id=None,
            spec_id=table_metadata.default_spec_id,
            equality_ids=None,
            key_metadata=None,
            # Unpack the serialized statistics (record count, column sizes, bounds, ...)
            # into the remaining DataFile fields.
            **statistics.to_serialized_dict(),
        )

        yield data_file
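
A minimal sketch of driving this generator directly; in normal use the `add_files` API invokes it for you, as the error message above suggests. The catalog name, table identifier, and file path below are hypothetical:

from pyiceberg.catalog import load_catalog

table = load_catalog("default").load_table("db.events")  # hypothetical catalog/table

data_files = parquet_files_to_data_files(
    io=table.io,
    table_metadata=table.metadata,
    file_paths=iter(["s3://bucket/staging/part-00000.parquet"]),  # hypothetical path
)
for data_file in data_files:
    print(data_file.file_path, data_file.record_count)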