def parquet_file_to_data_file()

in pyiceberg/io/pyarrow.py

Builds a DataFile manifest entry for an existing Parquet file by reading only the file's footer metadata, so the file can be committed to an Iceberg table without rewriting its contents. This is the helper behind `add_files`.

def parquet_file_to_data_file(io: FileIO, table_metadata: TableMetadata, file_path: str) -> DataFile:
    # Only the Parquet footer is read; the data pages are never scanned.
    input_file = io.new_input(file_path)
    with input_file.open() as input_stream:
        parquet_metadata = pq.read_metadata(input_stream)

    # Refuse files that already carry field IDs: `add_files` only supports
    # files without field IDs.
    arrow_schema = parquet_metadata.schema.to_arrow_schema()
    if visit_pyarrow(arrow_schema, _HasIds()):
        raise NotImplementedError(
            f"Cannot add file {file_path} because it has field IDs. `add_files` only supports addition of files without field_ids"
        )

    # The file's schema must be compatible with the current table schema.
    schema = table_metadata.schema()
    _check_pyarrow_schema_compatible(schema, arrow_schema)

    # Collect per-column metrics (value/null counts, bounds, ...) from the
    # footer, honoring the table's metrics properties.
    statistics = data_file_statistics_from_parquet_metadata(
        parquet_metadata=parquet_metadata,
        stats_columns=compute_statistics_plan(schema, table_metadata.properties),
        parquet_column_mapping=parquet_path_to_id_mapping(schema),
    )
    data_file = DataFile.from_args(
        content=DataFileContent.DATA,
        file_path=file_path,
        file_format=FileFormat.PARQUET,
        # Partition values are derived from the column statistics against the
        # table's current partition spec.
        partition=statistics.partition(table_metadata.spec(), table_metadata.schema()),
        file_size_in_bytes=len(input_file),
        sort_order_id=None,
        spec_id=table_metadata.default_spec_id,
        equality_ids=None,
        key_metadata=None,
        **statistics.to_serialized_dict(),
    )

    return data_file
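
A minimal usage sketch, assuming a configured catalog named "default", an existing table db.events, and a Parquet file already present in the warehouse; the catalog name, table name, and file path below are placeholders. In normal use this helper is invoked indirectly through Table.add_files:

from pyiceberg.catalog import load_catalog
from pyiceberg.io.pyarrow import parquet_file_to_data_file

# Placeholder catalog/table names and file path.
catalog = load_catalog("default")
table = catalog.load_table("db.events")

# Build the DataFile entry for a Parquet file the table does not yet track.
data_file = parquet_file_to_data_file(
    io=table.io,
    table_metadata=table.metadata,
    file_path="s3://warehouse/db/events/data/part-00000.parquet",
)
print(data_file.file_path, data_file.record_count, data_file.file_size_in_bytes)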