def rewrite_to_parquet()

in de/fileutils.py [0:0]


def rewrite_to_parquet(src_path, dest_path, block_size: int = 1024 * 1024, **kwargs) -> None:
    """
    Copy a Parquet file to another Parquet file one record batch at a time.

    The source is read with ``iter_batches`` and every batch is appended to
    the destination through a ``ParquetWriter``. Once writing has finished,
    the destination is reopened and its row count and Arrow schema are
    compared with those of the source.

    :param src_path: Path to the source Parquet file.
    :param dest_path: Path to the destination Parquet file.
    :param block_size: Forwarded to ``iter_batches`` as ``batch_size``, i.e.
        an upper bound on the number of rows per batch (not a byte count).
    :param kwargs: Extra keyword arguments forwarded unchanged to
        ``pq.ParquetWriter``.
    :raises AssertionError: If the destination's row count or Arrow schema
        differs from the source's. The destination file is left in place.
    """
    src_path = Path(src_path)
    dest_path = Path(dest_path)

    # Row-group cap handed to every ``writer.write`` call; kept at the value
    # the function has always used, independent of ``block_size``.
    row_group_rows = 1024 * 1024

    with pq.ParquetFile(src_path) as src:
        schema = src.schema.to_arrow_schema()
        expected_rows = src.metadata.num_rows
        # The context manager closes the writer even when reading or writing
        # a batch raises, so the destination handle is never leaked.
        with pq.ParquetWriter(dest_path, schema, **kwargs) as writer:
            for batch in src.iter_batches(batch_size=block_size):
                writer.write(batch, row_group_size=row_group_rows)

    # Reopen the destination only long enough to pull what the checks need.
    with pq.ParquetFile(dest_path) as dst:
        written_rows = dst.metadata.num_rows
        written_schema = dst.metadata.schema.to_arrow_schema()

    # Raised explicitly (rather than via ``assert``) so the verification
    # still runs when Python is started with -O.
    if expected_rows != written_rows:
        raise AssertionError(
            f"row count mismatch: {src_path} has {expected_rows} rows, "
            f"{dest_path} has {written_rows}"
        )
    if not (schema == written_schema):
        raise AssertionError(
            f"schema mismatch between {src_path} and {dest_path}"
        )