in de/fileutils.py [0:0]
def rewrite_to_parquet(src_path, dest_path, block_size=1024 * 1024, **kwargs):
    """
    Copy a Parquet file to a new Parquet file, streaming it batch by batch.

    After writing, the destination file is re-opened and its row count and
    schema are compared against those of the source.

    :param src_path: Path to the source Parquet file.
    :param dest_path: Path to the destination Parquet file.
    :param block_size: Forwarded to ``iter_batches`` as ``batch_size``, i.e.
        the maximum number of rows per batch (a row count, not a byte count).
    :param kwargs: Extra keyword arguments forwarded to ``pq.ParquetWriter``.
    :raises AssertionError: If the destination's row count or schema does not
        match the source's.
    """
    src_path = Path(src_path)
    dest_path = Path(dest_path)

    with pq.ParquetFile(src_path) as src:
        # Capture what the verification step needs while the source is open,
        # so it does not have to be opened a second time.
        src_num_rows = src.metadata.num_rows
        schema = src.schema.to_arrow_schema()
        # The context manager closes the writer even if a batch fails to
        # read or write, so the handle is never leaked.
        with pq.ParquetWriter(dest_path, schema, **kwargs) as writer:
            for batch in src.iter_batches(batch_size=block_size):
                writer.write(batch, row_group_size=1024 * 1024)

    # Re-open the destination to verify what actually landed on disk.
    with pq.ParquetFile(dest_path) as dst:
        dst_metadata = dst.metadata
        dst_num_rows = dst_metadata.num_rows
        dst_schema = dst_metadata.schema.to_arrow_schema()

    # Raised explicitly (rather than via ``assert``) so the checks still run
    # under ``python -O``; the exception type is kept for existing callers.
    if src_num_rows != dst_num_rows:
        raise AssertionError(
            f"row count mismatch: {src_path} has {src_num_rows} rows, "
            f"{dest_path} has {dst_num_rows}"
        )
    if schema != dst_schema:
        raise AssertionError(
            f"schema mismatch between {src_path} and {dest_path}"
        )