def get_page_chunk_sizes()

in de/fileutils.py [0:0]


def get_page_chunk_sizes(paths):
    """Yield the size and value count of every data page in the given files.

    For each entry of ``paths`` the external ``parquet-layout`` command is
    run with that entry as its only argument, and its standard output is
    parsed as JSON. The parsed layout is walked as row groups -> columns ->
    pages, and only pages whose ``page_type`` starts with ``"data"`` are
    reported.

    This is a generator: the command for a given path is only executed once
    iteration reaches that path.

    Args:
        paths: Iterable of file paths handed to ``parquet-layout`` one by one.

    Yields:
        ``(uncompressed_bytes, num_values)`` tuples, one per data page, in
        the order the pages appear in the layout output.

    Raises:
        subprocess.CalledProcessError: If ``parquet-layout`` exits with a
            non-zero status.
        json.JSONDecodeError: If the command output is not valid JSON.
        KeyError: If an expected key is missing from the layout.
    """
    for parquet_file in paths:
        # Ask parquet-layout for the file's physical layout (JSON on stdout).
        raw_layout = subprocess.check_output(
            ["parquet-layout", parquet_file], text=True
        )
        layout = json.loads(raw_layout)

        # Flatten row groups -> columns -> pages, keeping only data pages.
        yield from (
            (pg["uncompressed_bytes"], pg["num_values"])
            for group in layout["row_groups"]
            for col in group["columns"]
            for pg in col["pages"]
            if pg["page_type"].startswith("data")
        )