pyiceberg/table/puffin.py (67 lines of code) (raw):

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
import math
from typing import TYPE_CHECKING, Dict, List, Literal, Optional

from pydantic import Field
from pyroaring import BitMap, FrozenBitMap

from pyiceberg.typedef import IcebergBaseModel

if TYPE_CHECKING:
    import pyarrow as pa

# Short for: Puffin Fratercula arctica, version 1
MAGIC_BYTES = b"PFA1"
EMPTY_BITMAP = FrozenBitMap()
MAX_JAVA_SIGNED = int(math.pow(2, 31)) - 1
PROPERTY_REFERENCED_DATA_FILE = "referenced-data-file"


def _deserialize_bitmap(pl: bytes) -> List[BitMap]:
    """Decode a sequence of keyed 32-bit bitmaps from a binary payload.

    The payload starts with an 8-byte little-endian bitmap count. Each entry
    that follows is a 4-byte little-endian key and then one serialized bitmap.

    Args:
        pl: The raw payload to decode.

    Returns:
        A list in which index ``k`` holds the bitmap stored under key ``k``.
        Keys that are absent from the payload are filled with ``EMPTY_BITMAP``.

    Raises:
        ValueError: If the keys are not strictly ascending, or if a key exceeds
            ``MAX_JAVA_SIGNED``.
    """
    number_of_bitmaps = int.from_bytes(pl[0:8], byteorder="little")
    pl = pl[8:]

    bitmaps = []
    last_key = -1
    for _ in range(number_of_bitmaps):
        # Parsed as unsigned, so the key can never be negative.
        key = int.from_bytes(pl[0:4], byteorder="little")
        if key <= last_key:
            raise ValueError("Keys must be sorted in ascending order")
        if key > MAX_JAVA_SIGNED:
            raise ValueError(f"Key {key} is too large, max {MAX_JAVA_SIGNED} to maintain compatibility with Java impl")
        pl = pl[4:]

        # Pad the gap so that the list index always equals the key.
        while last_key < key - 1:
            bitmaps.append(EMPTY_BITMAP)
            last_key += 1

        bm = BitMap().deserialize(pl)
        # TODO: Optimize this
        # The bitmap is re-serialized only to learn how many bytes it consumed.
        pl = pl[len(bm.serialize()) :]
        bitmaps.append(bm)

        last_key = key

    return bitmaps


class PuffinBlobMetadata(IcebergBaseModel):
    """Footer entry describing a single blob stored in a Puffin file."""

    type: Literal["deletion-vector-v1"] = Field()
    fields: List[int] = Field()
    snapshot_id: int = Field(alias="snapshot-id")
    sequence_number: int = Field(alias="sequence-number")
    # Position and size of the blob; used by PuffinFile to slice the blob bytes.
    offset: int = Field()
    length: int = Field()
    compression_codec: Optional[str] = Field(alias="compression-codec", default=None)
    properties: Dict[str, str] = Field(default_factory=dict)


class Footer(IcebergBaseModel):
    """Parsed JSON footer payload of a Puffin file."""

    blobs: List[PuffinBlobMetadata] = Field()
    properties: Dict[str, str] = Field(default_factory=dict)


def _bitmaps_to_chunked_array(bitmaps: List[BitMap]) -> "pa.ChunkedArray":
    """Convert keyed 32-bit bitmaps into one chunked array of 64-bit positions.

    Each bitmap becomes one chunk. A position is the bitmap's list index
    shifted into the upper 32 bits, plus the value stored in the bitmap.

    Args:
        bitmaps: Bitmaps ordered so that the list index is the high 32-bit key.

    Returns:
        A ``pa.ChunkedArray`` of type int64 with one chunk per bitmap.
    """
    import pyarrow as pa

    # The type is pinned because inference cannot handle empty input: an empty
    # bitmap would produce a null-typed chunk that clashes with int64 chunks,
    # and an empty list of bitmaps would leave no type to infer at all.
    # _deserialize_bitmap caps keys at MAX_JAVA_SIGNED, so values fit in int64.
    chunks = [[(key_pos << 32) + pos for pos in bitmap] for key_pos, bitmap in enumerate(bitmaps)]
    return pa.chunked_array(chunks, type=pa.int64())


class PuffinFile:
    """In-memory reader for a Puffin file containing deletion vectors.

    Attributes are populated once, in ``__init__``:

    * ``footer`` holds the parsed footer.
    * ``_deletion_vectors`` maps each blob's ``referenced-data-file`` property
      to its decoded bitmaps.
    """

    footer: Footer
    _deletion_vectors: Dict[str, List[BitMap]]

    def __init__(self, puffin: bytes) -> None:
        """Parse the footer and decode every blob of a Puffin file.

        Args:
            puffin: The complete content of the Puffin file.

        Raises:
            ValueError: If the magic bytes are wrong, the file is too short to
                hold a footer, the footer is flagged as compressed, or the
                footer payload size does not fit inside the file.
            KeyError: If a blob has no ``referenced-data-file`` property.
        """
        for magic_bytes in [puffin[:4], puffin[-4:]]:
            if magic_bytes != MAGIC_BYTES:
                raise ValueError(f"Incorrect magic bytes, expected {MAGIC_BYTES!r}, got {magic_bytes!r}")

        # Leading magic (4) + footer payload size (4) + flags (4) + trailing magic (4).
        fixed_size = 16
        if len(puffin) < fixed_size:
            raise ValueError(f"The Puffin-file is too short: expected at least {fixed_size} bytes, got {len(puffin)}")

        # One flag is set, the rest should be zero
        # byte 0 (first)
        # - bit 0 (lowest bit): whether FooterPayload is compressed
        # - all other bits are reserved for future use and should be set to 0 on write
        flags = puffin[-8:-4]
        if flags[0] != 0:
            raise ValueError("The Puffin-file has a compressed footer, which is not yet supported")

        # 4 byte integer is always signed, in a two's complement representation, stored little-endian.
        footer_payload_size_int = int.from_bytes(puffin[-12:-8], byteorder="little", signed=True)
        # Reject sizes that would make the slice below wrap around or clamp.
        if not 0 <= footer_payload_size_int <= len(puffin) - fixed_size:
            raise ValueError(f"Invalid footer payload size: {footer_payload_size_int}")

        self.footer = Footer.model_validate_json(puffin[-(footer_payload_size_int + 12) : -12])

        # Blob offsets are applied after dropping the first 8 bytes of the file.
        # NOTE(review): presumably this makes each slice start past the blob's
        # 4-byte length prefix and 4-byte magic - confirm against the
        # deletion-vector-v1 blob layout.
        blob_bytes = puffin[8:]

        self._deletion_vectors = {
            blob.properties[PROPERTY_REFERENCED_DATA_FILE]: _deserialize_bitmap(
                blob_bytes[blob.offset : blob.offset + blob.length]
            )
            for blob in self.footer.blobs
        }

    def to_vector(self) -> Dict[str, "pa.ChunkedArray"]:
        """Return the deleted positions per referenced data file.

        Returns:
            A dict mapping each referenced data file path to a chunked array of
            its 64-bit positions.
        """
        return {path: _bitmaps_to_chunked_array(bitmaps) for path, bitmaps in self._deletion_vectors.items()}