# pyiceberg/table/puffin.py
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import math
from typing import TYPE_CHECKING, Dict, List, Literal, Optional
from pydantic import Field
from pyroaring import BitMap, FrozenBitMap
from pyiceberg.typedef import IcebergBaseModel
if TYPE_CHECKING:
import pyarrow as pa
# Short for: Puffin Fratercula arctica, version 1
MAGIC_BYTES = b"PFA1"
# Shared immutable sentinel used to fill gaps between sparse "high" keys.
EMPTY_BITMAP = FrozenBitMap()
# Largest value of a signed 32-bit Java int; keys above this would not
# round-trip through the Java implementation. Computed with integer
# arithmetic instead of float math.pow to avoid any float round-trip.
MAX_JAVA_SIGNED = (1 << 31) - 1
# Blob property naming the data file a deletion vector applies to.
PROPERTY_REFERENCED_DATA_FILE = "referenced-data-file"
def _deserialize_bitmap(pl: bytes) -> List[BitMap]:
    """Deserialize a 64-bit roaring bitmap payload into per-key 32-bit bitmaps.

    The payload starts with an 8-byte little-endian count of (key, bitmap)
    pairs, followed by each pair: a 4-byte little-endian unsigned "high" key
    and a serialized 32-bit roaring bitmap. Gaps between keys are padded with
    shared empty bitmaps so that the result can be indexed directly by key.

    Args:
        pl: The serialized bitmap payload.

    Returns:
        A list of 32-bit bitmaps where position ``i`` holds the bitmap for
        high key ``i``.

    Raises:
        ValueError: If keys are not strictly ascending, or a key exceeds the
            maximum signed 32-bit Java integer.
    """
    number_of_bitmaps = int.from_bytes(pl[0:8], byteorder="little")
    pl = pl[8:]
    bitmaps: List[BitMap] = []
    last_key = -1
    for _ in range(number_of_bitmaps):
        # int.from_bytes is unsigned by default, so key is always >= 0;
        # no negative-key check is needed.
        key = int.from_bytes(pl[0:4], byteorder="little")
        if key <= last_key:
            raise ValueError("Keys must be sorted in ascending order")
        if key > MAX_JAVA_SIGNED:
            raise ValueError(f"Key {key} is too large, max {MAX_JAVA_SIGNED} to maintain compatibility with Java impl")
        pl = pl[4:]
        # Pad skipped keys with the shared empty bitmap so list index == key.
        while last_key < key - 1:
            bitmaps.append(EMPTY_BITMAP)
            last_key += 1
        bm = BitMap().deserialize(pl)
        # TODO: Optimize this — re-serializing the bitmap just to learn how
        # many bytes were consumed is wasteful.
        pl = pl[len(bm.serialize()) :]
        bitmaps.append(bm)
        last_key = key
    return bitmaps
class PuffinBlobMetadata(IcebergBaseModel):
    """Metadata for a single blob in a Puffin file footer.

    Field aliases follow the hyphenated property names used in the
    footer's JSON payload.
    """

    # Only deletion-vector blobs are accepted by this reader.
    type: Literal["deletion-vector-v1"] = Field()
    # Iceberg field IDs the blob applies to.
    fields: List[int] = Field()
    snapshot_id: int = Field(alias="snapshot-id")
    sequence_number: int = Field(alias="sequence-number")
    # Byte position and size of the blob within the Puffin file.
    offset: int = Field()
    length: int = Field()
    compression_codec: Optional[str] = Field(alias="compression-codec", default=None)
    # Free-form key/value properties, e.g. "referenced-data-file".
    properties: Dict[str, str] = Field(default_factory=dict)
class Footer(IcebergBaseModel):
    """The JSON footer payload of a Puffin file: blob metadata plus file-level properties."""

    blobs: List[PuffinBlobMetadata] = Field()
    properties: Dict[str, str] = Field(default_factory=dict)
def _bitmaps_to_chunked_array(bitmaps: List[BitMap]) -> "pa.ChunkedArray":
    """Combine per-key 32-bit bitmaps into one chunked array of 64-bit positions.

    Each bitmap's index in the list supplies the upper 32 bits of every
    position it contains; one chunk is produced per bitmap.
    """
    import pyarrow as pa

    chunks = []
    for high, bitmap in enumerate(bitmaps):
        base = high << 32
        chunks.append([base + low for low in bitmap])
    return pa.chunked_array(chunks)
class PuffinFile:
    """In-memory reader for a Puffin file.

    Parses the footer on construction and eagerly deserializes every blob
    into deletion vectors keyed by the referenced data-file path.
    """

    # Parsed footer (blob metadata and file properties).
    footer: Footer
    # Maps referenced data-file path -> per-high-key 32-bit bitmaps.
    _deletion_vectors: Dict[str, List[BitMap]]

    def __init__(self, puffin: bytes) -> None:
        """Parse the complete bytes of a Puffin file.

        Raises:
            ValueError: If the magic bytes are wrong, or the footer payload
                is compressed (not yet supported).
        """
        # The file must both start and end with the 4-byte magic.
        for magic_bytes in [puffin[:4], puffin[-4:]]:
            if magic_bytes != MAGIC_BYTES:
                raise ValueError(f"Incorrect magic bytes, expected {MAGIC_BYTES!r}, got {magic_bytes!r}")
        # One flag is set, the rest should be zero
        # byte 0 (first)
        # - bit 0 (lowest bit): whether FooterPayload is compressed
        # - all other bits are reserved for future use and should be set to 0 on write
        flags = puffin[-8:-4]
        if flags[0] != 0:
            raise ValueError("The Puffin-file has a compressed footer, which is not yet supported")
        # 4 byte integer is always signed, in a two's complement representation, stored little-endian.
        footer_payload_size_int = int.from_bytes(puffin[-12:-8], byteorder="little")
        # Footer payload sits immediately before size (4) + flags (4) + magic (4).
        self.footer = Footer.model_validate_json(puffin[-(footer_payload_size_int + 12) : -12])
        # NOTE(review): the buffer is shifted by 8 bytes before applying blob
        # offsets — presumably to skip an 8-byte prefix (e.g. blob length +
        # magic) preceding each serialized bitmap. Confirm against the Puffin
        # deletion-vector-v1 blob layout.
        puffin = puffin[8:]
        self._deletion_vectors = {
            blob.properties[PROPERTY_REFERENCED_DATA_FILE]: _deserialize_bitmap(puffin[blob.offset : blob.offset + blob.length])
            for blob in self.footer.blobs
        }

    def to_vector(self) -> Dict[str, "pa.ChunkedArray"]:
        """Return each deletion vector as a pyarrow ChunkedArray of 64-bit positions, keyed by data-file path."""
        return {path: _bitmaps_to_chunked_array(bitmaps) for path, bitmaps in self._deletion_vectors.items()}