pyiceberg/utils/bin_packing.py (92 lines of code) (raw):
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from __future__ import annotations
from typing import (
Callable,
Generic,
Iterable,
List,
Optional,
TypeVar,
)
T = TypeVar("T")
class Bin(Generic[T]):
def __init__(self, target_weight: int) -> None:
self.bin_weight = 0
self.target_weight = target_weight
self.items: List[T] = []
def weight(self) -> int:
return self.bin_weight
def can_add(self, weight: int) -> bool:
return self.bin_weight + weight <= self.target_weight
def add(self, item: T, weight: int) -> None:
self.bin_weight += weight
self.items.append(item)
class PackingIterator(Generic[T]):
bins: List[Bin[T]]
def __init__(
self,
items: Iterable[T],
target_weight: int,
lookback: int,
weight_func: Callable[[T], int],
largest_bin_first: bool = False,
) -> None:
self.items = iter(items)
self.target_weight = target_weight
self.lookback = lookback
self.weight_func = weight_func
self.largest_bin_first = largest_bin_first
self.bins = []
def __iter__(self) -> PackingIterator[T]:
"""Return an iterator for the PackingIterator class."""
return self
def __next__(self) -> List[T]:
"""Return the next item when iterating over the PackingIterator class."""
while True:
try:
item = next(self.items)
weight = self.weight_func(item)
bin_ = self.find_bin(weight)
if bin_ is not None:
bin_.add(item, weight)
else:
bin_ = Bin(self.target_weight)
bin_.add(item, weight)
self.bins.append(bin_)
if len(self.bins) > self.lookback:
return self.remove_bin().items
except StopIteration:
break
if len(self.bins) == 0:
raise StopIteration()
return self.remove_bin().items
def find_bin(self, weight: int) -> Optional[Bin[T]]:
for bin_ in self.bins:
if bin_.can_add(weight):
return bin_
return None
def remove_bin(self) -> Bin[T]:
if self.largest_bin_first:
bin_ = max(self.bins, key=lambda b: b.weight())
self.bins.remove(bin_)
return bin_
else:
return self.bins.pop(0)
class ListPacker(Generic[T]):
_target_weight: int
_lookback: int
_largest_bin_first: bool
def __init__(self, target_weight: int, lookback: int, largest_bin_first: bool) -> None:
self._target_weight = target_weight
self._lookback = lookback
self._largest_bin_first = largest_bin_first
def pack(self, items: List[T], weight_func: Callable[[T], int]) -> List[List[T]]:
return list(
PackingIterator(
items=items,
target_weight=self._target_weight,
lookback=self._lookback,
weight_func=weight_func,
largest_bin_first=self._largest_bin_first,
)
)
def pack_end(self, items: List[T], weight_func: Callable[[T], int]) -> List[List[T]]:
packed = self.pack(items=list(reversed(items)), weight_func=weight_func)
return [list(reversed(bin_items)) for bin_items in reversed(packed)]