arrow-buffer/src/buffer/offset.rs (49 lines of code) (raw):
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use crate::buffer::ScalarBuffer;
use crate::{ArrowNativeType, MutableBuffer, OffsetBufferBuilder};
use std::ops::Deref;
/// A non-empty buffer of monotonically increasing, positive integers.
///
/// [`OffsetBuffer`] are used to represent ranges of offsets. An
/// `OffsetBuffer` of `N+1` items contains `N` such ranges. The start
/// offset for element `i` is `offsets[i]` and the end offset is
/// `offsets[i+1]`. Equal offsets represent an empty range.
///
/// # Example
///
/// This example shows how 5 distinct ranges, are represented using a
/// 6 entry `OffsetBuffer`. The first entry `(0, 3)` represents the
/// three offsets `0, 1, 2`. The entry `(3,3)` represent no offsets
/// (e.g. an empty list).
///
/// ```text
/// ┌───────┐ ┌───┐
/// │ (0,3) │ │ 0 │
/// ├───────┤ ├───┤
/// │ (3,3) │ │ 3 │
/// ├───────┤ ├───┤
/// │ (3,4) │ │ 3 │
/// ├───────┤ ├───┤
/// │ (4,5) │ │ 4 │
/// ├───────┤ ├───┤
/// │ (5,7) │ │ 5 │
/// └───────┘ ├───┤
/// │ 7 │
/// └───┘
///
/// Offsets Buffer
/// Logical
/// Offsets
///
/// (offsets[i],
/// offsets[i+1])
/// ```
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct OffsetBuffer<O: ArrowNativeType>(ScalarBuffer<O>);
impl<O: ArrowNativeType> OffsetBuffer<O> {
/// Create a new [`OffsetBuffer`] from the provided [`ScalarBuffer`]
///
/// # Panics
///
/// Panics if `buffer` is not a non-empty buffer containing
/// monotonically increasing values greater than or equal to zero
pub fn new(buffer: ScalarBuffer<O>) -> Self {
assert!(!buffer.is_empty(), "offsets cannot be empty");
assert!(
buffer[0] >= O::usize_as(0),
"offsets must be greater than 0"
);
assert!(
buffer.windows(2).all(|w| w[0] <= w[1]),
"offsets must be monotonically increasing"
);
Self(buffer)
}
/// Create a new [`OffsetBuffer`] from the provided [`ScalarBuffer`]
///
/// # Safety
///
/// `buffer` must be a non-empty buffer containing monotonically increasing
/// values greater than or equal to zero
pub unsafe fn new_unchecked(buffer: ScalarBuffer<O>) -> Self {
Self(buffer)
}
/// Create a new [`OffsetBuffer`] containing a single 0 value
pub fn new_empty() -> Self {
let buffer = MutableBuffer::from_len_zeroed(std::mem::size_of::<O>());
Self(buffer.into_buffer().into())
}
/// Create a new [`OffsetBuffer`] containing `len + 1` `0` values
pub fn new_zeroed(len: usize) -> Self {
let len_bytes = len
.checked_add(1)
.and_then(|o| o.checked_mul(std::mem::size_of::<O>()))
.expect("overflow");
let buffer = MutableBuffer::from_len_zeroed(len_bytes);
Self(buffer.into_buffer().into())
}
/// Create a new [`OffsetBuffer`] from the iterator of slice lengths
///
/// ```
/// # use arrow_buffer::OffsetBuffer;
/// let offsets = OffsetBuffer::<i32>::from_lengths([1, 3, 5]);
/// assert_eq!(offsets.as_ref(), &[0, 1, 4, 9]);
/// ```
///
/// # Panics
///
/// Panics on overflow
pub fn from_lengths<I>(lengths: I) -> Self
where
I: IntoIterator<Item = usize>,
{
let iter = lengths.into_iter();
let mut out = Vec::with_capacity(iter.size_hint().0 + 1);
out.push(O::usize_as(0));
let mut acc = 0_usize;
for length in iter {
acc = acc.checked_add(length).expect("usize overflow");
out.push(O::usize_as(acc))
}
// Check for overflow
O::from_usize(acc).expect("offset overflow");
Self(out.into())
}
/// Get an Iterator over the lengths of this [`OffsetBuffer`]
///
/// ```
/// # use arrow_buffer::{OffsetBuffer, ScalarBuffer};
/// let offsets = OffsetBuffer::<_>::new(ScalarBuffer::<i32>::from(vec![0, 1, 4, 9]));
/// assert_eq!(offsets.lengths().collect::<Vec<usize>>(), vec![1, 3, 5]);
/// ```
///
/// Empty [`OffsetBuffer`] will return an empty iterator
/// ```
/// # use arrow_buffer::OffsetBuffer;
/// let offsets = OffsetBuffer::<i32>::new_empty();
/// assert_eq!(offsets.lengths().count(), 0);
/// ```
///
/// This can be used to merge multiple [`OffsetBuffer`]s to one
/// ```
/// # use arrow_buffer::{OffsetBuffer, ScalarBuffer};
///
/// let buffer1 = OffsetBuffer::<i32>::from_lengths([2, 6, 3, 7, 2]);
/// let buffer2 = OffsetBuffer::<i32>::from_lengths([1, 3, 5, 7, 9]);
///
/// let merged = OffsetBuffer::<i32>::from_lengths(
/// vec![buffer1, buffer2].iter().flat_map(|x| x.lengths())
/// );
///
/// assert_eq!(merged.lengths().collect::<Vec<_>>(), &[2, 6, 3, 7, 2, 1, 3, 5, 7, 9]);
/// ```
pub fn lengths(&self) -> impl ExactSizeIterator<Item = usize> + '_ {
self.0.windows(2).map(|x| x[1].as_usize() - x[0].as_usize())
}
/// Free up unused memory.
pub fn shrink_to_fit(&mut self) {
self.0.shrink_to_fit();
}
/// Returns the inner [`ScalarBuffer`]
pub fn inner(&self) -> &ScalarBuffer<O> {
&self.0
}
/// Returns the inner [`ScalarBuffer`], consuming self
pub fn into_inner(self) -> ScalarBuffer<O> {
self.0
}
/// Returns a zero-copy slice of this buffer with length `len` and starting at `offset`
pub fn slice(&self, offset: usize, len: usize) -> Self {
Self(self.0.slice(offset, len.saturating_add(1)))
}
/// Returns true if this [`OffsetBuffer`] is equal to `other`, using pointer comparisons
/// to determine buffer equality. This is cheaper than `PartialEq::eq` but may
/// return false when the arrays are logically equal
#[inline]
pub fn ptr_eq(&self, other: &Self) -> bool {
self.0.ptr_eq(&other.0)
}
}
impl<T: ArrowNativeType> Deref for OffsetBuffer<T> {
type Target = [T];
#[inline]
fn deref(&self) -> &Self::Target {
&self.0
}
}
impl<T: ArrowNativeType> AsRef<[T]> for OffsetBuffer<T> {
#[inline]
fn as_ref(&self) -> &[T] {
self
}
}
impl<O: ArrowNativeType> From<OffsetBufferBuilder<O>> for OffsetBuffer<O> {
fn from(value: OffsetBufferBuilder<O>) -> Self {
value.finish()
}
}
impl<O: ArrowNativeType> Default for OffsetBuffer<O> {
fn default() -> Self {
Self::new_empty()
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
#[should_panic(expected = "offsets cannot be empty")]
fn empty_offsets() {
OffsetBuffer::new(Vec::<i32>::new().into());
}
#[test]
#[should_panic(expected = "offsets must be greater than 0")]
fn negative_offsets() {
OffsetBuffer::new(vec![-1, 0, 1].into());
}
#[test]
fn offsets() {
OffsetBuffer::new(vec![0, 1, 2, 3].into());
let offsets = OffsetBuffer::<i32>::new_zeroed(3);
assert_eq!(offsets.as_ref(), &[0; 4]);
let offsets = OffsetBuffer::<i32>::new_zeroed(0);
assert_eq!(offsets.as_ref(), &[0; 1]);
}
#[test]
#[should_panic(expected = "overflow")]
fn offsets_new_zeroed_overflow() {
OffsetBuffer::<i32>::new_zeroed(usize::MAX);
}
#[test]
#[should_panic(expected = "offsets must be monotonically increasing")]
fn non_monotonic_offsets() {
OffsetBuffer::new(vec![1, 2, 0].into());
}
#[test]
fn from_lengths() {
let buffer = OffsetBuffer::<i32>::from_lengths([2, 6, 3, 7, 2]);
assert_eq!(buffer.as_ref(), &[0, 2, 8, 11, 18, 20]);
let half_max = i32::MAX / 2;
let buffer = OffsetBuffer::<i32>::from_lengths([half_max as usize, half_max as usize]);
assert_eq!(buffer.as_ref(), &[0, half_max, half_max * 2]);
}
#[test]
#[should_panic(expected = "offset overflow")]
fn from_lengths_offset_overflow() {
OffsetBuffer::<i32>::from_lengths([i32::MAX as usize, 1]);
}
#[test]
#[should_panic(expected = "usize overflow")]
fn from_lengths_usize_overflow() {
OffsetBuffer::<i32>::from_lengths([usize::MAX, 1]);
}
#[test]
fn get_lengths() {
let offsets = OffsetBuffer::<i32>::new(ScalarBuffer::<i32>::from(vec![0, 1, 4, 9]));
assert_eq!(offsets.lengths().collect::<Vec<usize>>(), vec![1, 3, 5]);
}
#[test]
fn get_lengths_should_be_with_fixed_size() {
let offsets = OffsetBuffer::<i32>::new(ScalarBuffer::<i32>::from(vec![0, 1, 4, 9]));
let iter = offsets.lengths();
assert_eq!(iter.size_hint(), (3, Some(3)));
assert_eq!(iter.len(), 3);
}
#[test]
fn get_lengths_from_empty_offset_buffer_should_be_empty_iterator() {
let offsets = OffsetBuffer::<i32>::new_empty();
assert_eq!(offsets.lengths().collect::<Vec<usize>>(), vec![]);
}
#[test]
fn impl_eq() {
fn are_equal<T: Eq>(a: &T, b: &T) -> bool {
a.eq(b)
}
assert!(
are_equal(
&OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 1, 4, 9])),
&OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 1, 4, 9]))
),
"OffsetBuffer should implement Eq."
);
}
#[test]
fn impl_default() {
let default = OffsetBuffer::<i32>::default();
assert_eq!(default.as_ref(), &[0]);
}
}