arrow-avro/src/reader/vlq.rs (88 lines of code) (raw):

// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. /// Decoder for zig-zag encoded variable length (VLW) integers /// /// See also: /// <https://avro.apache.org/docs/1.11.1/specification/#primitive-types-1> /// <https://protobuf.dev/programming-guides/encoding/#varints> #[derive(Debug, Default)] pub struct VLQDecoder { /// Scratch space for decoding VLQ integers in_progress: u64, shift: u32, } impl VLQDecoder { /// Decode a signed long from `buf` pub fn long(&mut self, buf: &mut &[u8]) -> Option<i64> { while let Some(byte) = buf.first().copied() { *buf = &buf[1..]; self.in_progress |= ((byte & 0x7F) as u64) << self.shift; self.shift += 7; if byte & 0x80 == 0 { let val = self.in_progress; self.in_progress = 0; self.shift = 0; return Some((val >> 1) as i64 ^ -((val & 1) as i64)); } } None } } /// Read a varint from `buf` returning the decoded `u64` and the number of bytes read #[inline] pub(crate) fn read_varint(buf: &[u8]) -> Option<(u64, usize)> { let first = *buf.first()?; if first < 0x80 { return Some((first as u64, 1)); } if let Some(array) = buf.get(..10) { return read_varint_array(array.try_into().unwrap()); } read_varint_slow(buf) } /// Based on /// - <https://github.com/tokio-rs/prost/blob/master/prost/src/encoding/varint.rs#L71> /// - <https://github.com/google/protobuf/blob/3.3.x/src/google/protobuf/io/coded_stream.cc#L365-L406> /// - <https://github.com/protocolbuffers/protobuf-go/blob/v1.27.1/encoding/protowire/wire.go#L358> #[inline] fn read_varint_array(buf: [u8; 10]) -> Option<(u64, usize)> { let mut in_progress = 0_u64; for (idx, b) in buf.into_iter().take(9).enumerate() { in_progress += (b as u64) << (7 * idx); if b < 0x80 { return Some((in_progress, idx + 1)); } in_progress -= 0x80 << (7 * idx); } let b = buf[9] as u64; in_progress += b << (7 * 9); (b < 0x02).then_some((in_progress, 10)) } #[inline(never)] #[cold] fn read_varint_slow(buf: &[u8]) -> Option<(u64, usize)> { let mut value = 0; for (count, byte) in buf.iter().take(10).enumerate() { let byte = buf[count]; value |= u64::from(byte & 0x7F) << (count * 7); if byte <= 0x7F { // Check for u64::MAX overflow. See [`ConsumeVarint`][1] for details. // [1]: https://github.com/protocolbuffers/protobuf-go/blob/v1.27.1/encoding/protowire/wire.go#L358 return (count != 9 || byte < 2).then_some((value, count + 1)); } } None } #[cfg(test)] mod tests { use super::*; fn encode_var(mut n: u64, dst: &mut [u8]) -> usize { let mut i = 0; while n >= 0x80 { dst[i] = 0x80 | (n as u8); i += 1; n >>= 7; } dst[i] = n as u8; i + 1 } fn varint_test(a: u64) { let mut buf = [0_u8; 10]; let len = encode_var(a, &mut buf); assert_eq!(read_varint(&buf[..len]).unwrap(), (a, len)); assert_eq!(read_varint(&buf).unwrap(), (a, len)); } #[test] fn test_varint() { varint_test(0); varint_test(4395932); varint_test(u64::MAX); for _ in 0..1000 { varint_test(rand::random()); } } }