velox/vector/BaseVector.h (444 lines of code) (raw):
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <algorithm>
#include <memory>
#include <string>
#include <utility>
#include <fmt/format.h>
#include <folly/Format.h>
#include <folly/Range.h>
#include <folly/container/F14Map.h>
#include "velox/buffer/Buffer.h"
#include "velox/common/base/BitUtil.h"
#include "velox/common/base/Nulls.h"
#include "velox/type/Type.h"
#include "velox/type/Variant.h"
#include "velox/vector/BuilderTypeUtils.h"
#include "velox/vector/SelectivityVector.h"
#include "velox/vector/TypeAliases.h"
#include "velox/vector/VectorEncoding.h"
#include "velox/vector/VectorStream.h"
#include "velox/vector/VectorUtil.h"
namespace facebook {
namespace velox {
namespace cdvi {
const folly::F14FastMap<std::string, std::string> EMPTY_METADATA;
} // namespace cdvi
// Describes value collation in comparison. If equalsOnly is true, comparison
// can return non-0 early, for example only after considering string length.
struct CompareFlags {
bool nullsFirst = true;
bool ascending = true;
bool equalsOnly = false;
};
template <typename T>
class SimpleVector;
template <typename T>
class FlatVector;
/**
* Base class for all columnar-based vectors of any type.
*/
class BaseVector {
public:
static constexpr SelectivityVector* kPreserveAll = nullptr;
static constexpr uint64_t kNullHash = 1;
enum SerializeOp { kWrite, kRead, kCompare };
BaseVector(
velox::memory::MemoryPool* pool,
std::shared_ptr<const Type> type,
BufferPtr nulls,
size_t length,
std::optional<vector_size_t> distinctValueCount = std::nullopt,
std::optional<vector_size_t> nullCount = std::nullopt,
std::optional<ByteCount> representedByteCount = std::nullopt,
std::optional<ByteCount> storageByteCount = std::nullopt);
virtual ~BaseVector() = default;
virtual VectorEncoding::Simple encoding() const = 0;
inline bool isLazy() const {
return encoding() == VectorEncoding::Simple::LAZY;
}
// Returns false if vector has no nulls. Return true if vector may have nulls.
// When this method returns true, flatRawNulls is guaranteed to return
// non-null.
virtual bool mayHaveNulls() const {
return rawNulls_;
}
// Returns false if this vector and all of its children have no nulls. Returns
// true if this vector or any of its children may have nulls.
// When this method returns true, flatRawNulls called on this vector or any
// of its children is guaranteed to return non-null.
virtual bool mayHaveNullsRecursive() const {
return mayHaveNulls();
}
// Returns raw nulls or nullptr with one bit per logical position in
// the vector, with valid values at least for the rows selected in
// 'rows'. Dictionaries, sequences and constants may calculate this on
// demand.
virtual const uint64_t* flatRawNulls(const SelectivityVector& rows) {
return rawNulls_;
}
inline bool isIndexInRange(vector_size_t index) const {
// This compiles better than index >= 0 && index < length_.
return static_cast<uint32_t>(index) < length_;
}
template <typename T>
T* as() {
static_assert(std::is_base_of<BaseVector, T>::value);
return dynamic_cast<T*>(this);
}
template <typename T>
const T* as() const {
static_assert(std::is_base_of<BaseVector, T>::value);
return dynamic_cast<const T*>(this);
}
// Use when the type of 'this' is already known. dynamic_cast() is slow.
template <typename T>
T* asUnchecked() {
static_assert(std::is_base_of<BaseVector, T>::value);
return static_cast<T*>(this);
}
template <typename T>
const T* asUnchecked() const {
static_assert(std::is_base_of<BaseVector, T>::value);
return static_cast<const T*>(this);
}
template <typename T>
const FlatVector<T>* asFlatVector() const {
return dynamic_cast<const FlatVector<T>*>(this);
}
template <typename T>
FlatVector<T>* asFlatVector() {
return dynamic_cast<FlatVector<T>*>(this);
}
velox::memory::MemoryPool* pool() const {
return pool_;
}
virtual bool isNullAt(vector_size_t idx) const {
VELOX_DCHECK(isIndexInRange(idx));
return rawNulls_ ? bits::isBitNull(rawNulls_, idx) : false;
}
std::optional<vector_size_t> getNullCount() const {
return nullCount_;
}
void setNullCount(vector_size_t newNullCount) {
nullCount_ = newNullCount;
}
const std::shared_ptr<const Type>& type() const {
return type_;
}
TypeKind typeKind() const {
return typeKind_;
}
/**
* Returns a smart pointer to the null bitmap data for this
* vector. May hold nullptr if there are no nulls. Not const because
* some vectors may generate this on first access.
*/
const BufferPtr& nulls() const {
return nulls_;
}
const uint64_t* rawNulls() const {
return rawNulls_;
}
uint64_t* mutableRawNulls() {
ensureNulls();
return const_cast<uint64_t*>(rawNulls_);
}
virtual BufferPtr mutableNulls(vector_size_t size) {
if (nulls_ && nulls_->capacity() >= bits::nbytes(size)) {
return nulls_;
}
if (nulls_) {
AlignedBuffer::reallocate<bool>(&nulls_, size, false);
} else {
nulls_ = AlignedBuffer::allocate<bool>(size, pool_, false);
}
rawNulls_ = nulls_->as<uint64_t>();
return nulls_;
}
std::optional<vector_size_t> getDistinctValueCount() const {
return distinctValueCount_;
}
/**
* @return the number of rows of data in this vector
*/
vector_size_t size() const {
return length_;
}
void setSize(vector_size_t newSize) {
length_ = newSize;
}
virtual void append(const BaseVector* other) {
auto totalSize = BaseVector::length_ + other->size();
auto previousSize = BaseVector::size();
resize(totalSize);
copy(other, previousSize, 0, other->size());
}
/**
* @return the number of bytes this vector takes on disk when in a compressed
* and serialized format
*/
std::optional<ByteCount> storageBytes() const {
return storageByteCount_;
}
/**
* @return the number of bytes required to naively represent all of the data
* in this vector - the raw data size if not in a compressed or otherwise
* optimized format
*/
std::optional<ByteCount> representedBytes() const {
return representedByteCount_;
}
/**
* @return the number of bytes required to hold this vector in memory
*/
ByteCount inMemoryBytes() const {
return inMemoryBytes_;
}
/**
* @return true if this vector has the same value at the given index as the
* other vector at the other vector's index (including if both are null),
* false otherwise
*/
virtual bool equalValueAt(
const BaseVector* other,
vector_size_t index,
vector_size_t otherIndex) const {
static constexpr CompareFlags kEqualValueAtFlags = {
false, false, true /*equalOnly*/};
return compare(other, index, otherIndex, kEqualValueAtFlags) == 0;
}
// Returns < 0 if 'this' at 'index' is less than 'other' at
// 'otherIndex', 0 if equal and > 0 otherwise.
virtual int32_t compare(
const BaseVector* other,
vector_size_t index,
vector_size_t otherIndex,
CompareFlags flags = CompareFlags()) const = 0;
/**
* @return the hash of the value at the given index in this vector
*/
virtual uint64_t hashValueAt(vector_size_t index) const = 0;
/**
* @return a new vector that contains the hashes for all entries
*/
virtual std::unique_ptr<SimpleVector<uint64_t>> hashAll() const = 0;
// Returns true if all values in the specified rows are the same.
virtual bool isConstant(const SelectivityVector& rows) const {
return false;
}
// Returns true if this vector is encoded as constant (ConstantVector).
virtual bool isConstantEncoding() const {
return false;
}
// Returns true if this vector has a scalar type. If so, values are
// accessed by valueAt after casting the vector to a type()
// dependent instantiation of SimpleVector<T>.
virtual bool isScalar() const {
return false;
}
// Returns the scalar or complex vector wrapped inside any nesting of
// dictionary, sequence or constant vectors.
virtual const BaseVector* wrappedVector() const {
return this;
}
// Returns the index to apply for 'index' in the vector returned by
// wrappedVector(). Translates the index over any nesting of
// dictionaries, sequences and constants.
virtual vector_size_t wrappedIndex(vector_size_t index) const {
return index;
}
// Sets the null indicator at 'idx'. 'true' means null.
FOLLY_ALWAYS_INLINE virtual void setNull(vector_size_t idx, bool value) {
VELOX_DCHECK(idx >= 0 && idx < length_);
if (!nulls_) {
if (!value) {
return;
}
allocateNulls();
}
bits::setBit(
nulls_->asMutable<uint64_t>(), idx, bits::kNull ? value : !value);
}
static int32_t
countNulls(const BufferPtr& nulls, vector_size_t begin, vector_size_t end) {
return nulls ? bits::countNulls(nulls->as<uint64_t>(), begin, end) : 0;
}
static int32_t countNulls(const BufferPtr& nulls, vector_size_t size) {
return countNulls(nulls, 0, size);
}
virtual bool mayAddNulls() const {
return true;
}
// Sets null when 'nulls' has null value for a row in 'rows'
virtual void addNulls(const uint64_t* bits, const SelectivityVector& rows);
// Clears null when 'nulls' has non-null value for a row in 'rows'
virtual void clearNulls(const SelectivityVector& rows);
virtual void clearNulls(vector_size_t begin, vector_size_t end);
void clearAllNulls() {
clearNulls(0, size());
}
// Sets the size to 'size' and ensures there is space for the
// indicated number of nulls and top level values.
// 'setNotNull' indicates if nulls in range [oldSize, newSize) should be set
// to not null.
virtual void resize(vector_size_t newSize, bool setNotNull = true);
// Sets the rows of 'this' given by 'rows' to
// 'source.valueAt(toSourceRow ? toSourceRow[row] : row)', where
// 'row' iterates over 'rows'.
virtual void copy(
const BaseVector* source,
const SelectivityVector& rows,
const vector_size_t* toSourceRow) {
rows.applyToSelected([&](vector_size_t row) {
auto sourceRow = toSourceRow ? toSourceRow[row] : row;
if (source->isNullAt(sourceRow)) {
setNull(row, true);
} else {
copy(source, row, sourceRow, 1);
}
});
}
// Move or copy an element at 'source' row into 'target' row.
// This can be more efficient than copy for complex types.
virtual void move(vector_size_t source, vector_size_t target) {
VELOX_CHECK_LT(source, size());
VELOX_CHECK_LT(target, size());
if (source != target) {
copy(this, target, source, 1);
}
}
virtual void copy(
const BaseVector* source,
vector_size_t targetIndex,
vector_size_t sourceIndex,
vector_size_t count) {
VELOX_NYI();
}
// Returns a vector of the type of 'source' where 'indices' contains
// an index into 'source' for each element of 'source'. The
// resulting vector has position i set to source[i]. This is
// equivalent to wrapping 'source' in a dictionary with 'indices'
// but this may reuse structure if said structure is uniquely owned
// or if a copy is more efficient than dictionary wrapping.
static std::shared_ptr<BaseVector> transpose(
BufferPtr indices,
std::shared_ptr<BaseVector>&& source);
static std::shared_ptr<BaseVector> createConstant(
variant value,
vector_size_t size,
velox::memory::MemoryPool* pool);
static std::shared_ptr<BaseVector> createNullConstant(
const TypePtr& type,
vector_size_t size,
velox::memory::MemoryPool* pool);
static std::shared_ptr<BaseVector> wrapInDictionary(
BufferPtr nulls,
BufferPtr indices,
vector_size_t size,
std::shared_ptr<BaseVector> vector);
static std::shared_ptr<BaseVector> wrapInSequence(
BufferPtr lengths,
vector_size_t size,
std::shared_ptr<BaseVector> vector);
// Creates a ConstantVector of specified length and value coming from the
// 'index' element of the 'vector'. Peels off any encodings of the 'vector'
// before making a new ConstantVector. The result vector is either a
// ConstantVector holding a scalar value or a ConstantVector wrapping flat or
// lazy vector. The result cannot be a wrapping over another constant or
// dictionary vector.
static std::shared_ptr<BaseVector> wrapInConstant(
vector_size_t length,
vector_size_t index,
std::shared_ptr<BaseVector> vector);
// Makes 'result' writable for 'rows'. A wrapper (e.g. dictionary, constant,
// sequence) is flattened and a multiply referenced flat vector is copied.
// The content of 'rows' is not copied, as these values are intended to be
// overwritten.
//
// After invoking this function, the 'result' is guaranteed to be a flat
// uniquely-referenced vector.
//
// Use SelectivityVector::empty() to make the 'result' writable and preserve
// all current values.
static void ensureWritable(
const SelectivityVector& rows,
const TypePtr& type,
velox::memory::MemoryPool* pool,
std::shared_ptr<BaseVector>* result);
virtual void ensureWritable(const SelectivityVector& rows);
// Flattens the input vector.
//
// TODO: This method reuses ensureWritable(), which ensures that both:
// (a) the vector is flattened, and
// (b) it's singly-referenced
//
// We don't necessarily need (b) if we only want to flatten vectors.
static void flattenVector(
std::shared_ptr<BaseVector>* vector,
size_t vectorSize) {
BaseVector::ensureWritable(
SelectivityVector::empty(vectorSize),
(*vector)->type(),
(*vector)->pool(),
vector);
}
template <typename T>
static inline uint64_t byteSize(vector_size_t count) {
return sizeof(T) * count;
}
// If 'vector' is a wrapper, returns the underlying values vector. This is
// virtual and defined here because we must be able to access this in type
// agnostic code without a switch on all data types.
virtual std::shared_ptr<BaseVector> valueVector() const {
throw std::runtime_error("Vector is not a wrapper");
}
virtual BaseVector* loadedVector() {
return this;
}
virtual const BaseVector* loadedVector() const {
return this;
}
static std::shared_ptr<BaseVector> loadedVectorShared(
std::shared_ptr<BaseVector>);
virtual const BufferPtr& values() const {
throw std::runtime_error("Only flat vectors have a values buffer");
}
virtual const void* valuesAsVoid() const {
throw std::runtime_error("Only flat vectors have a values buffer");
}
// If 'this' is a wrapper, returns the wrap info, interpretation depends on
// encoding.
virtual BufferPtr wrapInfo() const {
throw std::runtime_error("Vector is not a wrapper");
}
static std::shared_ptr<BaseVector> create(
const TypePtr& type,
vector_size_t size,
velox::memory::MemoryPool* pool);
static std::shared_ptr<BaseVector> getOrCreateEmpty(
std::shared_ptr<BaseVector> vector,
const TypePtr& type,
velox::memory::MemoryPool* pool) {
return vector ? vector : create(type, 0, pool);
}
BufferPtr* mutableNulls() {
return &nulls_;
}
void resetNulls() {
setNulls(nullptr);
}
// Ensures that '*indices' has space for 'size' elements. Sets
// elements between the old and new sizes to 'initialValue' if the
// new size > old size. If memory is moved, '*raw' is maintained to
// point to element 0 of (*indices)->as<vector_size_t>().
void resizeIndices(
vector_size_t size,
vector_size_t initialValue,
BufferPtr* indices,
const vector_size_t** raw) {
resizeIndices(size, initialValue, this->pool(), indices, raw);
}
static void resizeIndices(
vector_size_t size,
vector_size_t initialValue,
velox::memory::MemoryPool* pool,
BufferPtr* indices,
const vector_size_t** raw);
// Makes sure '*buffer' has space for 'size' items of T and is writable. Sets
// 'raw' to point to the writable contents of '*buffer'.
template <typename T, typename RawT>
static void ensureBuffer(
vector_size_t size,
velox::memory::MemoryPool* pool,
BufferPtr* buffer,
RawT** raw) {
vector_size_t minBytes = byteSize<T>(size);
if (*buffer && (*buffer)->capacity() >= minBytes && (*buffer)->unique()) {
(*buffer)->setSize(minBytes);
if (raw) {
*raw = (*buffer)->asMutable<RawT>();
}
return;
}
if (*buffer) {
AlignedBuffer::reallocate<T>(buffer, size);
} else {
*buffer = AlignedBuffer::allocate<T>(size, pool);
}
if (raw) {
*raw = (*buffer)->asMutable<RawT>();
}
(*buffer)->setSize(minBytes);
}
// Returns the byte size of memory that is kept live through 'this'.
virtual uint64_t retainedSize() const {
return nulls_ ? nulls_->capacity() : 0;
}
/// Returns an estimate of the 'retainedSize' of a flat representation of the
/// data stored in this vector. Returns zero if this is a lazy vector that
/// hasn't been loaded yet.
virtual uint64_t estimateFlatSize() const;
// Returns true if 'vector' is a unique reference to a flat vector
// and nulls and values are uniquely referenced.
static bool isReusableFlatVector(const std::shared_ptr<BaseVector>& vector);
/// To safely reuse a vector one needs to (1) ensure that the vector as well
/// as all its buffers and child vectors are singly-referenced and mutable
/// (for buffers); (2) clear append-only string buffers and child vectors
/// (elements of arrays, keys and values of maps, fields of structs).
///
/// This method takes a non-const reference to a 'vector' and updates it to
/// possibly a new flat vector of the specified size that is safe to reuse. If
/// input 'vector' is not singly-referenced or not flat, replaces 'vector'
/// with a new vector of the same type and specified size. If some of the
/// buffers cannot be reused, these buffers are reset. Child vectors are
/// updated by calling this method recursively with size zero.
static void prepareForReuse(
std::shared_ptr<BaseVector>& vector,
vector_size_t size);
/// Resets non-reusable buffers and updates child vectors by calling
/// BaseVector::prepareForReuse.
/// Base implementation checks and resets nulls buffer if needed. Keeps the
/// nulls buffer if singly-referenced, mutable and has at least one null bit
/// set.
virtual void prepareForReuse();
// True if left and right are the same or if right is
// TypeKind::UNKNOWN. ArrayVector copying may come across unknown
// type data for null-only content. Nulls can be transferred between
// two unknowns but values cannot be assigned into an unknown 'left'
// from a not-unknown 'right'.
static bool compatibleKind(TypeKind left, TypeKind right) {
return left == right || right == TypeKind::UNKNOWN;
}
virtual std::string toString() const;
virtual std::string toString(vector_size_t index) const;
std::string toString(vector_size_t from, vector_size_t to);
void setCodegenOutput() {
isCodegenOutput_ = true;
}
bool isCodegenOutput() const {
return isCodegenOutput_;
}
protected:
void ensureNulls() {
if (!nulls_) {
allocateNulls();
}
}
void allocateNulls();
void setNulls(BufferPtr nulls) {
nulls_ = nulls;
rawNulls_ = nulls ? nulls->as<uint64_t>() : nullptr;
nullCount_ = std::nullopt;
}
std::shared_ptr<const Type> type_;
TypeKind typeKind_;
BufferPtr nulls_;
// Caches raw pointer to 'nulls->as<uint64_t>().
const uint64_t* rawNulls_ = nullptr;
velox::memory::MemoryPool* pool_;
vector_size_t length_ = 0;
/**
* Holds the number of nulls in the vector. If the number of nulls
* is not available, it is set to std::nullopt. Setting the value to
* zero does have implications (SIMD operations need null count to be
* zero) and is not the same as std::nullopt.
*/
std::optional<vector_size_t> nullCount_;
std::optional<vector_size_t> distinctValueCount_;
std::optional<ByteCount> representedByteCount_;
std::optional<ByteCount> storageByteCount_;
ByteCount inMemoryBytes_ = 0;
private:
bool isCodegenOutput_ = false;
friend class LazyVector;
};
template <>
uint64_t BaseVector::byteSize<bool>(vector_size_t count);
using VectorPtr = std::shared_ptr<BaseVector>;
// Returns true if vector is a Lazy vector, possibly wrapped, that hasn't
// been loaded yet.
bool isLazyNotLoaded(const BaseVector& vector);
// Allocates a buffer to fit at least 'size' indices and initializes them to
// zero.
inline BufferPtr allocateIndices(vector_size_t size, memory::MemoryPool* pool) {
return AlignedBuffer::allocate<vector_size_t>(size, pool, 0);
}
} // namespace velox
} // namespace facebook
namespace folly {
// Allow VectorEncoding::Simple to be transparently used by folly::sformat.
// e.g: folly::sformat("type: {}", encodingType);
template <>
class FormatValue<facebook::velox::VectorEncoding::Simple> {
public:
explicit FormatValue(const facebook::velox::VectorEncoding::Simple& type)
: type_(type) {}
template <typename FormatCallback>
void format(FormatArg& arg, FormatCallback& cb) const {
return format_value::formatString(
facebook::velox::VectorEncoding::mapSimpleToName(type_), arg, cb);
}
private:
facebook::velox::VectorEncoding::Simple type_;
};
} // namespace folly
template <>
struct fmt::formatter<facebook::velox::VectorEncoding::Simple> {
constexpr auto parse(format_parse_context& ctx) {
return ctx.begin();
}
template <typename FormatContext>
auto format(
const facebook::velox::VectorEncoding::Simple& x,
FormatContext& ctx) {
return format_to(
ctx.out(), "{}", facebook::velox::VectorEncoding::mapSimpleToName(x));
}
};