query/iterator.hpp

// Copyright (c) 2017-2018 Uber Technologies, Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #ifndef QUERY_ITERATOR_HPP_ #define QUERY_ITERATOR_HPP_ #include <cuda_runtime.h> #include <thrust/detail/internal_functional.h> #include <thrust/iterator/constant_iterator.h> #include <thrust/iterator/iterator_adaptor.h> #include <thrust/iterator/permutation_iterator.h> #include <thrust/iterator/zip_iterator.h> #include <thrust/tuple.h> #include <thrust/iterator/detail/normal_iterator.h> #include <cfloat> #include <cmath> #include <tuple> #include "query/time_series_aggregate.h" #include "utils.hpp" namespace ares { // VectorPartyIterator iterates <value,validity> tuples against a column // (vector parties) under 3 modes: // 1: all values are present // 2: uncompressed // 3: compressed // based on the presences of 3 vectors (values, nulls, counts). // Only 4 type of values vector is supported here: // Uint32, Int32, Float32 and Bool. Bool vector is packed into 1 bit. // More details // can be found here: // https://github.com/uber/aresdb/wiki/VectorStore // // This struct aims at reduce overall struct size so that we will use as few // registers per thread as possible when launching kernels. // We made following efforts to reduce overall size: // 1. Put in a single struct to avoid too much padding. // 2. Passing values pointers only and passing nulls and counts as offset to // value pointers. When allocating device space, we will store values, nulls // and counts consecutively. // 3. Base counts and start count will share the same memory address. // // Mode 0 vp is modeled as constant vector so value pointer always presents. // Length is only used by compressed column. // // The mode of the column is judged by following logic: // If valuesOffset == 0, it means it’s a mode 1 vector. // Otherwise if nullsOffset == 0, it’s a mode 2 vector. // Otherwise it’s a mode 3 vector. template <typename Value> class VectorPartyIterator : public thrust::iterator_adaptor< VectorPartyIterator<Value>, uint32_t *, thrust::tuple<Value, bool>, thrust::use_default, thrust::use_default, thrust::tuple<Value, bool>, thrust::use_default> { public: friend class thrust::iterator_core_access; typedef thrust::iterator_adaptor<VectorPartyIterator<Value>, uint32_t *, thrust::tuple<Value, bool>, thrust::use_default, thrust::use_default, thrust::tuple<Value, bool>, thrust::use_default> super_t; __host__ __device__ VectorPartyIterator() {} // base is counts vector if mode 0. nulls vector if mode 2, values vector // if mode 0. __host__ __device__ VectorPartyIterator( uint32_t *baseCounts, const uint32_t startCount, uint8_t *basePtr, uint32_t nullsOffset, uint32_t valuesOffset, uint32_t length, uint8_t stepInBytes, uint8_t nullBitOffset) : super_t(baseCounts != nullptr ? baseCounts : reinterpret_cast<uint32_t *>(startCount)), basePtr(basePtr), nullsOffset(nullsOffset), valuesOffset(valuesOffset), length(length), currentPtrIndex(0), stepInBytes(stepInBytes), nullBitOffset(nullBitOffset) { mode = judge_mode(); hasBaseCounts = baseCounts != nullptr; } private: uint8_t *basePtr; uint32_t nullsOffset; // Offset of nullsPtr. uint32_t valuesOffset; // Offset of basePtr. uint32_t currentPtrIndex; uint32_t length; uint8_t stepInBytes; uint8_t hasBaseCounts; // Whether the base iterator is a base count vector or // a start count. uint8_t nullBitOffset; // Starting bit offset in null vector. uint8_t mode; // For convenience we store here as this does not add to the // final space of this iterator. __host__ __device__ uint8_t judge_mode() const { if (valuesOffset == 0) { return 1; } if (nullsOffset == 0) { return 2; } return 3; } // offset must be greater than 0, otherwise it may lead to access to undesired // location. __host__ __device__ bool get_null() const { if (mode >= 2) { return get_bool(basePtr + nullsOffset); } return true; } __host__ __device__ bool get_bool(uint8_t *ptr) const { uint32_t nullIndex = currentPtrIndex + nullBitOffset; return ptr[nullIndex / 8] & (1 << nullIndex % 8); } // overloaded functions for different value types. __host__ __device__ uint32_t get_value(uint32_t *values) const { uint8_t *ptr = reinterpret_cast<uint8_t *>(values) + currentPtrIndex * stepInBytes; switch (stepInBytes) { case 2:return *reinterpret_cast<uint16_t *>(ptr); case 4:return *reinterpret_cast<uint32_t *>(ptr); default:return *ptr; } } __host__ __device__ int32_t get_value(int32_t *values) const { int8_t *ptr = reinterpret_cast<int8_t *>(values) + currentPtrIndex * stepInBytes; switch (stepInBytes) { case 2:return *reinterpret_cast<int16_t *>(ptr); case 4:return *reinterpret_cast<int32_t *>(ptr); default:return *ptr; } } __host__ __device__ float_t get_value(float_t *values) const { return values[currentPtrIndex]; } __host__ __device__ GeoPointT get_value(GeoPointT *values) const { return values[currentPtrIndex]; } __host__ __device__ int64_t get_value(int64_t *values) const { return values[currentPtrIndex]; } __host__ __device__ UUIDT get_value(UUIDT *values) const { return values[currentPtrIndex]; } __host__ __device__ bool get_value(bool *values) const { return get_bool(reinterpret_cast<uint8_t *>(values)); } __host__ __device__ Value get_value() const { return get_value(reinterpret_cast<Value *>( basePtr + valuesOffset)); } // Should be called only if it's a mode 3 vector. __host__ __device__ uint32_t get_count(uint32_t index) const { return reinterpret_cast<uint32_t *>(basePtr)[currentPtrIndex + index]; } __host__ __device__ thrust::tuple<Value, bool> dereference() const { return thrust::make_tuple(get_value(), get_null()); } // For compressed vp, moving strategy is like following steps: // 1. if the offset is less than next count, we will not move the iterator. // 2. if the offset is less than the warp size(32 or 64), we will do a // linear search. // 3. otherwise, we will do a binary search to go to the correct position. // It works best if n is larger than 0 since we will do quick check and // linear check, otherwise we will do binary search for negative n. __host__ __device__ void advance(typename super_t::difference_type n) { if (hasBaseCounts) { this->base_reference() += n; } else { this->base_reference() = reinterpret_cast<uint32_t *>( reinterpret_cast<uintptr_t >(this->base_reference()) + n); } if (mode == 3) { uint32_t newIndex; if (hasBaseCounts) { newIndex = *this->base_reference(); } else { newIndex = reinterpret_cast<uintptr_t>(this->base_reference()); } // quick check to avoid binary search if (newIndex >= get_count(0) && newIndex < get_count(1)) { return; } // do linear search if n is less than or equal to wrap size. if (n <= WARP_SIZE && n >= 0) { for (int i = 1; i < length - currentPtrIndex; i++) { if (newIndex >= get_count(i) && newIndex < get_count(i + 1)) { currentPtrIndex += i; return; } } } uint32_t first = currentPtrIndex; uint32_t last = length; if (n < 0) { first = 0; last = currentPtrIndex; } // this algorithm computes upper_bound - 1 of [first, last), // which is the last element in the [first, last) that is // smaller or equal to the given index while (first < last) { uint32_t mid = first + (last - first) / 2; if (get_count(mid) > newIndex) { last = mid; } else { first = mid + 1; } } currentPtrIndex = first - 1; } else { // If this column is not compressed, this means the counting // iterator is simply thrust counting iterator since base count // column should be also uncompressed as well. We can safely // advance our zip iterator with n; currentPtrIndex += n; } } __host__ __device__ void increment() { advance(1); } __host__ __device__ void decrement() { advance(-1); } }; template <> class VectorPartyIterator<GeoPointT> : public thrust::iterator_adaptor< VectorPartyIterator<GeoPointT>, uint32_t *, thrust::tuple<GeoPointT, bool>, thrust::use_default, thrust::use_default, thrust::tuple<GeoPointT, bool>, thrust::use_default> { public: friend class thrust::iterator_core_access; typedef thrust::iterator_adaptor< VectorPartyIterator<GeoPointT>, uint32_t *, thrust::tuple<GeoPointT, bool>, thrust::use_default, thrust::use_default, thrust::tuple<GeoPointT, bool>, thrust::use_default> super_t; __host__ __device__ VectorPartyIterator() {} __host__ __device__ VectorPartyIterator( uint32_t *baseCounts, const uint32_t startCount, uint8_t *basePtr, uint32_t nullsOffset, uint32_t valuesOffset, uint32_t length, uint8_t stepInBytes, uint8_t nullBitOffset) : super_t(reinterpret_cast<uint32_t *>(startCount)), basePtr(basePtr), valuesOffset(valuesOffset), currentPtrIndex(0), nullBitOffset(nullBitOffset) { } private: uint8_t *basePtr; uint32_t currentPtrIndex; uint32_t valuesOffset; uint8_t nullBitOffset; __host__ __device__ bool get_null() const { // mode 1 if (valuesOffset == 0) { return true; } return get_bool(basePtr); } __host__ __device__ GeoPointT get_value() const { return reinterpret_cast<GeoPointT *>(basePtr+valuesOffset)[currentPtrIndex]; } __host__ __device__ bool get_bool(uint8_t *ptr) const { uint32_t nullIndex = currentPtrIndex + nullBitOffset; return ptr[nullIndex / 8] & (1 << nullIndex % 8); } __host__ __device__ thrust::tuple<GeoPointT, bool> dereference() const { return thrust::make_tuple(get_value(), get_null()); } __host__ __device__ void advance(typename super_t::difference_type n) { this->base_reference() = reinterpret_cast<uint32_t *>( reinterpret_cast<uintptr_t>(this->base_reference()) + n); currentPtrIndex += n; } __host__ __device__ void increment() { advance(1); } __host__ __device__ void decrement() { advance(-1); } }; template<typename Value> using ColumnIterator = thrust::permutation_iterator<VectorPartyIterator<Value>, uint32_t *>; // Helper function for creating VectorPartyIterator without specifying // Value template argument. template <class Value> ColumnIterator<Value> make_column_iterator( uint32_t *indexVector, uint32_t *baseCounts, uint32_t startCount, uint8_t *basePtr, uint32_t nullsOffset, uint32_t valuesOffset, uint32_t length, uint8_t stepInBytes, uint8_t nullBitOffset) { return thrust::make_permutation_iterator( VectorPartyIterator<Value>(baseCounts, startCount, basePtr, nullsOffset, valuesOffset, length, stepInBytes, nullBitOffset), indexVector); } ////////////////////////////////////////////////////////////// // Array VectorParty Iterator // The VectorParty Iterator for Array Column ///////////////////////////////////////////////////////////// template<typename Value> class ArrayVectorPartyIterator : public thrust::iterator_adaptor< ArrayVectorPartyIterator<Value>, uint64_t*, thrust::tuple<Value*, bool>, thrust::use_default, thrust::use_default, thrust::tuple<Value*, bool>, thrust::use_default> { public: friend class thrust::iterator_core_access; typedef thrust::iterator_adaptor<ArrayVectorPartyIterator<Value>, uint64_t*, thrust::tuple<Value*, bool>, thrust::use_default, thrust::use_default, thrust::tuple<Value*, bool>, thrust::use_default> super_t; __host__ __device__ ArrayVectorPartyIterator() {} // base is counts vector if mode 0. nulls vector if mode 2, values vector // if mode 0. __host__ __device__ ArrayVectorPartyIterator( uint8_t *basePtr, uint32_t valueOffsetAdj, uint32_t length) : super_t(reinterpret_cast<uint64_t *>(basePtr)), basePtr(basePtr), valuePtr(basePtr + 8 * length - valueOffsetAdj), length(length) { } private: uint8_t *basePtr; uint8_t *valuePtr; uint32_t length; __host__ __device__ typename super_t::reference dereference() const { uint32_t offset = *(reinterpret_cast<uint32_t *>(this->base_reference())); uint32_t length = *(reinterpret_cast<uint32_t *>(this->base_reference())+1); int n = static_cast<int>(this->base_reference() - reinterpret_cast<uint64_t *>(basePtr)); if (length == 0) { return thrust::make_tuple(reinterpret_cast<Value*>(NULL), offset != 0); } return thrust::make_tuple( reinterpret_cast<Value*>(valuePtr + offset), true); } __host__ __device__ void advance(typename super_t::difference_type n) { this->base_reference() += n; } __host__ __device__ void increment() { advance(1); } __host__ __device__ void decrement() { advance(-1); } }; // Helper function for creating ArrayVectorPartyIterator template<typename Value> inline ArrayVectorPartyIterator<Value> make_array_column_iterator( uint8_t* basePtr, int valueOffsetAdj, uint32_t length) { return ArrayVectorPartyIterator<Value>(basePtr, valueOffsetAdj, length); } // SimpleIterator combines interators in 4 different cases: // 1. Constant value. // 2. Mode 0 column, equivalent to constant value. // 3. Scratch space. // // We will use a value pointer and null offset to represent values // and nulls. If values or nulls are missing, we will reuse the space // to store constant values. // Only 4 Value types are supported: Uint32, Int32, Float32 and Bool. // Each bool value take 1 byte. // We will use the first 4 bytes of the value pointer to store default value // and rest 4 bytes as the position of the iterator. template<typename Value> class SimpleIterator : public thrust::iterator_adaptor<SimpleIterator<Value>, Value *, thrust::tuple<Value, bool>, thrust::use_default, thrust::use_default, thrust::tuple<Value, bool>, thrust::use_default> { public: friend class thrust::iterator_core_access; typedef thrust::iterator_adaptor<SimpleIterator<Value>, Value *, thrust::tuple<Value, bool>, thrust::use_default, thrust::use_default, thrust::tuple<Value, bool>, thrust::use_default> super_t; __host__ __device__ SimpleIterator() {} __host__ __device__ SimpleIterator( Value *values, uint32_t nullsOffset, Value defaultValue, bool defaultNull ) : super_t(values != nullptr ? values : reinterpret_cast<Value *>( *reinterpret_cast<uintptr_t *>(&defaultValue) << 32)), useDefault(values == nullptr) { if (useDefault) { nulls.defaultNull = defaultNull; } else { nulls.nullsOffset = nullsOffset; } } private: union { uint32_t nullsOffset; bool defaultNull; } nulls; bool useDefault; __host__ __device__ thrust::tuple<Value, bool> dereference() const { if (useDefault) { uintptr_t val = reinterpret_cast<uintptr_t>(this->base_reference()) >> 32; return thrust::make_tuple(*reinterpret_cast<Value *>(&val), nulls.defaultNull); } return thrust::make_tuple(*this->base_reference(), *(reinterpret_cast<bool *>(this->base_reference()) + nulls.nullsOffset)); } __host__ __device__ void advance(typename super_t::difference_type n) { this->base_reference() += n; if (!useDefault) { nulls.nullsOffset += (1 - sizeof(Value)) * n; } } __host__ __device__ void increment() { advance(1); } __host__ __device__ void decrement() { advance(-1); } }; template <typename Value> struct ConstantIterator { typedef typename std::conditional< std::is_same<Value, UUIDT>::value || std::is_same<Value, int64_t>::value, thrust::constant_iterator<thrust::tuple<Value, bool>>, SimpleIterator<Value>>::type type; }; template <typename Value> inline typename ConstantIterator<Value>::type make_constant_iterator( Value defaultValue, bool defaultNull) { return SimpleIterator<Value>(nullptr, 0, defaultValue, defaultNull); } template <> inline typename ConstantIterator<int64_t>::type make_constant_iterator( int64_t defaultValue, bool defaultNull) { return thrust::make_constant_iterator( thrust::make_tuple<int64_t, bool>(defaultValue, defaultNull)); } template <> inline typename ConstantIterator<UUIDT>::type make_constant_iterator( UUIDT defaultValue, bool defaultNull) { return thrust::make_constant_iterator( thrust::make_tuple<UUIDT, bool>(defaultValue, defaultNull)); } using GeoSimpleIterator = thrust::constant_iterator<thrust::tuple<GeoPointT, bool>>; // DimensionOutputIterator is for writing individual dimension transform // results into a consecutive chunk of memory which later will be used // as the key for sort and reduce. template<typename Value> using DimensionOutputIterator = thrust::zip_iterator< thrust::tuple<thrust::detail::normal_iterator<Value *>, thrust::detail::normal_iterator<bool *> > >; template<typename Value> DimensionOutputIterator<Value> make_dimension_output_iterator( uint8_t *dimValues, uint8_t *dimNulls) { return thrust::make_zip_iterator( thrust::make_tuple(reinterpret_cast<Value *>(dimValues), reinterpret_cast<bool *>(dimNulls))); } template<typename Value> SimpleIterator<Value> make_scratch_space_input_iterator( uint8_t *valueIter, uint32_t nullOffset) { return SimpleIterator<Value>(reinterpret_cast<Value *>(valueIter), nullOffset, 0, 0); } template<> SimpleIterator<UUIDT> make_scratch_space_input_iterator<UUIDT>( uint8_t *valueIter, uint32_t nullOffset); template<> SimpleIterator<GeoPointT> make_scratch_space_input_iterator<GeoPointT>( uint8_t *valueIter, uint32_t nullOffset); template<typename Value> using ScratchSpaceOutputIterator = thrust::zip_iterator< thrust::tuple< thrust::detail::normal_iterator<Value *>, thrust::detail::normal_iterator<bool *> > >; // Helper function for creating ScratchSpaceIterator without specifying // Value template argument. template<typename Value> ScratchSpaceOutputIterator<Value> make_scratch_space_output_iterator( Value *v, uint32_t nullOffset) { return thrust::make_zip_iterator( thrust::make_tuple(v, reinterpret_cast<bool *>(v) + nullOffset)); } template<typename Value> class MeasureProxy { public: __host__ __device__ MeasureProxy(Value *outputIter, Value identity, uint32_t count, bool isAvg) : outputIter(outputIter), count(count), identity(identity), isAvg(isAvg) { } __host__ __device__ MeasureProxy operator=(thrust::tuple<Value, bool> t) const { if (!thrust::get<1>(t)) { *outputIter = identity; } else if (isAvg) { return assignAvg(t); } else { *outputIter = thrust::get<0>(t) * count; } return *this; } __host__ __device__ MeasureProxy assignAvg(thrust::tuple<Value, bool> t) const { // Each operand is 64bits where higher 32bits are used for count and // lower bits are for intermediate average. float_t v = thrust::get<0>(t); *reinterpret_cast<float_t *>(outputIter) = v; *(reinterpret_cast<uint32_t *>(outputIter) + 1) = count; return *this; } private: Value *outputIter; Value identity; uint32_t count; bool isAvg; }; // MeasureOutputIterator is the output iterator for the final step of measure // transformation and as the input for aggregation. It needs to take care // of 2 things: // 1. Given a <value, validity> tuple, write to a single value vector while // write null value as the identity value for different aggregation function // 2. If base column is a compressed column, also need to recover the original // value instead of the compressed value. e.g. if we have a value 3 with count 3 // and the aggregation function is sum. We need to write 3 * 3 to measure // output iterator. template<typename Value> class MeasureOutputIterator : public thrust::iterator_adaptor< MeasureOutputIterator<Value>, Value *, thrust::tuple<Value, bool>, thrust::use_default, thrust::use_default, MeasureProxy<Value>, thrust::use_default> { public: friend class thrust::iterator_core_access; // shorthand for the name of the iterator_adaptor we're deriving from. typedef thrust::iterator_adaptor<MeasureOutputIterator<Value>, Value *, thrust::tuple<Value, bool>, thrust::use_default, thrust::use_default, MeasureProxy<Value>, thrust::use_default> super_t; __host__ __device__ MeasureOutputIterator(Value *base, const uint32_t *baseCounts, const uint32_t *indexVector, AggregateFunction aggFunc) : super_t(base), baseCounts(baseCounts), indexVector(indexVector) { // We don't need to do anything for this value if it's not sum. if (!((aggFunc >= AGGR_SUM_UNSIGNED && aggFunc <= AGGR_SUM_FLOAT) || (aggFunc == AGGR_AVG_FLOAT))) { skipCount = true; } isAvg = aggFunc == AGGR_AVG_FLOAT; identity = get_identity_value<Value>(aggFunc); } private: const uint32_t *baseCounts; const uint32_t *indexVector; Value identity; bool skipCount = false; bool isAvg = false; __host__ __device__ typename super_t::reference dereference() const { uint32_t index = *indexVector; uint32_t count = !skipCount && baseCounts != nullptr ? baseCounts[index + 1] - baseCounts[index] : 1; return MeasureProxy<Value>(this->base_reference(), identity, count, isAvg); } __host__ __device__ void advance(typename super_t::difference_type n) { this->base_reference() += n; indexVector += n; } __host__ __device__ void increment() { advance(1); } __host__ __device__ void decrement() { advance(-1); } }; // Helper function for creating MeasureOutputIterator without specifying // Value template argument. template<typename Value> MeasureOutputIterator<Value> make_measure_output_iterator( Value *v, uint32_t *indexVector, uint32_t *baseCounts, AggregateFunction aggFunc) { return MeasureOutputIterator<Value>(v, baseCounts, indexVector, aggFunc); } class IndexProxy { public: __host__ __device__ explicit IndexProxy(uint32_t *outputIndex) : outputIndex(outputIndex) { } // Parameter is a tuple of <index row number, index value>. __host__ __device__ IndexProxy operator=(thrust::tuple<uint32_t, uint32_t> t) const { *outputIndex = thrust::get<1>(t); return *this; } private: uint32_t *outputIndex; }; // IndexOutputIterator is the output iterator for writing filtered index // to. The input will be a tuple of <index row number, index value>. We // will discard the index row number and only write the filtered index value // into the output index value vector which is a uint32_t pointer. class IndexOutputIterator : public thrust::iterator_adaptor<IndexOutputIterator, uint32_t *, thrust::tuple< uint32_t, uint32_t>, thrust::use_default, thrust::use_default, IndexProxy, thrust::use_default> { public: friend class thrust::iterator_core_access; // shorthand for the name of the iterator_adaptor we're deriving from. typedef thrust::iterator_adaptor<IndexOutputIterator, uint32_t *, thrust::tuple<uint32_t, uint32_t>, thrust::use_default, thrust::use_default, IndexProxy, thrust::use_default> super_t; __host__ __device__ explicit IndexOutputIterator(uint32_t *base) : super_t(base) { } private: __host__ __device__ typename super_t::reference dereference() const { return IndexProxy(this->base_reference()); } }; // The column mode for foreign tables are either 0 or 2, which is // presented by two different type of iterators: SimpleIterator and // VectorPartyIterator. And the mode of batch columns will be interleaving. // To make all iterators have the same size, we put both cases into a // single union and has a boolean flag to tell which iterator it is. template <typename Value> struct ForeignTableIterator { __host__ __device__ ForeignTableIterator() {} __host__ __device__ ForeignTableIterator(const ForeignTableIterator<Value> &other) { this->isConst = other.isConst; if (this->isConst) { this->iter.constantIter = other.iter.constantIter; } else { this->iter.columnIter = other.iter.columnIter; } } __host__ __device__ ForeignTableIterator &operator=( const ForeignTableIterator<Value> &other) { this->isConst = other.isConst; if (this->isConst) { this->iter.constantIter = other.iter.constantIter; } else { this->iter.columnIter = other.iter.columnIter; } return *this; } explicit ForeignTableIterator(VectorPartyIterator<Value> columnIter) : isConst(false) { iter.columnIter = columnIter; } explicit ForeignTableIterator( typename ConstantIterator<Value>::type constantIter) : isConst(true) { iter.constantIter = constantIter; } union ForeignBatchIter { VectorPartyIterator<Value> columnIter; typename ConstantIterator<Value>::type constantIter; __host__ __device__ ForeignBatchIter() {} } iter; bool isConst; }; // RecordIDJoinIterator reads the foreign table column. template<typename Value> class RecordIDJoinIterator : public thrust::iterator_adaptor< RecordIDJoinIterator<Value>, RecordID *, thrust::tuple<Value, bool>, thrust::use_default, thrust::use_default, thrust::tuple<Value, bool>, thrust::use_default> { public: friend class thrust::iterator_core_access; typedef ForeignTableIterator<Value> ValueIter; typedef thrust::iterator_adaptor< RecordIDJoinIterator<Value>, RecordID *, thrust::tuple<Value, bool>, thrust::use_default, thrust::use_default, thrust::tuple<Value, bool>, thrust::use_default> super_t; __host__ __device__ RecordIDJoinIterator( RecordID *base, int32_t numBatches, int32_t baseBatchID, ValueIter *batches, int32_t numRecordsInLastBatch, int16_t *timezoneLookupTable, int16_t timezoneLookupSize) : super_t(base), batches(batches), baseBatchID(baseBatchID), numBatches(numBatches), numRecordsInLastBatch(numRecordsInLastBatch), timezoneLookupTable(timezoneLookupTable), timezoneLookupSize(timezoneLookupSize) { } private: ValueIter *batches; int32_t baseBatchID; int32_t numBatches; int32_t numRecordsInLastBatch; int16_t *timezoneLookupTable; int16_t timezoneLookupSize; __host__ __device__ thrust::tuple<GeoPointT, bool> timezoneLookup( thrust::tuple<GeoPointT, bool> val) const { return val; } __host__ __device__ thrust::tuple<UUIDT, bool> timezoneLookup( thrust::tuple<UUIDT, bool> val) const { return val; } template<typename V> __host__ __device__ thrust::tuple<V, bool> timezoneLookup( thrust::tuple<V, bool> val) const { if (timezoneLookupTable) { // if timezoneLookup is not null, we must be iterating a enum column int enumVal = static_cast<int>(thrust::get<0>(val)); if (enumVal < timezoneLookupSize) { return thrust::make_tuple( timezoneLookupTable[enumVal], thrust::get<1>(val)); } else { return thrust::make_tuple(0, thrust::get<1>(val)); } } return val; } __host__ __device__ typename super_t::reference dereference() const { RecordID recordID = *this->base_reference(); if (recordID.batchID && (recordID.batchID - baseBatchID < numBatches - 1 || recordID.index < numRecordsInLastBatch)) { ForeignTableIterator<Value> iter = batches[recordID.batchID - baseBatchID]; if (iter.isConst) { return iter.iter.constantIter[recordID.index]; } thrust::tuple<Value, bool> val = iter.iter.columnIter[recordID.index]; return timezoneLookup(val); } Value defaultValue; return thrust::make_tuple(defaultValue, false); } }; // DimensionHashIterator reads DimensionVector and produce hashed values template<int hash_bytes = 64, typename IndexVectorType = uint32_t *> class DimensionHashIterator : public thrust::iterator_adaptor< DimensionHashIterator<hash_bytes, IndexVectorType>, IndexVectorType, typename hash_output_type<hash_bytes>::type, thrust::use_default, thrust::use_default, typename hash_output_type<hash_bytes>::type, thrust::use_default> { public: friend class thrust::iterator_core_access; typedef thrust::iterator_adaptor< DimensionHashIterator<hash_bytes, IndexVectorType>, IndexVectorType, typename hash_output_type<hash_bytes>::type, thrust::use_default, thrust::use_default, typename hash_output_type<hash_bytes>::type, thrust::use_default> super_t; __host__ __device__ DimensionHashIterator(uint8_t *dimValues, uint8_t _numDimsPerDimWidth[NUM_DIM_WIDTH], int length, IndexVectorType indexVector = IndexVectorType(0)) : super_t(indexVector), dimValues(dimValues), length(length) { totalNumDims = 0; rowBytes = 0; for (int i = 0; i < NUM_DIM_WIDTH; i++) { numDimsPerDimWidth[i] = _numDimsPerDimWidth[i]; totalNumDims += _numDimsPerDimWidth[i]; uint8_t dimBytes = 1 << (NUM_DIM_WIDTH - 1 - i); rowBytes += dimBytes * numDimsPerDimWidth[i]; } nullValues = dimValues + rowBytes * length; // include null values, 1 byte per each dim rowBytes += totalNumDims; } private: uint8_t *dimValues; uint8_t *nullValues; uint8_t numDimsPerDimWidth[NUM_DIM_WIDTH]; uint8_t totalNumDims; uint8_t rowBytes; int length; __host__ __device__ typename super_t::reference dereference() const { uint32_t index = *this->base_reference(); uint8_t dimRow[MAX_DIMENSION_BYTES] = {0}; // read from uint8_t *inputValueStart = dimValues; uint8_t *inputNullStart = nullValues; // write to uint8_t *outputValuePtr = dimRow; uint8_t *outputNullPtr = outputValuePtr + (rowBytes - totalNumDims); uint8_t numDims = 0; for (int i = 0; i < NUM_DIM_WIDTH; i++) { uint8_t dimBytes = 1 << (NUM_DIM_WIDTH - 1 - i); for (int j = numDims; j < numDims + numDimsPerDimWidth[i]; j++) { switch (dimBytes) { case 16: *reinterpret_cast<UUIDT *>(outputValuePtr) = reinterpret_cast<UUIDT *>(inputValueStart)[index]; break; case 8: *reinterpret_cast<uint64_t *>(outputValuePtr) = reinterpret_cast<uint64_t *>(inputValueStart)[index]; break; case 4: *reinterpret_cast<uint32_t *>(outputValuePtr) = reinterpret_cast<uint32_t *>(inputValueStart)[index]; break; case 2: *reinterpret_cast<uint16_t *>(outputValuePtr) = reinterpret_cast<uint16_t *>(inputValueStart)[index]; break; case 1: *outputValuePtr = inputValueStart[index]; break; } outputValuePtr += dimBytes; inputValueStart += dimBytes * length; *outputNullPtr = inputNullStart[index]; outputNullPtr++; inputNullStart += length; } numDims += numDimsPerDimWidth[i]; } return murmur3sum<hash_bytes>(dimRow, rowBytes, 0); } }; class DimValueProxy { public: __host__ __device__ DimValueProxy(uint8_t *ptr, int dimBytes) : ptr(ptr), dimBytes(static_cast<uint16_t >(dimBytes)) {} __host__ __device__ DimValueProxy operator=(DimValueProxy t) { setDimValue(ptr, t.ptr, dimBytes); return *this; } private: uint8_t *ptr; uint16_t dimBytes; }; class DimensionColumnPermutateIterator : public thrust::iterator_adaptor<DimensionColumnPermutateIterator, uint32_t *, DimValueProxy, thrust::use_default, thrust::use_default, DimValueProxy, thrust::use_default> { public: friend class thrust::iterator_core_access; typedef thrust::iterator_adaptor<DimensionColumnPermutateIterator, uint32_t *, DimValueProxy, thrust::use_default, thrust::use_default, DimValueProxy, thrust::use_default> super_t; __host__ __device__ DimensionColumnPermutateIterator( uint8_t *values, uint32_t *indexVector, int dimInputLength, int dimOutputLength, uint8_t _numDimsPerDimWidth[NUM_DIM_WIDTH]) : super_t(indexVector), begin(indexVector), values(values), dimInputLength(dimInputLength), dimOutputLength(dimOutputLength) { for (int i = 0; i < NUM_DIM_WIDTH; i++) { numDimsPerDimWidth[i] = _numDimsPerDimWidth[i]; } } private: uint8_t *values; uint32_t *begin; int dimInputLength; int dimOutputLength; uint8_t numDimsPerDimWidth[NUM_DIM_WIDTH]; __host__ __device__ typename super_t::reference dereference() const { int baseIndex = this->base_reference() - begin; int dimIndex = baseIndex / dimOutputLength; // index in current dimension vector int localIndex = *(begin + (baseIndex % dimOutputLength)); int bytes = 0; uint8_t numDims = 0; uint8_t dimBytes = 0; int i = 0; for (; i < NUM_DIM_WIDTH; i++) { dimBytes = 1 << (NUM_DIM_WIDTH - i - 1); if (dimIndex < numDims + numDimsPerDimWidth[i]) { bytes += ((dimIndex - numDims) * dimInputLength + localIndex) * dimBytes; break; } else { bytes += numDimsPerDimWidth[i] * dimInputLength * dimBytes; } numDims += numDimsPerDimWidth[i]; } // null vector if (i == NUM_DIM_WIDTH) { bytes += ((dimIndex - numDims) * dimInputLength + localIndex) * dimBytes; return DimValueProxy(values + bytes, dimBytes); } return DimValueProxy(values + bytes, dimBytes); } }; // DimensionColumnOutputIterator output dimension of dimOutputLength rows into // a bigger dimension space which can hold capacity rows, the smallest unit of // the output is one single dimension column class DimensionColumnOutputIterator : public thrust::iterator_adaptor< DimensionColumnOutputIterator, thrust::counting_iterator<int>, DimValueProxy, thrust::use_default, thrust::use_default, DimValueProxy, thrust::use_default> { public: friend class thrust::iterator_core_access; typedef thrust::iterator_adaptor< DimensionColumnOutputIterator, thrust::counting_iterator<int>, DimValueProxy, thrust::use_default, thrust::use_default, DimValueProxy, thrust::use_default> super_t; __host__ __device__ DimensionColumnOutputIterator( uint8_t *values, int capacity, int dimOutputLength, uint8_t _numDimsPerDimWidth[NUM_DIM_WIDTH], int offset) : super_t(thrust::make_counting_iterator<int>(0)), values(values), dimOutputLength(dimOutputLength), capacity(capacity), offset(offset) { for (int i = 0; i < NUM_DIM_WIDTH; i++) { numDimsPerDimWidth[i] = _numDimsPerDimWidth[i]; } } private: uint8_t *values; int capacity; int dimOutputLength; uint8_t numDimsPerDimWidth[NUM_DIM_WIDTH]; // the start row to write output data, this is used to append operation int offset; __host__ __device__ typename super_t::reference dereference() const { int baseIndex = *this->base_reference(); int dimIndex = baseIndex / dimOutputLength; int globalIndex = baseIndex + (capacity - dimOutputLength) * dimIndex + offset; uint8_t numDims = 0; int bytes = 0; uint8_t dimBytes = 0; int i = 0; for (; i < NUM_DIM_WIDTH; i++) { dimBytes = 1 << (NUM_DIM_WIDTH - i - 1); if (dimIndex < numDims + numDimsPerDimWidth[i]) { bytes += (globalIndex - numDims * capacity) * dimBytes; break; } else { bytes += numDimsPerDimWidth[i] * capacity * dimBytes; } numDims += numDimsPerDimWidth[i]; } if (i == NUM_DIM_WIDTH) { bytes += (globalIndex - numDims * capacity) * dimBytes; return DimValueProxy(values + bytes, dimBytes); } return DimValueProxy(values + bytes, dimBytes); } }; // HLLRegIDHeadFlagIterator class HLLRegIDHeadFlagIterator : public thrust::iterator_adaptor< HLLRegIDHeadFlagIterator, thrust::counting_iterator<int>, unsigned int, thrust::use_default, thrust::use_default, unsigned int, thrust::use_default> { public: friend class thrust::iterator_core_access; typedef thrust::iterator_adaptor<HLLRegIDHeadFlagIterator, thrust::counting_iterator<int>, unsigned int, thrust::use_default, thrust::use_default, unsigned int, thrust::use_default> super_t; __host__ __device__ HLLRegIDHeadFlagIterator(uint64_t *hashValues) : super_t(thrust::make_counting_iterator(0)), hashValues(hashValues) {} private: uint64_t *hashValues; __host__ __device__ typename super_t::reference dereference() const { int index = *this->base_reference(); return (index == 0 || hashValues[index] != hashValues[index - 1]); } }; // HLLValueOutputIterator class HLLValueOutputIterator : public thrust::iterator_adaptor< HLLValueOutputIterator, thrust::counting_iterator<int>, thrust::tuple<uint64_t, uint32_t, int>, thrust::use_default, thrust::use_default, thrust::tuple<uint64_t, uint32_t, int>, thrust::use_default> { public: friend class thrust::iterator_core_access; typedef thrust::iterator_adaptor< HLLValueOutputIterator, thrust::counting_iterator<int>, thrust::tuple<uint64_t, uint32_t, int>, thrust::use_default, thrust::use_default, thrust::tuple<uint64_t, uint32_t, int>, thrust::use_default> super_t; __host__ __device__ HLLValueOutputIterator(unsigned int *dimCount, uint32_t *hllMeasureValues, uint64_t *hllRegIDCumCount, uint64_t *hllDimRegIDCumCount, uint64_t *hllVectorOffsets) : super_t(thrust::make_counting_iterator(0)), dimCount(dimCount), hllMeasureValues(hllMeasureValues), hllRegIDCumCount(hllRegIDCumCount), hllDimRegIDCumCount(hllDimRegIDCumCount), hllVectorOffsets(hllVectorOffsets) {} private: unsigned int *dimCount; uint32_t *hllMeasureValues; uint64_t *hllRegIDCumCount; uint64_t *hllDimRegIDCumCount; uint64_t *hllVectorOffsets; // get the reg id within dim group // index: 0 1 2 3 4 5 6 7 8 9 10 11 // regHeadIter: 1 0 0 0 1 0 0 0 1 0 0 0 // hllRegIDCumCount: 1 1 1 1 2 2 2 2 3 3 3 3 // dimCount: 1 1 1 1 1 1 1 1 2 2 2 2 // hllDimRegIDCumCount: 0 2 3 // hllVectorOffsets: 0 8 12 __host__ __device__ typename super_t::reference dereference() const { int index = *this->base_reference(); unsigned int dimIndex = dimCount[index] - 1; int dimRegIDCount = hllDimRegIDCumCount[dimIndex + 1] - hllDimRegIDCumCount[dimIndex]; uint64_t hllVectorOffset = hllVectorOffsets[dimIndex]; if (dimRegIDCount < HLL_DENSE_THRESHOLD) { // sparse mode uint64_t regIDCumCount = hllRegIDCumCount[index]; uint64_t dimRegIDcumCount = hllDimRegIDCumCount[dimIndex]; int regIDIndexWithinDim = regIDCumCount - dimRegIDcumCount - 1; return thrust::make_tuple(hllVectorOffset + regIDIndexWithinDim * 4, hllMeasureValues[index], 4); } else { // dense mode int regID = hllMeasureValues[index] & 0x3FFF; return thrust::make_tuple(hllVectorOffset + regID, hllMeasureValues[index], 1); } } }; // GeoPredicateIterator reads geo intersection output predicate vector. // It output the first intersected geoshape index, or -1 if no intersected // geoshape found. class GeoPredicateIterator : public thrust::iterator_adaptor<GeoPredicateIterator, uint32_t *, int8_t, thrust::use_default, thrust::use_default, int8_t, thrust::use_default> { public: friend class thrust::iterator_core_access; typedef thrust::iterator_adaptor<GeoPredicateIterator, uint32_t *, int8_t, thrust::use_default, thrust::use_default, int8_t, thrust::use_default> super_t; __host__ __device__ GeoPredicateIterator(uint32_t *predicateIter, uint8_t stepInWords) : super_t(predicateIter), stepInWords(stepInWords) {} private: uint8_t stepInWords; __host__ __device__ int8_t get_first_none_zero(uint32_t word) const { int i = 0; while (i < 32) { if ((word >> i) & 1) { return i; } i++; } return -1; } __host__ __device__ typename super_t::reference dereference() const { for (int i = 0; i < stepInWords; i++) { int8_t index = get_first_none_zero(this->base_reference()[i]); if (index >= 0) { return (int8_t)(i * 32 + index); } } return -1; } __host__ __device__ void advance(typename super_t::difference_type n) { this->base_reference() += n * stepInWords; } __host__ __device__ void increment() { advance(1); } __host__ __device__ void decrement() { advance(-1); } __host__ __device__ typename super_t::difference_type distance_to( const GeoPredicateIterator &other) const { typename super_t::difference_type dist = other.base_reference() - this->base_reference(); return dist / stepInWords; } }; // Used as void value and reference type. struct EmptyStruct {}; // GeoBatchIntersectIterator is the iterator to compute whether the // semi-infinite ray horizontally emitted from the geo point crosses a single // edge of the geoshape. Note value and dereference type of this iterator are // all default which means accessing and assigning value to this iterator has // no meaning. The dereference function will directly write to the output // predicate vector using atomicXor. template <typename GeoInputIterator> class GeoBatchIntersectIterator : public thrust::iterator_adaptor< GeoBatchIntersectIterator<GeoInputIterator>, GeoInputIterator, EmptyStruct, thrust::use_default, thrust::use_default, EmptyStruct, thrust::use_default> { public: friend class thrust::iterator_core_access; typedef thrust::iterator_adaptor<GeoBatchIntersectIterator<GeoInputIterator>, GeoInputIterator, EmptyStruct, thrust::use_default, thrust::use_default, EmptyStruct, thrust::use_default> super_t; __host__ __device__ GeoBatchIntersectIterator() {} __host__ __device__ GeoBatchIntersectIterator( GeoInputIterator geoPoints, GeoShapeBatch geoShapes, uint32_t *outputPredicate, bool inOrOut) : super_t(geoPoints), geoShapes(geoShapes), outputPredicate(outputPredicate), pointIndex(0), inOrOut(inOrOut) {} private: GeoShapeBatch geoShapes; uint32_t *outputPredicate; int32_t pointIndex; bool inOrOut; __host_or_device__ typename super_t::reference dereference() const { // offset to shapeVector in Bytes is totalNumberPoints * 2 * 4 uint8_t shapeIndex = geoShapes.LatLongs[geoShapes.TotalNumPoints * 2 * 4 + pointIndex]; EmptyStruct emptyRes; // nothing need to be done for the last point of a shape. if (pointIndex >= geoShapes.TotalNumPoints - 1) { return emptyRes; } // shapeIndex change marks the last point of a shape, therefore nothing // needs to be done if (shapeIndex != geoShapes.LatLongs[geoShapes.TotalNumPoints * 2 * 4 + pointIndex + 1]) { return emptyRes; } auto testPoint = *this->base_reference(); // testPoint is null, we just write false to the output predicate. if (!thrust::get<1>(testPoint)) { // only first pointer is responsible for write. if (pointIndex == 0) { for (int i = 0; i < geoShapes.TotalWords; i++) { outputPredicate[i] = !inOrOut; } } return emptyRes; } float testLat = thrust::get<0>(testPoint).Lat; float testLong = thrust::get<0>(testPoint).Long; // the latitude of first point of the edge. float edgeLat1 = reinterpret_cast<float *>(geoShapes.LatLongs)[pointIndex]; // the latitude of second point of the edge. float edgeLat2 = reinterpret_cast<float *>(geoShapes.LatLongs)[pointIndex + 1]; if (edgeLat1 < FLT_MAX && edgeLat2 < FLT_MAX) { float edgeLong1 = reinterpret_cast<float *>( geoShapes.LatLongs)[geoShapes.TotalNumPoints + pointIndex]; float edgeLong2 = reinterpret_cast<float *>( geoShapes.LatLongs)[geoShapes.TotalNumPoints + pointIndex + 1]; if (((edgeLong1 > testLong) != (edgeLong2 > testLong)) && (testLat < (edgeLat2 - edgeLat1) * (testLong - edgeLong1) / (edgeLong2 - edgeLong1) + edgeLat1)) { #ifdef RUN_ON_DEVICE atomicXor(outputPredicate + (shapeIndex / 32), (1 << (shapeIndex % 32))); #else // When we are running in host mode, we are running sequentially. // So non atomic access is ok for now. // If we switch to parallel host execution in future, // we need to find out how to do atomic operation on an existing // pointer in c++. outputPredicate[shapeIndex / 32] ^= (1 << (shapeIndex % 32)); #endif } } return emptyRes; } __host__ __device__ void advance(typename super_t::difference_type n) { int64_t newPointIndex = (int64_t)pointIndex + n; if (newPointIndex >= geoShapes.TotalNumPoints) { int steps = newPointIndex / geoShapes.TotalNumPoints; this->base_reference() += steps; pointIndex = newPointIndex %= geoShapes.TotalNumPoints; outputPredicate += steps * geoShapes.TotalWords; } else if (newPointIndex < 0) { int steps = (newPointIndex - geoShapes.TotalNumPoints + 1) / geoShapes.TotalNumPoints; this->base_reference() += steps; pointIndex = newPointIndex -= steps * geoShapes.TotalNumPoints; outputPredicate += steps * geoShapes.TotalWords; } else { pointIndex = (int32_t)newPointIndex; } } __host__ __device__ void increment() { advance(1); } __host__ __device__ void decrement() { advance(-1); } __host__ __device__ typename super_t::difference_type distance_to( const GeoBatchIntersectIterator &other) const { typename super_t::difference_type dist = other.base_reference() - this->base_reference(); return dist * geoShapes.TotalNumPoints + (other.pointIndex - this->pointIndex); } }; template<typename GeoInputIterator> GeoBatchIntersectIterator<GeoInputIterator> make_geo_batch_intersect_iterator( GeoInputIterator points, GeoShapeBatch geoShape, uint32_t *outputPredicate, bool inOrOut) { return GeoBatchIntersectIterator<GeoInputIterator>(points, geoShape, outputPredicate, inOrOut); } // Iterator to retrieve counts on index, which is usually for // mode 3 archive vectorparty class IndexCountIterator : public thrust::iterator_adaptor<IndexCountIterator, uint32_t *, uint32_t, thrust::use_default, thrust::use_default, uint32_t, thrust::use_default> { public: friend class thrust::iterator_core_access; typedef thrust::iterator_adaptor<IndexCountIterator, uint32_t *, uint32_t, thrust::use_default, thrust::use_default, uint32_t, thrust::use_default> super_t; __host__ __device__ IndexCountIterator() {} IndexCountIterator(uint32_t *baseCount, uint32_t *indexVector) : super_t(indexVector), baseCount(baseCount) {} private: uint32_t *baseCount; __host__ __device__ uint32_t dereference() const { return baseCount[*this->base_reference()+1] - baseCount[*this->base_reference()]; } }; } // namespace ares namespace thrust { namespace detail { // For execution policy with cuda, it requires the output iterator's reference // type to be a actual reference. However when using MeasureProxy and // IndexProxy, they are value types. So we need to specialize the // is_non_const_reference to traits to treat all MeasureProxy template classes // as reference type. template<> struct is_non_const_reference< ares::MeasureProxy<int32_t> > : public true_type { }; template<> struct is_non_const_reference< ares::MeasureProxy<uint32_t> > : public true_type { }; template<> struct is_non_const_reference< ares::MeasureProxy<float_t> > : public true_type { }; template<> struct is_non_const_reference< ares::MeasureProxy<int64_t> > : public true_type { }; template<> struct is_non_const_reference< ares::MeasureProxy<double_t> > : public true_type { }; template<> struct is_non_const_reference<ares::IndexProxy> : public true_type { }; template<> struct is_non_const_reference<ares::DimValueProxy> : public true_type { }; } // namespace detail } // namespace thrust #endif // QUERY_ITERATOR_HPP_

query/iterator.hpp (1,195 lines of code) (raw):