in velox/dwio/dwrf/reader/ColumnReader.cpp [1102:1233]
void StringDictionaryColumnReader::readDictionaryVector(
uint64_t numValues,
VectorPtr& result,
const uint64_t* incomingNulls) {
auto dictVector =
detail::resetIfWrongVectorType<DictionaryVector<StringView>>(result);
BufferPtr indices;
if (dictVector) {
indices = dictVector->mutableIndices(numValues);
}
BufferPtr nulls = readNulls(numValues, result, incomingNulls);
const auto* nullsPtr = nulls ? nulls->as<uint64_t>() : nullptr;
uint64_t nullCount = nullsPtr ? bits::countNulls(nullsPtr, 0, numValues) : 0;
if (result) {
detail::resetIfNotWritable(result, indices);
}
if (!indices) {
indices = AlignedBuffer::allocate<vector_size_t>(numValues, &memoryPool_);
}
auto indicesPtr = indices->asMutable<vector_size_t>();
dictIndex->nextInts(indicesPtr, numValues, nullsPtr);
indices->setSize(numValues * sizeof(vector_size_t));
bool hasStrideDict = false;
// load inDictionary
const char* inDictPtr = nullptr;
if (inDictionaryReader) {
detail::ensureCapacity<bool>(inDict, numValues, &memoryPool_);
inDictionaryReader->next(inDict->asMutable<char>(), numValues, nullsPtr);
inDictPtr = inDict->as<char>();
}
if (nulls) {
for (uint64_t i = 0; i < numValues; ++i) {
if (!bits::isBitNull(nullsPtr, i)) {
if (!inDictPtr || bits::isBitSet(inDictPtr, i)) {
// points to an entry in rowgroup dictionary
} else {
// points to an entry in stride dictionary
indicesPtr[i] += dictionaryCount;
hasStrideDict = true;
}
}
}
} else {
for (uint64_t i = 0; i < numValues; ++i) {
if (!inDictPtr || bits::isBitSet(inDictPtr, i)) {
// points to an entry in rowgroup dictionary
} else {
// points to an entry in stride dictionary
indicesPtr[i] += dictionaryCount;
hasStrideDict = true;
}
}
}
VectorPtr dictionaryValues;
const auto* dictionaryBlobPtr = dictionaryBlob->as<char>();
const auto* dictionaryOffsetsPtr = dictionaryOffset->as<int64_t>();
if (hasStrideDict) {
if (!combinedDictionaryValues_) {
// TODO Reuse memory
BufferPtr values = AlignedBuffer::allocate<StringView>(
dictionaryCount + strideDictCount, &memoryPool_);
auto* valuesPtr = values->asMutable<StringView>();
for (size_t i = 0; i < dictionaryCount; i++) {
valuesPtr[i] = StringView(
dictionaryBlobPtr + dictionaryOffsetsPtr[i],
dictionaryOffsetsPtr[i + 1] - dictionaryOffsetsPtr[i]);
}
const auto* strideDictPtr = strideDict->as<char>();
const auto* strideDictOffsetPtr = strideDictOffset->as<int64_t>();
for (size_t i = 0; i < strideDictCount; i++) {
valuesPtr[dictionaryCount + i] = StringView(
strideDictPtr + strideDictOffsetPtr[i],
strideDictOffsetPtr[i + 1] - strideDictOffsetPtr[i]);
}
combinedDictionaryValues_ = std::make_shared<FlatVector<StringView>>(
&memoryPool_,
nodeType_->type,
BufferPtr(nullptr), // TODO nulls
dictionaryCount + strideDictCount /*length*/,
values,
std::vector<BufferPtr>{dictionaryBlob, strideDict});
}
dictionaryValues = combinedDictionaryValues_;
} else {
if (!dictionaryValues_) {
// TODO Reuse memory
BufferPtr values =
AlignedBuffer::allocate<StringView>(dictionaryCount, &memoryPool_);
auto* valuesPtr = values->asMutable<StringView>();
for (size_t i = 0; i < dictionaryCount; i++) {
valuesPtr[i] = StringView(
dictionaryBlobPtr + dictionaryOffsetsPtr[i],
dictionaryOffsetsPtr[i + 1] - dictionaryOffsetsPtr[i]);
}
dictionaryValues_ = std::make_shared<FlatVector<StringView>>(
&memoryPool_,
nodeType_->type,
BufferPtr(nullptr), // TODO nulls
dictionaryCount /*length*/,
values,
std::vector<BufferPtr>{dictionaryBlob});
}
dictionaryValues = dictionaryValues_;
}
if (result) {
result->setSize(numValues);
result->setNullCount(nullCount);
result->as<DictionaryVector<StringView>>()->setDictionaryValues(
dictionaryValues);
} else {
result = std::make_shared<DictionaryVector<StringView>>(
&memoryPool_,
nulls,
numValues,
dictionaryValues,
TypeKind::INTEGER,
indices);
result->setNullCount(nullCount);
}
}