in src/parquet/column_reader.h [406:474]
// Reads at most batch_size levels (and the values they describe) from the
// currently buffered data, writing a validity bitmap alongside the values.
//
// Parameters:
//   batch_size        upper bound on levels to read; clamped below to what is
//                     still buffered (num_buffered_values_ - num_decoded_values_)
//   def_levels        output buffer for definition levels; only written when
//                     the column's max definition level is > 0
//   rep_levels        output buffer for repetition levels; only written when
//                     the column's max definition AND repetition levels are > 0
//   values            output buffer for decoded values
//   valid_bits        validity bitmap, written starting at bit valid_bits_offset
//   valid_bits_offset bit offset into valid_bits of the first slot
//   levels_read       [out] number of definition levels decoded, or the number
//                     of values decoded when the column has no definition levels
//   values_read       [out] number of value slots produced; set on every path
//   null_count_out    [out] null count reported by DefinitionLevelsToBitmap on
//                     the spaced path, 0 on every other path
//
// Returns the count reported by ReadValues / ReadValuesSpaced, or 0 when
// HasNext() is false.
//
// Throws ParquetException when the number of decoded repetition levels differs
// from the number of decoded definition levels.
inline int64_t TypedColumnReader<DType>::ReadBatchSpaced(
    int64_t batch_size, int16_t* def_levels, int16_t* rep_levels, T* values,
    uint8_t* valid_bits, int64_t valid_bits_offset, int64_t* levels_read,
    int64_t* values_read, int64_t* null_count_out) {
  // HasNext invokes ReadNewPage
  if (!HasNext()) {
    *levels_read = 0;
    *values_read = 0;
    *null_count_out = 0;
    return 0;
  }

  int64_t total_values;
  // TODO(wesm): keep reading data pages until batch_size is reached, or the
  // row group is finished
  batch_size = std::min(batch_size, num_buffered_values_ - num_decoded_values_);

  // If the field is required and non-repeated, there are no definition levels
  if (descr_->max_definition_level() > 0) {
    int64_t num_def_levels = ReadDefinitionLevels(batch_size, def_levels);

    // Not present for non-repeated fields
    if (descr_->max_repetition_level() > 0) {
      int64_t num_rep_levels = ReadRepetitionLevels(batch_size, rep_levels);
      if (num_def_levels != num_rep_levels) {
        throw ParquetException("Number of decoded rep / def levels did not match");
      }
    }

    const bool has_spaced_values = internal::HasSpacedValues(descr_);

    int64_t null_count = 0;
    if (!has_spaced_values) {
      // Only levels equal to the max definition level are counted as values to
      // decode; the decoded values are written densely and all marked valid.
      int values_to_read = 0;
      for (int64_t i = 0; i < num_def_levels; ++i) {
        if (def_levels[i] == descr_->max_definition_level()) {
          ++values_to_read;
        }
      }
      total_values = ReadValues(values_to_read, values);
      for (int64_t i = 0; i < total_values; i++) {
        ::arrow::BitUtil::SetBit(valid_bits, valid_bits_offset + i);
      }
      *values_read = total_values;
    } else {
      // DefinitionLevelsToBitmap fills *values_read, null_count and valid_bits
      // from the definition levels; ReadValuesSpaced then decodes using them.
      int16_t max_definition_level = descr_->max_definition_level();
      int16_t max_repetition_level = descr_->max_repetition_level();
      internal::DefinitionLevelsToBitmap(def_levels, num_def_levels, max_definition_level,
                                         max_repetition_level, values_read, &null_count,
                                         valid_bits, valid_bits_offset);
      total_values = ReadValuesSpaced(*values_read, values, static_cast<int>(null_count),
                                      valid_bits, valid_bits_offset);
    }
    *levels_read = num_def_levels;
    *null_count_out = null_count;
  } else {
    // Required field, read all values
    total_values = ReadValues(batch_size, values);
    for (int64_t i = 0; i < total_values; i++) {
      ::arrow::BitUtil::SetBit(valid_bits, valid_bits_offset + i);
    }
    *null_count_out = 0;
    // Previously left unset on this path, so callers observed a stale or
    // uninitialized count for required columns.
    *values_read = total_values;
    *levels_read = total_values;
  }

  ConsumeBufferedValues(*levels_read);
  return total_values;
}