in parquet/src/column/writer/mod.rs [778:865]
/// Records the page currently described by `self.page_metrics` in the column
/// index builder and, when one is present, in the offset index builder.
///
/// * `page_statistics` - min/max statistics of the page. When this is `None`
///   for a page that is not entirely null, the column index builder is marked
///   invalid instead of receiving an entry.
/// * `page_variable_length_bytes` - forwarded unchanged to the offset index
///   builder as the page's unencoded byte array data size.
///
/// # Panics
///
/// Panics if statistics are supplied for a page that is not entirely null but
/// unwrapping their min/max values (or the byte form of those values) fails.
fn update_column_offset_index(
    &mut self,
    page_statistics: Option<&ValueStatistics<E::T>>,
    page_variable_length_bytes: Option<i64>,
) {
    let null_count = self.page_metrics.num_page_nulls;
    // The page is a "null page" when every buffered row is null.
    let null_page = (self.page_metrics.num_buffered_rows as u64) == null_count;

    // Nothing is added to the column index once its builder is no longer valid.
    if self.column_index_builder.valid() {
        if null_page {
            // For a page containing only nulls, writers have to set the
            // corresponding min_values / max_values entries to byte[0].
            self.column_index_builder
                .append(null_page, vec![], vec![], null_count as i64);
        } else {
            match page_statistics {
                // Without page statistics there is nothing to index, so the
                // column index is given up for this column chunk.
                None => self.column_index_builder.to_invalid(),
                Some(stat) => {
                    let new_min = stat.min_opt().unwrap();
                    let new_max = stat.max_opt().unwrap();

                    // Track whether page boundaries are still ordered relative
                    // to the previous non-null page.
                    if let Some((last_min, last_max)) = &self.last_non_null_data_page_min_max {
                        // A previous bound greater than the new one breaks ascending order.
                        if self.data_page_boundary_ascending
                            && (compare_greater(&self.descr, last_min, new_min)
                                || compare_greater(&self.descr, last_max, new_max))
                        {
                            self.data_page_boundary_ascending = false;
                        }
                        // A new bound greater than the previous one breaks descending order.
                        if self.data_page_boundary_descending
                            && (compare_greater(&self.descr, new_min, last_min)
                                || compare_greater(&self.descr, new_max, last_max))
                        {
                            self.data_page_boundary_descending = false;
                        }
                    }
                    self.last_non_null_data_page_min_max =
                        Some((new_min.clone(), new_max.clone()));

                    // Build the min/max byte values for the index entry,
                    // truncating them when truncation applies to this column.
                    let (lower, upper) = if self.can_truncate_value() {
                        let limit = self.props.column_index_truncate_length();
                        let lower = self
                            .truncate_min_value(limit, stat.min_bytes_opt().unwrap())
                            .0;
                        let upper = self
                            .truncate_max_value(limit, stat.max_bytes_opt().unwrap())
                            .0;
                        (lower, upper)
                    } else {
                        (
                            stat.min_bytes_opt().unwrap().to_vec(),
                            stat.max_bytes_opt().unwrap().to_vec(),
                        )
                    };
                    self.column_index_builder
                        .append(null_page, lower, upper, null_count as i64);
                }
            }
        }
    }

    // Hand the page's level histograms to the column index builder.
    self.column_index_builder.append_histograms(
        &self.page_metrics.repetition_level_histogram,
        &self.page_metrics.definition_level_histogram,
    );

    // Record the page's row count and variable-length byte size in the offset index.
    if let Some(offset_index) = self.offset_index_builder.as_mut() {
        offset_index.append_row_count(self.page_metrics.num_buffered_rows as i64);
        offset_index.append_unencoded_byte_array_data_bytes(page_variable_length_bytes);
    }
}