in parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnIndexValidator.java [554:625]
/**
 * Walks every row group of the given Parquet file and, for each column chunk that has a
 * column index, feeds the page-level index entries (min/max values, null counts, null-page
 * flags, boundary order) together with the actual column values to a {@code PageValidator}.
 * Column chunks for which no column index can be read are skipped.
 *
 * <p>Each row group is closed once it has been processed, including when processing it
 * fails with an exception.
 *
 * @param file the Parquet file to inspect
 * @return the list that was handed to every {@code PageValidator}; it is empty if no
 *     validator added an entry
 * @throws IOException propagated from opening the file or from the reader calls made here
 */
public static List<ContractViolation> checkContractViolations(InputFile file) throws IOException {
  List<ContractViolation> violations = new ArrayList<>();
  try (ParquetFileReader reader = ParquetFileReader.open(file)) {
    FileMetaData meta = reader.getFooter().getFileMetaData();
    MessageType schema = meta.getSchema();
    List<ColumnDescriptor> columns = schema.getColumns();
    List<BlockMetaData> blocks = reader.getFooter().getBlocks();
    int rowGroupNumber = 0;
    PageReadStore rowGroup = reader.readNextRowGroup();
    while (rowGroup != null) {
      // Release the row group even when validation throws; previously close() was only
      // reached on the success path, leaking the row group on any exception.
      try {
        ColumnReadStore columnReadStore = new ColumnReadStoreImpl(
            rowGroup, new DummyRecordConverter(schema).getRootConverter(), schema, null);
        List<ColumnChunkMetaData> columnChunks = blocks.get(rowGroupNumber).getColumns();
        assert (columnChunks.size() == columns.size());
        for (int columnNumber = 0; columnNumber < columns.size(); ++columnNumber) {
          ColumnDescriptor column = columns.get(columnNumber);
          ColumnChunkMetaData columnChunk = columnChunks.get(columnNumber);
          ColumnIndex columnIndex = reader.readColumnIndex(columnChunk);
          if (columnIndex == null) {
            // Nothing to validate for a chunk without a column index.
            continue;
          }
          ColumnPath columnPath = columnChunk.getPath();
          // NOTE(review): the offset index is dereferenced below without a null check --
          // confirm it is always present whenever a column index is.
          OffsetIndex offsetIndex = reader.readOffsetIndex(columnChunk);
          List<ByteBuffer> minValues = columnIndex.getMinValues();
          List<ByteBuffer> maxValues = columnIndex.getMaxValues();
          BoundaryOrder boundaryOrder = columnIndex.getBoundaryOrder();
          List<Long> nullCounts = columnIndex.getNullCounts();
          List<Boolean> nullPages = columnIndex.getNullPages();
          // Row position within the current column chunk; carried across pages.
          long rowNumber = 0;
          ColumnReader columnReader = columnReadStore.getColumnReader(column);
          // Min/max of the most recent page that was not flagged as a null page.
          ByteBuffer prevMinValue = null;
          ByteBuffer prevMaxValue = null;
          for (int pageNumber = 0; pageNumber < offsetIndex.getPageCount(); ++pageNumber) {
            boolean isNullPage = nullPages.get(pageNumber);
            ByteBuffer minValue = minValues.get(pageNumber);
            ByteBuffer maxValue = maxValues.get(pageNumber);
            PageValidator pageValidator = new PageValidator(
                column.getPrimitiveType(),
                rowGroupNumber,
                columnNumber,
                columnPath,
                pageNumber,
                violations,
                columnReader,
                minValue,
                maxValue,
                prevMinValue,
                prevMaxValue,
                boundaryOrder,
                nullCounts.get(pageNumber),
                isNullPage);
            // Null pages do not update the "previous" bounds passed to the next validator.
            if (!isNullPage) {
              prevMinValue = minValue;
              prevMaxValue = maxValue;
            }
            long lastRowNumberInPage = offsetIndex.getLastRowIndex(pageNumber, rowGroup.getRowCount());
            // Invoke the validator once per row, up to and including the page's last row.
            while (rowNumber <= lastRowNumberInPage) {
              pageValidator.validateValuesBelongingToRow();
              ++rowNumber;
            }
            pageValidator.finishPage();
          }
        }
      } finally {
        rowGroup.close();
      }
      rowGroup = reader.readNextRowGroup();
      rowGroupNumber++;
    }
  }
  return violations;
}