in c3r-sdk-parquet/src/main/java/com/amazonaws/c3r/io/ParquetRowReader.java [123:153]
    private ParquetRowReader(@NonNull final String sourceName,
                             final boolean skipHeaderNormalization,
                             final Boolean binaryAsString) {
        this.sourceName = sourceName;
        this.binaryAsString = binaryAsString;
        // Open the Parquet file through Hadoop's file abstraction.
        final var conf = new org.apache.hadoop.conf.Configuration();
        final org.apache.hadoop.fs.Path file = new org.apache.hadoop.fs.Path(sourceName);
        try {
            fileReader = ParquetFileReader.open(HadoopInputFile.fromPath(file, conf));
        } catch (FileNotFoundException e) {
            throw new C3rRuntimeException("Unable to find file " + sourceName + ".", e);
        } catch (IOException | RuntimeException e) {
            throw new C3rRuntimeException("Error reading from file " + sourceName + ".", e);
        }
        // Build the C3R view of the Parquet schema from the file footer metadata.
        parquetSchema = ParquetSchema.builder()
                .messageType(fileReader.getFooter().getFileMetaData().getSchema())
                .skipHeaderNormalization(skipHeaderNormalization)
                .binaryAsString(binaryAsString)
                .build();
        // Reject files with more columns than the reader supports.
        if (parquetSchema.getHeaders().size() > MAX_COLUMN_COUNT) {
            throw new C3rRuntimeException("Couldn't parse input file. Please verify that column count does not exceed "
                    + MAX_COLUMN_COUNT + ".");
        }
        // Map each column header to its Parquet data type so values can be materialized per column.
        final Map<ColumnHeader, ParquetDataType> columnTypeMap = parquetSchema.getHeaders().stream()
                .collect(Collectors.toMap(Function.identity(), parquetSchema::getColumnType));
        valueFactory = new ParquetValueFactory(columnTypeMap);
        // Prime the reader by loading the first row.
        refreshNextRow();
    }