in parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java [1669:1816]
public ParquetMetadata fromParquetMetadata(
    FileMetaData parquetMetadata,
    InternalFileDecryptor fileDecryptor,
    boolean encryptedFooter,
    Map<RowGroup, Long> rowGroupToRowIndexOffsetMap)
    throws IOException {
  MessageType messageType = fromParquetSchema(parquetMetadata.getSchema(), parquetMetadata.getColumn_orders());
  List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
  List<RowGroup> row_groups = parquetMetadata.getRow_groups();
  if (row_groups != null) {
    for (RowGroup rowGroup : row_groups) {
      BlockMetaData blockMetaData = new BlockMetaData();
      blockMetaData.setRowCount(rowGroup.getNum_rows());
      blockMetaData.setTotalByteSize(rowGroup.getTotal_byte_size());
      if (rowGroupToRowIndexOffsetMap.containsKey(rowGroup)) {
        blockMetaData.setRowIndexOffset(rowGroupToRowIndexOffsetMap.get(rowGroup));
      }
      // not set in legacy files
      if (rowGroup.isSetOrdinal()) {
        blockMetaData.setOrdinal(rowGroup.getOrdinal());
      }
      List<ColumnChunk> columns = rowGroup.getColumns();
      String filePath = columns.get(0).getFile_path();
      int columnOrdinal = -1;
      for (ColumnChunk columnChunk : columns) {
        columnOrdinal++;
        if ((filePath == null && columnChunk.getFile_path() != null)
            || (filePath != null && !filePath.equals(columnChunk.getFile_path()))) {
          throw new ParquetDecodingException(
              "all column chunks of the same row group must be in the same file for now");
        }
        ColumnMetaData metaData = columnChunk.meta_data;
        ColumnCryptoMetaData cryptoMetaData = columnChunk.getCrypto_metadata();
        ColumnChunkMetaData column = null;
        ColumnPath columnPath = null;
        boolean lazyMetadataDecryption = false;
        if (null == cryptoMetaData) { // Plaintext column
          columnPath = getPath(metaData);
          if (null != fileDecryptor && !fileDecryptor.plaintextFile()) {
            // mark this column as plaintext in encrypted file decryptor
            fileDecryptor.setColumnCryptoMetadata(columnPath, false, false, (byte[]) null, columnOrdinal);
          }
        } else { // Encrypted column
          boolean encryptedWithFooterKey = cryptoMetaData.isSetENCRYPTION_WITH_FOOTER_KEY();
          if (encryptedWithFooterKey) { // Column encrypted with footer key
            if (null == fileDecryptor) {
              throw new ParquetCryptoRuntimeException("Column encrypted with footer key: No keys available");
            }
            if (null == metaData) {
              throw new ParquetCryptoRuntimeException("ColumnMetaData not set in Encryption with Footer key");
            }
            columnPath = getPath(metaData);
            if (!encryptedFooter) { // Unencrypted footer. Decrypt full column metadata, using footer key
              ByteArrayInputStream tempInputStream =
                  new ByteArrayInputStream(columnChunk.getEncrypted_column_metadata());
              byte[] columnMetaDataAAD = AesCipher.createModuleAAD(
                  fileDecryptor.getFileAAD(),
                  ModuleType.ColumnMetaData,
                  rowGroup.getOrdinal(),
                  columnOrdinal,
                  -1);
              try {
                metaData = readColumnMetaData(
                    tempInputStream, fileDecryptor.fetchFooterDecryptor(), columnMetaDataAAD);
              } catch (IOException e) {
                throw new ParquetCryptoRuntimeException(columnPath + ". Failed to decrypt column metadata", e);
              }
            }
            fileDecryptor.setColumnCryptoMetadata(columnPath, true, true, (byte[]) null, columnOrdinal);
          } else { // Column encrypted with column key
            // setColumnCryptoMetadata triggers KMS interaction, hence delayed until this column is projected
            lazyMetadataDecryption = true;
          }
        }
        String createdBy = parquetMetadata.getCreated_by();
        if (!lazyMetadataDecryption) { // full column metadata (with stats) is available
          column = buildColumnChunkMetaData(
              metaData,
              columnPath,
              messageType.getType(columnPath.toArray()).asPrimitiveType(),
              createdBy);
          column.setRowGroupOrdinal(rowGroup.getOrdinal());
          if (metaData.isSetBloom_filter_offset()) {
            column.setBloomFilterOffset(metaData.getBloom_filter_offset());
          }
          if (metaData.isSetBloom_filter_length()) {
            column.setBloomFilterLength(metaData.getBloom_filter_length());
          }
        } else { // column encrypted with column key
          // Metadata will be decrypted later, if this column is accessed
          EncryptionWithColumnKey columnKeyStruct = cryptoMetaData.getENCRYPTION_WITH_COLUMN_KEY();
          List<String> pathList = columnKeyStruct.getPath_in_schema();
          byte[] columnKeyMetadata = columnKeyStruct.getKey_metadata();
          columnPath = ColumnPath.get(pathList.toArray(new String[pathList.size()]));
          byte[] encryptedMetadataBuffer = columnChunk.getEncrypted_column_metadata();
          column = ColumnChunkMetaData.getWithEncryptedMetadata(
              this,
              columnPath,
              messageType.getType(columnPath.toArray()).asPrimitiveType(),
              encryptedMetadataBuffer,
              columnKeyMetadata,
              fileDecryptor,
              rowGroup.getOrdinal(),
              columnOrdinal,
              createdBy);
        }
        column.setColumnIndexReference(toColumnIndexReference(columnChunk));
        column.setOffsetIndexReference(toOffsetIndexReference(columnChunk));
        // TODO
        // index_page_offset
        // key_value_metadata
        blockMetaData.addColumn(column);
      }
      blockMetaData.setPath(filePath);
      blocks.add(blockMetaData);
    }
  }
  // Copy the footer's key/value metadata into a plain map
  Map<String, String> keyValueMetaData = new HashMap<String, String>();
  List<KeyValue> key_value_metadata = parquetMetadata.getKey_value_metadata();
  if (key_value_metadata != null) {
    for (KeyValue keyValue : key_value_metadata) {
      keyValueMetaData.put(keyValue.key, keyValue.value);
    }
  }
  // Determine how this file was encrypted (if at all), based on the footer
  EncryptionType encryptionType;
  if (encryptedFooter) {
    encryptionType = EncryptionType.ENCRYPTED_FOOTER;
  } else if (parquetMetadata.isSetEncryption_algorithm()) {
    encryptionType = EncryptionType.PLAINTEXT_FOOTER;
  } else {
    encryptionType = EncryptionType.UNENCRYPTED;
  }
  return new ParquetMetadata(
      new org.apache.parquet.hadoop.metadata.FileMetaData(
          messageType, keyValueMetaData, parquetMetadata.getCreated_by(), encryptionType, fileDecryptor),
      blocks);
}
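
For context, the ParquetMetadata assembled above is what callers receive when a footer is read through the public reader API. A minimal sketch of inspecting it, assuming the standard ParquetFileReader/HadoopInputFile entry points and a hypothetical local file path:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.hadoop.util.HadoopInputFile;

public class FooterInspector {
  public static void main(String[] args) throws Exception {
    // Hypothetical input path; any readable Parquet file works here.
    Path path = new Path("file:///tmp/example.parquet");
    try (ParquetFileReader reader =
        ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration()))) {
      // The Thrift footer is parsed and converted (via fromParquetMetadata above) into this object.
      ParquetMetadata footer = reader.getFooter();
      for (BlockMetaData block : footer.getBlocks()) {
        System.out.println("row group: rows=" + block.getRowCount() + ", bytes=" + block.getTotalByteSize());
        for (ColumnChunkMetaData column : block.getColumns()) {
          System.out.println("  " + column.getPath() + " codec=" + column.getCodec());
        }
      }
    }
  }
}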