in xtable-core/src/main/java/org/apache/xtable/hudi/HudiFileStatsExtractor.java [124:168]
/**
 * Enriches the supplied data files with column statistics read from the Hudi metadata table.
 *
 * <p>Each file is keyed by the pair returned from {@code getPartitionAndFileName} for its
 * physical path. For every entry in {@code nameFieldMap} the metadata table is asked for that
 * column's stats across all keyed files, and the answers are regrouped per file.
 *
 * @param metadataTable metadata table whose {@code getColumnStats} is called once per field
 * @param files data files to enrich; the stream is fully consumed by this call
 * @param nameFieldMap field name to field; the names are passed to the metadata table lookup
 * @return a stream of rebuilt files carrying the converted column stats and a record count taken
 *     from {@code getMaxFromColumnStats} (0 when that yields no value); an empty stream when
 *     {@code files} is empty
 */
private Stream<InternalDataFile> computeColumnStatsFromMetadataTable(
    HoodieTableMetadata metadataTable,
    Stream<InternalDataFile> files,
    Map<String, InternalField> nameFieldMap) {
  // Index the incoming files by lookup key so stats can be joined back to them afterwards.
  Map<Pair<String, String>, InternalDataFile> dataFilesByKey =
      files.collect(
          Collectors.toMap(
              dataFile -> getPartitionAndFileName(dataFile.getPhysicalPath()),
              Function.identity()));
  if (dataFilesByKey.isEmpty()) {
    return Stream.empty();
  }

  List<Pair<String, String>> lookupKeys = new ArrayList<>(dataFilesByKey.keySet());

  // One metadata-table lookup per field, issued from a parallel stream; every returned
  // (file key -> stat) entry is tagged with its field and then regrouped by file key.
  Map<Pair<String, String>, List<Pair<InternalField, HoodieMetadataColumnStats>>> statsByKey =
      nameFieldMap.entrySet().parallelStream()
          .flatMap(
              nameAndField ->
                  metadataTable.getColumnStats(lookupKeys, nameAndField.getKey()).entrySet()
                      .stream()
                      .map(
                          keyAndStat ->
                              Pair.of(
                                  keyAndStat.getKey(),
                                  Pair.of(nameAndField.getValue(), keyAndStat.getValue()))))
          .collect(
              Collectors.groupingBy(
                  Map.Entry::getKey,
                  Collectors.mapping(
                      Map.Entry::getValue, CustomCollectors.toList(nameFieldMap.size()))));

  // The returned stream is lazy: per-file conversion happens when the caller consumes it.
  return dataFilesByKey.entrySet().stream()
      .map(
          keyAndDataFile -> {
            InternalDataFile dataFile = keyAndDataFile.getValue();
            // Files with no entry in the grouped stats get an empty stat list.
            List<Pair<InternalField, HoodieMetadataColumnStats>> hudiStats =
                statsByKey.getOrDefault(keyAndDataFile.getKey(), Collections.emptyList());
            List<ColumnStat> convertedStats =
                hudiStats.stream()
                    .map(
                        fieldAndStat ->
                            getColumnStatFromHudiStat(
                                fieldAndStat.getLeft(), fieldAndStat.getRight()))
                    .collect(CustomCollectors.toList(hudiStats.size()));
            long recordCount = getMaxFromColumnStats(convertedStats).orElse(0L);
            return dataFile.toBuilder()
                .columnStats(convertedStats)
                .recordCount(recordCount)
                .build();
          });
}