in tools/ci/paimon-ci-tools/src/main/java/org/apache/paimon/tools/ci/licensecheck/JarFileChecker.java [192:248]
/**
 * Counts the files below {@code jarRoot} whose text content matches at least one of the
 * {@code forbidden} patterns.
 *
 * <p>Directories, files rejected by {@code isNoClassFile}, and a set of known false-positive
 * locations (see the inline comments) are skipped. The content of every remaining file is
 * lower-cased with {@link Locale#ROOT} before matching. A file whose read fails with a
 * {@link MalformedInputException} is treated as binary and contributes nothing.
 *
 * @param jar the jar being inspected; only used for the manifest check and for messages
 * @param jarRoot root of the file tree to walk
 * @param forbidden patterns that must not occur in any inspected file
 * @return the number of files with at least one match (a file counts once, no matter how many
 *     patterns it matches)
 * @throws IOException if the file tree walk cannot be started
 * @throws RuntimeException if reading a file fails with any other {@link IOException}
 */
private static int findNonBinaryFilesContainingText(
        Path jar, Path jarRoot, Collection<Pattern> forbidden) throws IOException {
    try (Stream<Path> walked = Files.walk(jarRoot)) {
        return walked
                // only regular entries below the root are of interest
                .filter(entry -> !entry.equals(jarRoot) && !Files.isDirectory(entry))
                .filter(JarFileChecker::isNoClassFile)
                // name-based exclusions:
                // - "dependencies": generated by maven, frequent false-positives due to
                //   dual-licensing
                // - "license*": false-positives due to dual-licensing; prefix match also
                //   covers .txt/.md variants
                // - "notice*": false-positives due to optional components; prefix match also
                //   covers .txt/.md variants
                .filter(
                        entry ->
                                !getFileName(entry).equals("dependencies")
                                        && !getFileName(entry).startsWith("license")
                                        && !getFileName(entry).startsWith("notice"))
                // javax.xml.bind is dual-licensed under GPL 2 and CDDL 1.1;
                // contained in hadoop/presto S3 FS and paimon-dist
                .filter(
                        entry ->
                                !pathStartsWith(entry, "/META-INF/versions/11/javax/xml/bind")
                                        && !isJavaxManifest(jar, entry))
                // jersey internals are dual-licensed under GPL 2 and EPL 2.0;
                // contained in sql-avro-confluent-registry
                .filter(entry -> !pathStartsWith(entry, "/org/glassfish/jersey/internal"))
                .mapToInt(
                        entry -> {
                            final String lowered;
                            try {
                                lowered = readFile(entry).toLowerCase(Locale.ROOT);
                            } catch (MalformedInputException notText) {
                                // content is not decodable as text, i.e. a binary file
                                return 0;
                            } catch (IOException e) {
                                throw new RuntimeException(
                                        String.format(
                                                "Could not read contents of file '%s' in jar '%s'.",
                                                entry, jar),
                                        e);
                            }

                            // Every matching pattern is logged, but the file is counted at
                            // most once: several patterns may be aliases for the same
                            // license, and counting each would be confusing.
                            boolean flagged = false;
                            for (Pattern candidate : forbidden) {
                                if (candidate.matcher(lowered).find()) {
                                    flagged = true;
                                    LOG.error(
                                            "File '{}' in jar '{}' contains match with forbidden regex '{}'.",
                                            entry,
                                            jar,
                                            candidate);
                                }
                            }
                            return flagged ? 1 : 0;
                        })
                .sum();
    }
}