private static int findNonBinaryFilesContainingText()

in tools/ci/paimon-ci-tools/src/main/java/org/apache/paimon/tools/ci/licensecheck/JarFileChecker.java [192:248]


    /**
     * Counts the files below {@code jarRoot} whose text content matches at least one of the
     * {@code forbidden} patterns.
     *
     * <p>Directories, files rejected by {@code isNoClassFile}, and a fixed set of known
     * false-positive locations (see the inline comments) are skipped. Files that cannot be
     * decoded as text are treated as binary and never count as a violation.
     *
     * @param jar the jar being inspected; only used for log and error messages and for the
     *     {@code isJavaxManifest} check
     * @param jarRoot the directory tree to walk
     * @param forbidden the patterns to search for; they are applied to the lower-cased file
     *     contents
     * @return the number of files containing at least one match (each file counts at most once)
     * @throws IOException if walking {@code jarRoot} cannot be started
     * @throws RuntimeException if the contents of a candidate file cannot be read
     */
    private static int findNonBinaryFilesContainingText(
            Path jar, Path jarRoot, Collection<Pattern> forbidden) throws IOException {
        try (Stream<Path> entries = Files.walk(jarRoot)) {
            return entries.filter(entry -> !entry.equals(jarRoot))
                    .filter(entry -> !Files.isDirectory(entry))
                    .filter(JarFileChecker::isNoClassFile)
                    // frequent false-positives due to dual-licensing; generated by maven
                    .filter(entry -> !getFileName(entry).equals("dependencies"))
                    // false-positives due to dual-licensing; use startsWith to cover .txt/.md files
                    .filter(entry -> !getFileName(entry).startsWith("license"))
                    // false-positives due to optional components; startsWith covers .txt/.md files
                    .filter(entry -> !getFileName(entry).startsWith("notice"))
                    // dual-licensed under GPL 2 and CDDL 1.1
                    // contained in hadoop/presto S3 FS and paimon-dist
                    .filter(entry -> !pathStartsWith(entry, "/META-INF/versions/11/javax/xml/bind"))
                    .filter(entry -> !isJavaxManifest(jar, entry))
                    // dual-licensed under GPL 2 and EPL 2.0
                    // contained in sql-avro-confluent-registry
                    .filter(entry -> !pathStartsWith(entry, "/org/glassfish/jersey/internal"))
                    .mapToInt(entry -> checkFileForForbiddenText(jar, entry, forbidden))
                    .sum();
        }
    }

    /**
     * Checks a single file against all {@code forbidden} patterns, logging every pattern that
     * matches.
     *
     * @return {@code 1} if at least one pattern matches the lower-cased file contents, {@code 0}
     *     if none matches or the file could not be decoded as text
     * @throws RuntimeException if reading the file fails for any other reason
     */
    private static int checkFileForForbiddenText(
            Path jar, Path file, Collection<Pattern> forbidden) {
        final String lowerCasedContents;
        try {
            lowerCasedContents = readFile(file).toLowerCase(Locale.ROOT);
        } catch (MalformedInputException notDecodable) {
            // binary file
            return 0;
        } catch (IOException e) {
            throw new RuntimeException(
                    String.format("Could not read contents of file '%s' in jar '%s'.", file, jar),
                    e);
        }

        // Every matching pattern is logged, but the file is reported only once: counting
        // individual matches would be confusing when several patterns are aliases for the same
        // license.
        boolean anyMatch = false;
        for (Pattern pattern : forbidden) {
            if (pattern.matcher(lowerCasedContents).find()) {
                anyMatch = true;
                LOG.error(
                        "File '{}' in jar '{}' contains match with forbidden regex '{}'.",
                        file,
                        jar,
                        pattern);
            }
        }
        return anyMatch ? 1 : 0;
    }