public static SanitizeResult sanitizer()

in app/src/main/java/org/apache/roller/weblogger/util/HTMLSanitizer.java [122:366]


    /**
     * Sanitizes a piece of markup token by token and collects the outcome in a
     * {@code SanitizeResult}.
     *
     * <p>The input is split with {@code tokenize(html)} and every token is classified in this
     * order:
     * <ul>
     *   <li><b>Comment</b> (matches {@code commentPattern}): appended to {@code val} (with a
     *       closing {@code -->} added when missing) and recorded in {@code invalidTags}; nothing
     *       is written to {@code html} or {@code text}.</li>
     *   <li><b>Opening tag</b> (matches {@code tagStartPattern}); the tag name is lower-cased:
     *     <ul>
     *       <li>name matches {@code forbiddenTags}: recorded in {@code invalidTags} and dropped
     *           from every output;</li>
     *       <li>name matches {@code allowedTags}: the tag is rebuilt from its attributes.
     *           {@code <a href>} and {@code <img|embed src>} must pass an http/https
     *           {@code UrlValidator} check (for {@code <a>}, a {@code mailto:} value is accepted
     *           when {@code "http://www." + <part after '@'>} validates); an invalid value is
     *           recorded and replaced by an empty string. {@code href}/{@code src} on any other
     *           tag, non-numeric {@code width}/{@code height} and {@code on*} attributes are
     *           recorded and skipped. {@code style} declarations are filtered one by one. Every
     *           other attribute value goes through {@code encode}. {@code a}, {@code img} and
     *           {@code embed} tags without a valid URL are discarded. Table parts outside the
     *           required open ancestor ({@code table} / {@code tr}) are recorded and dropped;</li>
     *       <li>any other name: recorded in {@code invalidTags} and appended to {@code val}
     *           only.</li>
     *     </ul>
     *   </li>
     *   <li><b>Closing tag</b> (matches {@code tagClosePattern}): self-closed and forbidden tags
     *       are recorded and dropped; unknown tags are recorded and appended to {@code val} only;
     *       for an allowed tag every open tag down to and including the matching one is popped
     *       and closed.</li>
     *   <li><b>Anything else</b>: appended unchanged to {@code val}, and through
     *       {@code htmlEncodeApexesAndTags} to {@code html} and (after {@code removeLineFeed})
     *       to {@code text}.</li>
     * </ul>
     * Tags still open after the last token are closed in {@code html} and {@code val}.
     * {@code isValid} is set to {@code true} only when {@code invalidTags} is empty.
     *
     * @param html          the markup to sanitize; handed as-is to {@code tokenize}
     * @param allowedTags   pattern searched ({@code find}) in the lower-cased tag name to decide
     *                      whether a tag is accepted
     * @param forbiddenTags pattern searched ({@code find}) in the lower-cased tag name to decide
     *                      whether a tag is removed; evaluated before {@code allowedTags}
     * @return the populated result object
     */
    public static SanitizeResult sanitizer(String html, Pattern allowedTags, Pattern forbiddenTags) {
        SanitizeResult ret = new SanitizeResult();

        // java.util.Stack is kept on purpose: the algorithm relies on Stack.search(),
        // which returns the 1-based distance from the top (or -1 when absent).
        Stack<String> openTags = new Stack<>();

        // Tag/attribute/style names are compared against ASCII literals and patterns, so they
        // must be lower-cased independently of the JVM default locale (under a Turkish locale
        // "SCRIPT".toLowerCase() yields "scr\u0131pt" and would slip past the checks).
        final java.util.Locale caseLocale = java.util.Locale.ROOT;

        // One validator for the whole run instead of a new instance per attribute.
        final UrlValidator urlValidator = new UrlValidator(new String[] {"http", "https"});

        // Accumulate output in builders (seeded with the current field values, with the same
        // null handling as string concatenation) to avoid quadratic copying on large inputs.
        StringBuilder valOut = new StringBuilder().append(ret.val);
        StringBuilder htmlOut = new StringBuilder().append(ret.html);
        StringBuilder textOut = new StringBuilder().append(ret.text);

        List<String> tokens = tokenize(html);

        // -------------------   LOOP for every token --------------------------
        for (String token : tokens) {
            boolean isAcceptedToken = false;
            // what is finally written to the outputs for this token
            String emitted = token;

            Matcher startMatcher = tagStartPattern.matcher(token);
            Matcher endMatcher = tagClosePattern.matcher(token);

            //--------------------------------------------------------------------------------  COMMENT    <!-- ......... -->
            if (commentPattern.matcher(token).find()) {
                String comment = token.endsWith("-->") ? token : token + "-->";
                valOut.append(comment);
                ret.invalidTags.add(comment);
                continue;

                //--------------------------------------------------------------------------------  OPEN TAG    <tag .........>
            } else if (startMatcher.find()) {

                // tag name extraction
                String tag = startMatcher.group(1).toLowerCase(caseLocale);

                //-----------------------------------------------------  FORBIDDEN TAG   <script .........>
                if (forbiddenTags.matcher(tag).find()) {
                    ret.invalidTags.add("<" + tag + ">");
                    continue;

                    // --------------------------------------------------  WELL KNOWN TAG
                } else if (allowedTags.matcher(tag).find()) {

                    String tokenBody = startMatcher.group(2);

                    // first test table consistency: table parts are only kept while the
                    // required ancestor is currently open
                    if ("thead".equals(tag) || "tbody".equals(tag) || "tfoot".equals(tag) || "tr".equals(tag)) {
                        if (openTags.search("table") < 1) {
                            ret.invalidTags.add("<" + tag + ">");
                            continue;
                        }
                    } else if (("td".equals(tag) || "th".equals(tag)) && openTags.search("tr") < 1) {
                        ret.invalidTags.add("<" + tag + ">");
                        continue;
                    }

                    boolean isAnchor = "a".equals(tag);
                    boolean isMedia = "img".equals(tag) || "embed".equals(tag);

                    // then rebuild the tag from its attributes
                    StringBuilder cleanToken = new StringBuilder("<").append(tag);
                    Matcher attributes = attributesPattern.matcher(tokenBody);

                    // set once a valid URL was seen on <a>, <img> or <embed>
                    boolean foundURL = false;
                    while (attributes.find()) {

                        String attr = attributes.group(1).toLowerCase(caseLocale);
                        String val = attributes.group(2);

                        if (isAnchor && "href".equals(attr)) {
                            // <a href="......">
                            boolean validTarget = urlValidator.isValid(val);
                            if (!validTarget
                                    && val.toLowerCase(caseLocale).startsWith("mailto:")
                                    && val.indexOf('@') >= 0) {
                                // case <a href="mailto:pippo@pippo.com?subject=...."
                                // validate the part after '@' as if it were a web address
                                validTarget = urlValidator.isValid(
                                        "http://www." + val.substring(val.indexOf('@') + 1));
                            }
                            if (validTarget) {
                                foundURL = true;
                            } else {
                                ret.invalidTags.add(attr + " " + val);
                                val = "";
                            }

                        } else if (isMedia && "src".equals(attr)) {
                            // <img src="......">
                            if (urlValidator.isValid(val)) {
                                foundURL = true;
                            } else {
                                ret.invalidTags.add(attr + " " + val);
                                val = "";
                            }

                        } else if ("href".equals(attr) || "src".equals(attr)) {
                            // <tag src/href="......">   skipped
                            ret.invalidTags.add(tag + " " + attr + " " + val);
                            continue;

                        } else if ("width".equals(attr) || "height".equals(attr)) {
                            // <tag width/height="......">  only digits, optionally followed by '%'
                            if (!val.matches("\\d+%|\\d+$")) {
                                ret.invalidTags.add(tag + " " + attr + " " + val);
                                continue;
                            }

                        } else if ("style".equals(attr)) {
                            // <tag style="......">  keep only the declarations that pass the checks
                            Matcher styles = stylePattern.matcher(val);
                            StringBuilder cleanStyle = new StringBuilder();

                            while (styles.find()) {
                                String styleName = styles.group(1).toLowerCase(caseLocale);
                                String styleValue = styles.group(2);

                                // suppress invalid styles values
                                if (forbiddenStylePattern.matcher(styleValue).find()) {
                                    ret.invalidTags.add(tag + " " + attr + " " + styleValue);
                                    continue;
                                }

                                // a url(...) inside the value must be a valid http/https URL
                                Matcher urlStyleMatcher = urlStylePattern.matcher(styleValue);
                                if (urlStyleMatcher.find()) {
                                    String url = urlStyleMatcher.group(1);
                                    if (!urlValidator.isValid(url)) {
                                        ret.invalidTags.add(tag + " " + attr + " " + styleValue);
                                        continue;
                                    }
                                }

                                cleanStyle.append(styleName).append(':').append(encode(styleValue)).append(';');
                            }
                            val = cleanStyle.toString();

                        } else if (attr.startsWith("on")) {
                            // skip all javascript events
                            ret.invalidTags.add(tag + " " + attr + " " + val);
                            continue;

                        } else {
                            // by default encode all properties
                            val = encode(val);
                        }

                        cleanToken.append(' ').append(attr).append("=\"").append(val).append('"');
                    }
                    cleanToken.append('>');

                    isAcceptedToken = true;
                    emitted = cleanToken.toString();

                    // <a>, <img> and <embed> are useless (and dropped) without a valid URL
                    if ((isAnchor || isMedia) && !foundURL) {
                        isAcceptedToken = false;
                        emitted = "";
                    }

                    // push the tag if require closure and it is accepted (otherwise is encoded)
                    if (isAcceptedToken && !(standAloneTags.matcher(tag).find() || selfClosed.matcher(tag).find())) {
                        openTags.push(tag);
                    }

                    // --------------------------------------------------------------------------------  UNKNOWN TAG
                } else {
                    ret.invalidTags.add(token);
                    valOut.append(token);
                    continue;
                }

                // --------------------------------------------------------------------------------  CLOSE TAG </tag>
            } else if (endMatcher.find()) {
                String tag = endMatcher.group(1).toLowerCase(caseLocale);

                // is self closing
                if (selfClosed.matcher(tag).find()) {
                    ret.invalidTags.add(token);
                    continue;
                }
                if (forbiddenTags.matcher(tag).find()) {
                    ret.invalidTags.add("/" + tag);
                    continue;
                }
                if (!allowedTags.matcher(tag).find()) {
                    ret.invalidTags.add(token);
                    valOut.append(token);
                    continue;
                }

                // Close every tag opened after the matching one, then the matching one itself.
                // When the tag is not on the stack search() returns -1 and nothing is emitted.
                StringBuilder closers = new StringBuilder();
                int pos = openTags.search(tag);
                for (int i = 1; i <= pos; i++) {
                    closers.append("</").append(openTags.pop()).append('>');
                    isAcceptedToken = true;
                }
                emitted = closers.toString();
            }

            valOut.append(emitted);

            if (isAcceptedToken) {
                htmlOut.append(emitted);
            } else {
                // not accepted as markup: write it encoded
                htmlOut.append(htmlEncodeApexesAndTags(emitted));
                textOut.append(htmlEncodeApexesAndTags(removeLineFeed(emitted)));
            }
        }

        // must close remaining tags
        while (!openTags.isEmpty()) {
            String poppedTag = openTags.pop();
            htmlOut.append("</").append(poppedTag).append('>');
            valOut.append("</").append(poppedTag).append('>');
        }

        ret.html = htmlOut.toString();
        ret.text = textOut.toString();
        ret.val = valOut.toString();

        // set boolean value
        ret.isValid = ret.invalidTags.isEmpty();

        return ret;
    }