protected CreateSimpleLexer sourceDecorator()

in src/prettify/parser/Prettify.java [611:802]


  /**
   * Builds a lexer for a C-like / scripting-like source language from a set of
   * option flags.
   *
   * <p>Two ordered pattern lists are assembled and handed to
   * {@link CreateSimpleLexer}. Each entry is a list of the form
   * {@code [style, pattern, null, shortcutChars]}; the trailing elements are
   * omitted on some entries. The fourth element is a string of characters
   * (presumably the characters that may begin a match; confirm against
   * {@code CreateSimpleLexer}).
   *
   * <p>Option keys read from {@code options}:
   * <ul>
   *   <li>{@code tripleQuotedStrings} - triple-quoted, single-quoted and
   *       double-quoted strings; takes precedence over
   *       {@code multiLineStrings}.</li>
   *   <li>{@code multiLineStrings} - single-, double- and backtick-quoted
   *       strings that may span lines. When neither string option is set,
   *       single-line single/double quoted strings are used.</li>
   *   <li>{@code verbatimStrings} - C# style {@code @"..."} strings.</li>
   *   <li>{@code hashComments} - {@code #} comments; an {@code Integer} value
   *       greater than 1 combined with {@code cStyleComments} enables
   *       {@code ###} multi-line comments.</li>
   *   <li>{@code cStyleComments} - line comments starting with two slashes and
   *       slash-star block comments; together with {@code hashComments} also
   *       preprocessor directives and {@code <header.h>} include strings.</li>
   *   <li>{@code regexLiterals} - slash-delimited regex literals; an integer
   *       value greater than 1 allows them to span lines.</li>
   *   <li>{@code types} - a {@link Pattern} matched as {@code PR_TYPE}.</li>
   *   <li>{@code keywords} - a space/comma separated keyword list.</li>
   * </ul>
   *
   * @param options option map; must not be {@code null} (it is dereferenced
   *                unconditionally)
   * @return a lexer built from the shortcut and fall-through pattern lists
   * @throws Exception declared by the signature; nothing in this body throws a
   *                   checked exception directly, so this presumably comes from
   *                   the helpers or is kept for overriders - confirm
   */
  protected CreateSimpleLexer sourceDecorator(Map<String, Object> options) throws Exception {
    List<List<Object>> shortcutStylePatterns = new ArrayList<List<Object>>();
    List<List<Object>> fallthroughStylePatterns = new ArrayList<List<Object>>();

    // ---- string literals: exactly one of the three flavours is installed ----
    if (Util.getVariableValueAsBoolean(options.get("tripleQuotedStrings"))) {
      // '''multi-line-string''', 'single-line-string', and double-quoted
      shortcutStylePatterns.add(Arrays.asList(new Object[]{PR_STRING,
                Pattern.compile("^(?:\\'\\'\\'(?:[^\\'\\\\]|\\\\[\\s\\S]|\\'{1,2}(?=[^\\']))*(?:\\'\\'\\'|$)|\\\"\\\"\\\"(?:[^\\\"\\\\]|\\\\[\\s\\S]|\\\"{1,2}(?=[^\\\"]))*(?:\\\"\\\"\\\"|$)|\\'(?:[^\\\\\\']|\\\\[\\s\\S])*(?:\\'|$)|\\\"(?:[^\\\\\\\"]|\\\\[\\s\\S])*(?:\\\"|$))"),
                null,
                "'\""}));
    } else if (Util.getVariableValueAsBoolean(options.get("multiLineStrings"))) {
      // 'multi-line-string', "multi-line-string", `multi-line-string`
      shortcutStylePatterns.add(Arrays.asList(new Object[]{PR_STRING,
                Pattern.compile("^(?:\\'(?:[^\\\\\\']|\\\\[\\s\\S])*(?:\\'|$)|\\\"(?:[^\\\\\\\"]|\\\\[\\s\\S])*(?:\\\"|$)|\\`(?:[^\\\\\\`]|\\\\[\\s\\S])*(?:\\`|$))"),
                null,
                "'\"`"}));
    } else {
      // 'single-line-string', "single-line-string"
      shortcutStylePatterns.add(Arrays.asList(new Object[]{PR_STRING,
                Pattern.compile("^(?:\\'(?:[^\\\\\\'\r\n]|\\\\.)*(?:\\'|$)|\\\"(?:[^\\\\\\\"\r\n]|\\\\.)*(?:\\\"|$))"),
                null,
                "\"'"}));
    }
    if (Util.getVariableValueAsBoolean(options.get("verbatimStrings"))) {
      // verbatim-string-literal production from the C# grammar.  See issue 93.
      fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_STRING,
                Pattern.compile("^@\\\"(?:[^\\\"]|\\\"\\\")*(?:\\\"|$)"),
                null}));
    }

    // ---- comments ----
    Object hc = options.get("hashComments");
    if (Util.getVariableValueAsBoolean(hc)) {
      if (Util.getVariableValueAsBoolean(options.get("cStyleComments"))) {
        if ((hc instanceof Integer) && (Integer) hc > 1) {  // multiline hash comments
          shortcutStylePatterns.add(Arrays.asList(new Object[]{PR_COMMENT,
                    Pattern.compile("^#(?:##(?:[^#]|#(?!##))*(?:###|$)|.*)"),
                    null,
                    "#"}));
        } else {
          // Stop C preprocessor declarations at an unclosed open comment
          shortcutStylePatterns.add(Arrays.asList(new Object[]{PR_COMMENT,
                    Pattern.compile("^#(?:(?:define|e(?:l|nd)if|else|error|ifn?def|include|line|pragma|undef|warning)\\b|[^\r\n]*)"),
                    null,
                    "#"}));
        }
        // #include <stdio.h>
        fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_STRING,
                  Pattern.compile("^<(?:(?:(?:\\.\\.\\/)*|\\/?)(?:[\\w-]+(?:\\/[\\w-]+)+)?[\\w-]+\\.h(?:h|pp|\\+\\+)?|[a-z]\\w*)>"),
                  null}));
      } else {
        // Plain hash comment running to the end of the line.
        shortcutStylePatterns.add(Arrays.asList(new Object[]{PR_COMMENT,
                  Pattern.compile("^#[^\r\n]*"),
                  null,
                  "#"}));
      }
    }
    if (Util.getVariableValueAsBoolean(options.get("cStyleComments"))) {
      // Line comment.
      fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_COMMENT,
                Pattern.compile("^\\/\\/[^\r\n]*"),
                null}));

      // Block comment, tolerating a missing terminator at end of input.
      fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_COMMENT,
                Pattern.compile("^\\/\\*[\\s\\S]*?(?:\\*\\/|$)"),
                null}));
    }

    // ---- regular expression literals ----
    Object regexLiterals = options.get("regexLiterals");
    if (Util.getVariableValueAsBoolean(regexLiterals)) {
      // Javascript treat true as 1
      String regexExcls = Util.getVariableValueAsInteger(regexLiterals) > 1
              ? "" // Multiline regex literals
              : "\n\r";
      // When line breaks are excluded "." suffices; otherwise match any char.
      String regexAny = !regexExcls.isEmpty() ? "." : "[\\S\\s]";
      String REGEX_LITERAL =
              // A regular expression literal starts with a slash that is
              // not followed by * or / so that it is not confused with
              // comments.
              "/(?=[^/*" + regexExcls + "])"
              // and then contains any number of raw characters,
              + "(?:[^/\\x5B\\x5C" + regexExcls + "]"
              // escape sequences (\x5C),
              + "|\\x5C" + regexAny
              // or non-nesting character sets (\x5B\x5D);
              + "|\\x5B(?:[^\\x5C\\x5D" + regexExcls + "]"
              + "|\\x5C" + regexAny + ")*(?:\\x5D|$))+"
              // finally closed by a /.
              + "/";
      fallthroughStylePatterns.add(Arrays.asList(new Object[]{"lang-regex",
                Pattern.compile("^" + REGEXP_PRECEDER_PATTERN + "(" + REGEX_LITERAL + ")")}));
    }

    // ---- caller-supplied types and keywords ----
    Pattern types = (Pattern) options.get("types");
    if (Util.getVariableValueAsBoolean(types)) {
      fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_TYPE, types}));
    }

    String keywords = (String) options.get("keywords");
    if (keywords != null) {
      // Strip one leading / trailing space so no empty alternative is produced.
      keywords = keywords.replaceAll("^ | $", "");
      if (keywords.length() != 0) {
        fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_KEYWORD,
                  Pattern.compile("^(?:" + keywords.replaceAll("[\\s,]+", "|") + ")\\b"),
                  null}));
      }
    }

    // Whitespace (including the non-breaking space 0xA0) is plain text.
    shortcutStylePatterns.add(Arrays.asList(new Object[]{PR_PLAIN,
              Pattern.compile("^\\s+"),
              null,
              " \r\n\t" + Character.toString((char) 0xA0)
            }));

    // TODO(mikesamuel): recognize non-latin letters and numerals in idents
    fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_LITERAL,
              Pattern.compile("^@[a-z_$][a-z_$@0-9]*", Pattern.CASE_INSENSITIVE),
              null}));
    fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_TYPE,
              Pattern.compile("^(?:[@_]?[A-Z]+[a-z][A-Za-z_$@0-9]*|\\w+_t\\b)"),
              null}));
    fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_PLAIN,
              Pattern.compile("^[a-z_$][a-z_$@0-9]*", Pattern.CASE_INSENSITIVE),
              null}));
    fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_LITERAL,
              Pattern.compile("^(?:"
              // A hex number
              + "0x[a-f0-9]+"
              // or an octal or decimal number; the second alternative is a
              // leading-dot decimal such as .5 (a dot followed by one or
              // more digits),
              + "|(?:\\d(?:_\\d+)*\\d*(?:\\.\\d*)?|\\.\\d+)"
              // possibly in scientific notation
              + "(?:e[+\\-]?\\d+)?"
              + ")"
              // with an optional modifier like UL for unsigned long
              + "[a-z]*", Pattern.CASE_INSENSITIVE),
              null,
              "0123456789"}));
    // Don't treat escaped quotes in bash as starting strings.
    // See issue 144.
    fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_PLAIN,
              Pattern.compile("^\\\\[\\s\\S]?"),
              null}));

    // The Bash man page says

    // A word is a sequence of characters considered as a single
    // unit by GRUB. Words are separated by metacharacters,
    // which are the following plus space, tab, and newline: { }
    // | & $ ; < >
    // ...

    // A word beginning with # causes that word and all remaining
    // characters on that line to be ignored.

    // which means that only a '#' after /(?:^|[{}|&$;<>\s])/ starts a
    // comment but empirically
    // $ echo {#}
    // {#}
    // $ echo \$#
    // $#
    // $ echo }#
    // }#

    // so /(?:^|[|&;<>\s])/ is more appropriate.

    // http://gcc.gnu.org/onlinedocs/gcc-2.95.3/cpp_1.html#SEC3
    // suggests that this definition is compatible with a
    // default mode that tries to use a single token definition
    // to recognize both bash/python style comments and C
    // preprocessor directives.

    // This definition of punctuation does not include # in the list of
    // follow-on exclusions, so # will not be broken before if preceded
    // by a punctuation character.  We could try to exclude # after
    // [|&;<>] but that doesn't seem to cause many major problems.
    // If that does turn out to be a problem, we should change the below
    // when hc is truthy to include # in the run of punctuation characters
    // only when not following [|&;<>].
    String punctuation = "^.[^\\s\\w.$@'\"`/\\\\]*";
    if (Util.getVariableValueAsBoolean(regexLiterals)) {
        // Leave a trailing slash alone so it can start a regex literal.
        punctuation += "(?!\\s*/)";
    }
    fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_PUNCTUATION,
              Pattern.compile(punctuation),
              null}));

    return new CreateSimpleLexer(shortcutStylePatterns, fallthroughStylePatterns);
  }