in src/prettify/parser/Prettify.java [611:802]
/**
 * Builds a lexer for a C-like/scripting-like language from a set of feature flags.
 *
 * <p>Two ordered pattern lists are assembled and handed to {@code CreateSimpleLexer}:
 * "shortcut" patterns, whose entries carry a fourth element listing the characters
 * that can start a match, and "fallthrough" patterns, which have no such hint (or,
 * for the numeric literal pattern, carry it in addition). The order in which patterns
 * are added is significant and mirrors the order in this method.
 *
 * <p>Option keys read from {@code options}:
 * <ul>
 * <li>{@code tripleQuotedStrings} - truthy: recognise {@code '''...'''} and
 *     {@code """..."""} strings in addition to quoted strings.</li>
 * <li>{@code multiLineStrings} - truthy (and {@code tripleQuotedStrings} not set):
 *     single-, double- and back-quoted strings may span lines.</li>
 * <li>{@code verbatimStrings} - truthy: recognise {@code @"..."} strings.</li>
 * <li>{@code hashComments} - truthy: {@code #} starts a comment. Combined with
 *     {@code cStyleComments}, an {@code Integer} value greater than 1 additionally
 *     enables {@code ###...###} block comments; otherwise C preprocessor directives
 *     are recognised. In both combined cases {@code <header.h>} is treated as a
 *     string.</li>
 * <li>{@code cStyleComments} - truthy: recognise {@code //} and {@code /* *}{@code /}
 *     comments.</li>
 * <li>{@code regexLiterals} - truthy: recognise {@code /.../} regular expression
 *     literals; when {@code Util.getVariableValueAsInteger} yields a value greater
 *     than 1 the literal may contain line breaks.</li>
 * <li>{@code types} - a {@link Pattern} used as-is for type names (cast without
 *     checking).</li>
 * <li>{@code keywords} - a {@code String} of keywords separated by whitespace
 *     and/or commas. The text is spliced into a regular expression unescaped.</li>
 * </ul>
 *
 * @param options the feature flags described above; must not be {@code null}
 * @return a lexer built from the assembled shortcut and fallthrough patterns
 * @throws Exception declared by the signature; this method does not throw it
 *     explicitly itself
 */
protected CreateSimpleLexer sourceDecorator(Map<String, Object> options) throws Exception {
  List<List<Object>> shortcutStylePatterns = new ArrayList<List<Object>>();
  List<List<Object>> fallthroughStylePatterns = new ArrayList<List<Object>>();

  // ---- String literals -------------------------------------------------
  if (Util.getVariableValueAsBoolean(options.get("tripleQuotedStrings"))) {
    // '''multi-line-string''', 'single-line-string', and double-quoted
    shortcutStylePatterns.add(Arrays.asList(new Object[]{PR_STRING,
      Pattern.compile("^(?:\\'\\'\\'(?:[^\\'\\\\]|\\\\[\\s\\S]|\\'{1,2}(?=[^\\']))*(?:\\'\\'\\'|$)|\\\"\\\"\\\"(?:[^\\\"\\\\]|\\\\[\\s\\S]|\\\"{1,2}(?=[^\\\"]))*(?:\\\"\\\"\\\"|$)|\\'(?:[^\\\\\\']|\\\\[\\s\\S])*(?:\\'|$)|\\\"(?:[^\\\\\\\"]|\\\\[\\s\\S])*(?:\\\"|$))"),
      null,
      "'\""}));
  } else if (Util.getVariableValueAsBoolean(options.get("multiLineStrings"))) {
    // 'multi-line-string', "multi-line-string"
    shortcutStylePatterns.add(Arrays.asList(new Object[]{PR_STRING,
      Pattern.compile("^(?:\\'(?:[^\\\\\\']|\\\\[\\s\\S])*(?:\\'|$)|\\\"(?:[^\\\\\\\"]|\\\\[\\s\\S])*(?:\\\"|$)|\\`(?:[^\\\\\\`]|\\\\[\\s\\S])*(?:\\`|$))"),
      null,
      "'\"`"}));
  } else {
    // 'single-line-string', "single-line-string"
    shortcutStylePatterns.add(Arrays.asList(new Object[]{PR_STRING,
      Pattern.compile("^(?:\\'(?:[^\\\\\\'\r\n]|\\\\.)*(?:\\'|$)|\\\"(?:[^\\\\\\\"\r\n]|\\\\.)*(?:\\\"|$))"),
      null,
      "\"'"}));
  }
  if (Util.getVariableValueAsBoolean(options.get("verbatimStrings"))) {
    // verbatim-string-literal production from the C# grammar. See issue 93.
    fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_STRING,
      Pattern.compile("^@\\\"(?:[^\\\"]|\\\"\\\")*(?:\\\"|$)"),
      null}));
  }

  // ---- Comments --------------------------------------------------------
  Object hc = options.get("hashComments");
  if (Util.getVariableValueAsBoolean(hc)) {
    if (Util.getVariableValueAsBoolean(options.get("cStyleComments"))) {
      if ((hc instanceof Integer) && (Integer) hc > 1) { // multiline hash comments
        shortcutStylePatterns.add(Arrays.asList(new Object[]{PR_COMMENT,
          Pattern.compile("^#(?:##(?:[^#]|#(?!##))*(?:###|$)|.*)"),
          null,
          "#"}));
      } else {
        // Stop C preprocessor declarations at an unclosed open comment
        shortcutStylePatterns.add(Arrays.asList(new Object[]{PR_COMMENT,
          Pattern.compile("^#(?:(?:define|e(?:l|nd)if|else|error|ifn?def|include|line|pragma|undef|warning)\\b|[^\r\n]*)"),
          null,
          "#"}));
      }
      // #include <stdio.h>
      fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_STRING,
        Pattern.compile("^<(?:(?:(?:\\.\\.\\/)*|\\/?)(?:[\\w-]+(?:\\/[\\w-]+)+)?[\\w-]+\\.h(?:h|pp|\\+\\+)?|[a-z]\\w*)>"),
        null}));
    } else {
      shortcutStylePatterns.add(Arrays.asList(new Object[]{PR_COMMENT,
        Pattern.compile("^#[^\r\n]*"),
        null,
        "#"}));
    }
  }
  if (Util.getVariableValueAsBoolean(options.get("cStyleComments"))) {
    fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_COMMENT,
      Pattern.compile("^\\/\\/[^\r\n]*"),
      null}));
    fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_COMMENT,
      Pattern.compile("^\\/\\*[\\s\\S]*?(?:\\*\\/|$)"),
      null}));
  }

  // ---- Regular expression literals -------------------------------------
  Object regexLiterals = options.get("regexLiterals");
  // Evaluated once here and reused below when building the punctuation pattern.
  final boolean regexLiteralsEnabled = Util.getVariableValueAsBoolean(regexLiterals);
  if (regexLiteralsEnabled) {
    // Javascript treat true as 1
    final String regexExcls = Util.getVariableValueAsInteger(regexLiterals) > 1
      ? "" // Multiline regex literals
      : "\n\r";
    // When line breaks are excluded '.' is enough; otherwise match any character.
    final String regexAny = !regexExcls.isEmpty() ? "." : "[\\S\\s]";
    final String regexLiteral =
      // A regular expression literal starts with a slash that is
      // not followed by * or / so that it is not confused with
      // comments.
      "/(?=[^/*" + regexExcls + "])"
      // and then contains any number of raw characters,
      + "(?:[^/\\x5B\\x5C" + regexExcls + "]"
      // escape sequences (\x5C),
      + "|\\x5C" + regexAny
      // or non-nesting character sets (\x5B\x5D);
      + "|\\x5B(?:[^\\x5C\\x5D" + regexExcls + "]"
      + "|\\x5C" + regexAny + ")*(?:\\x5D|$))+"
      // finally closed by a /.
      + "/";
    // Note: this entry deliberately has only two elements (style, pattern).
    fallthroughStylePatterns.add(Arrays.asList(new Object[]{"lang-regex",
      Pattern.compile("^" + REGEXP_PRECEDER_PATTERN + "(" + regexLiteral + ")")}));
  }

  // ---- Types and keywords ----------------------------------------------
  Pattern types = (Pattern) options.get("types");
  if (Util.getVariableValueAsBoolean(types)) {
    fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_TYPE, types}));
  }
  String keywords = (String) options.get("keywords");
  if (keywords != null) {
    // Strip a single leading and/or trailing space before splitting.
    keywords = keywords.replaceAll("^ | $", "");
    if (keywords.length() != 0) {
      // Runs of whitespace/commas become regex alternation separators.
      fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_KEYWORD,
        Pattern.compile("^(?:" + keywords.replaceAll("[\\s,]+", "|") + ")\\b"),
        null}));
    }
  }

  // ---- Whitespace ------------------------------------------------------
  shortcutStylePatterns.add(Arrays.asList(new Object[]{PR_PLAIN,
    Pattern.compile("^\\s+"),
    null,
    " \r\n\t" + Character.toString((char) 0xA0)
  }));

  // ---- Identifiers and literals ----------------------------------------
  // TODO(mikesamuel): recognize non-latin letters and numerals in idents
  fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_LITERAL,
    Pattern.compile("^@[a-z_$][a-z_$@0-9]*", Pattern.CASE_INSENSITIVE),
    null}));
  fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_TYPE,
    Pattern.compile("^(?:[@_]?[A-Z]+[a-z][A-Za-z_$@0-9]*|\\w+_t\\b)"),
    null}));
  fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_PLAIN,
    Pattern.compile("^[a-z_$][a-z_$@0-9]*", Pattern.CASE_INSENSITIVE),
    null}));
  fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_LITERAL,
    Pattern.compile("^(?:"
    // A hex number
    + "0x[a-f0-9]+"
    // or an octal or decimal number; the second alternative covers a
    // leading-dot decimal such as .5 (one or more digits after the dot --
    // previously the '+' was escaped and matched a literal plus sign),
    + "|(?:\\d(?:_\\d+)*\\d*(?:\\.\\d*)?|\\.\\d+)"
    // possibly in scientific notation
    + "(?:e[+\\-]?\\d+)?"
    + ")"
    // with an optional modifier like UL for unsigned long
    + "[a-z]*", Pattern.CASE_INSENSITIVE),
    null,
    "0123456789"}));
  // Don't treat escaped quotes in bash as starting strings.
  // See issue 144.
  fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_PLAIN,
    Pattern.compile("^\\\\[\\s\\S]?"),
    null}));

  // ---- Punctuation -----------------------------------------------------
  // The Bash man page says
  // A word is a sequence of characters considered as a single
  // unit by GRUB. Words are separated by metacharacters,
  // which are the following plus space, tab, and newline: { }
  // | & $ ; < >
  // ...
  // A word beginning with # causes that word and all remaining
  // characters on that line to be ignored.
  // which means that only a '#' after /(?:^|[{}|&$;<>\s])/ starts a
  // comment but empirically
  // $ echo {#}
  // {#}
  // $ echo \$#
  // $#
  // $ echo }#
  // }#
  // so /(?:^|[|&;<>\s])/ is more appropriate.
  // http://gcc.gnu.org/onlinedocs/gcc-2.95.3/cpp_1.html#SEC3
  // suggests that this definition is compatible with a
  // default mode that tries to use a single token definition
  // to recognize both bash/python style comments and C
  // preprocessor directives.
  // This definition of punctuation does not include # in the list of
  // follow-on exclusions, so # will not be broken before if preceded
  // by a punctuation character. We could try to exclude # after
  // [|&;<>] but that doesn't seem to cause many major problems.
  // If that does turn out to be a problem, we should change the below
  // when hc is truthy to include # in the run of punctuation characters
  // only when not following [|&;<>].
  String punctuation = "^.[^\\s\\w.$@'\"`/\\\\]*";
  if (regexLiteralsEnabled) {
    // Do not let a punctuation run end right before a regex literal's slash.
    punctuation += "(?!\\s*/)";
  }
  fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_PUNCTUATION,
    Pattern.compile(punctuation),
    null}));

  return new CreateSimpleLexer(shortcutStylePatterns, fallthroughStylePatterns);
}