in java/tsfile/src/main/java/org/apache/tsfile/common/regexp/LikeMatcher.java [58:144]
public static LikeMatcher compile(String pattern, Optional<Character> escape, boolean optimize) {
List<Pattern> parsed = parse(pattern, escape);
// Calculate minimum and maximum size for candidate strings
// This is used for short-circuiting the match if the size of
// the input is outside those bounds
int minSize = 0;
int maxSize = 0;
boolean unbounded = false;
for (Pattern expression : parsed) {
if (expression instanceof Literal) {
Literal literal = (Literal) expression;
int length = literal.getValue().getBytes(UTF_8).length;
minSize += length;
maxSize += length;
} else if (expression instanceof ZeroOrMore) {
unbounded = true;
} else if (expression instanceof Any) {
Any any = (Any) expression;
int length = any.getLength();
minSize += length;
maxSize += length * 4; // at most 4 bytes for a single UTF-8 codepoint
}
}
// Calculate exact match prefix and suffix
// If the pattern starts and ends with a literal, we can perform a quick
// exact match to short-circuit DFA evaluation
byte[] prefix = new byte[0];
byte[] suffix = new byte[0];
int patternStart = 0;
int patternEnd = parsed.size() - 1;
if (parsed.size() > 0 && parsed.get(0) instanceof Literal) {
Literal literal = (Literal) parsed.get(0);
prefix = literal.getValue().getBytes(UTF_8);
patternStart++;
}
if (parsed.size() > 1 && parsed.get(parsed.size() - 1) instanceof Literal) {
Literal literal = (Literal) parsed.get(parsed.size() - 1);
suffix = literal.getValue().getBytes(UTF_8);
patternEnd--;
}
// If the pattern (after excluding constant prefix/suffixes) ends with an unbounded match (i.e.,
// %)
// we can perform a non-exact match and end as soon as the DFA reaches an accept state -- there
// is no need to consume the remaining input
// This section determines whether the pattern is a candidate for non-exact match.
boolean exact = true; // whether to match to the end of the input
if (patternStart <= patternEnd && parsed.get(patternEnd) instanceof ZeroOrMore) {
// guaranteed to be Any or ZeroOrMore because any Literal would've been turned into a suffix
// above
exact = false;
patternEnd--;
}
Optional<Matcher> matcher = Optional.empty();
if (patternStart <= patternEnd) {
boolean hasAny = false;
for (int i = patternStart; i <= patternEnd; i++) {
Pattern item = parsed.get(i);
if (item instanceof Any) {
hasAny = true;
break;
}
}
if (hasAny) {
if (optimize) {
matcher = Optional.of(new DenseDfaMatcher(parsed, patternStart, patternEnd, exact));
} else {
matcher = Optional.of(new NfaMatcher(parsed, patternStart, patternEnd, exact));
}
} else {
matcher = Optional.of(new FjsMatcher(parsed, patternStart, patternEnd, exact));
}
}
return new LikeMatcher(
minSize,
unbounded ? OptionalInt.empty() : OptionalInt.of(maxSize),
prefix,
suffix,
matcher);
}