in parquet-common/src/main/java/org/apache/parquet/glob/GlobParser.java [36:127]
public static GlobNodeSequence parse(String pattern) {
  /*
   * Overview, using "apache{one,pre{x,y}post,two}parquet{a,b}" as the running example:
   *
   * - Walk the pattern one character at a time while tracking the current brace depth.
   *
   * - When a '{' is seen at depth 0, remember its index: it opens an outermost group.
   *
   * - When a '}' brings the depth back to 0, the outermost group is complete. At that
   *   point two nodes may be appended to the sequence:
   *     a) the literal text between the end of the previous group and the opening brace
   *        of this one ("apache" in the example), as an Atom, if that text is non-empty;
   *     b) the group contents without the surrounding braces ("one,pre{x,y}post,two"),
   *        handed to parseOneOf, which deals with the alternatives and any nesting.
   *
   * - Keep going until the pattern is exhausted ("parquet" then "{a,b}" in the example).
   *   Whatever literal text remains after the last group becomes a final Atom.
   *
   * An empty pattern and the pattern "{}" are both short-circuited to a sequence
   * holding a single empty Atom.
   */
  if (pattern.isEmpty() || pattern.equals("{}")) {
    return new GlobNodeSequence(Collections.singletonList(new Atom("")));
  }

  // nodes collected so far; may end up containing just one element
  final List<GlobNode> nodes = new ArrayList<>();
  final int length = pattern.length();

  int depth = 0;        // number of '{' seen that have not been closed yet
  int groupStart = 0;   // index of the '{' that opened the current outermost group
  int nextUnparsed = 0; // index of the first character not yet turned into a node

  for (int pos = 0; pos < length; pos++) {
    final char ch = pattern.charAt(pos);

    if (ch == ',') {
      // a comma is only meaningful inside a {} group, so reject it at the top level
      // TODO: maybe turn this check off?
      if (depth == 0) {
        throw new GlobParseException(
            "Unexpected comma outside of a {} group:\n" + annotateMessage(pattern, pos));
      }
    } else if (ch == '{') {
      if (depth == 0) {
        // entering an outermost group: remember where it starts
        groupStart = pos;
      }
      depth++;
    } else if (ch == '}') {
      depth--;
      if (depth < 0) {
        throw new GlobParseException("Unexpected closing }:\n" + annotateMessage(pattern, pos));
      }
      if (depth == 0) {
        // the outermost group just closed; first emit any literal text that precedes it
        // (substring's end index is exclusive, so the '{' itself is left out)
        if (nextUnparsed != groupStart) {
          nodes.add(new Atom(pattern.substring(nextUnparsed, groupStart)));
        }
        // then emit the group itself, stripped of its enclosing braces
        nodes.add(parseOneOf(pattern.substring(groupStart + 1, pos)));
        // everything up to and including this '}' has now been consumed
        nextUnparsed = pos + 1;
      }
    }
  }

  if (depth > 0) {
    throw new GlobParseException("Not enough close braces in: " + pattern);
  }

  // Trailing literal text: either the text after the final '}' or, when the pattern
  // had no groups at all, the entire input.
  if (nextUnparsed != length) {
    nodes.add(new Atom(pattern.substring(nextUnparsed)));
  }

  return new GlobNodeSequence(nodes);
}