public static GlobNodeSequence parse()

in parquet-common/src/main/java/org/apache/parquet/glob/GlobParser.java [36:127]


  public static GlobNodeSequence parse(String pattern) {
    /*
     * Overview, using "apache{one,pre{x,y}post,two}parquet{a,b}" as the running example:
     *
     * - Walk the pattern left to right while tracking the brace nesting depth.
     *
     * - An opening { seen at depth zero marks the start of a top-level group; its
     *   partner is the } that brings the depth back down to zero.
     *
     * - When that closing } is reached, the text between the end of the previous
     *   group (or the start of the pattern) and the opening { is emitted as an Atom
     *   ("apache" in the example), unless it is empty. The text between the two
     *   braces ("one,pre{x,y}post,two") is then handed to parseOneOf and the result
     *   is appended to the sequence.
     *
     * - Scanning resumes right after the }, so "parquet" becomes the next Atom and
     *   "a,b" the next parseOneOf call.
     *
     * - Whatever text remains after the last group (or the whole pattern when it
     *   contains no group at all) becomes a trailing Atom.
     */

    // an empty pattern and a lone empty group both produce a single empty Atom
    if (pattern.isEmpty() || pattern.equals("{}")) {
      return new GlobNodeSequence(Collections.singletonList(new Atom("")));
    }

    // nodes of the resulting sequence, in pattern order (may end up with one element)
    List<GlobNode> nodes = new ArrayList<GlobNode>();

    int depth = 0; // number of currently open, not yet closed braces
    int groupStart = 0; // index of the { that opened the current top-level group
    int consumed = 0; // index of the first character not yet turned into a node

    final int length = pattern.length();
    for (int pos = 0; pos < length; pos++) {
      final char ch = pattern.charAt(pos);

      if (ch == ',') {
        // commas not allowed in the top level expression
        // TODO: maybe turn this check off?
        if (depth == 0) {
          throw new GlobParseException(
              "Unexpected comma outside of a {} group:\n" + annotateMessage(pattern, pos));
        }
        continue;
      }

      if (ch == '{') {
        if (depth == 0) {
          // entering a new top-level group, remember where it begins
          groupStart = pos;
        }
        depth++;
        continue;
      }

      if (ch != '}') {
        // ordinary character, it will be picked up later as part of an Atom or a group
        continue;
      }

      if (depth == 0) {
        // a close brace with nothing open to match it
        throw new GlobParseException("Unexpected closing }:\n" + annotateMessage(pattern, pos));
      }
      depth--;

      if (depth > 0) {
        // still inside a nested group, keep looking for the top-level close brace
        continue;
      }

      // a top-level group just closed: first emit the literal text preceding it, if any
      if (groupStart != consumed) {
        String literal = pattern.substring(consumed, groupStart);
        nodes.add(new Atom(literal));
      }

      // then the group itself, with the surrounding braces stripped off
      String groupBody = pattern.substring(groupStart + 1, pos);
      nodes.add(parseOneOf(groupBody));

      // everything up to and including this close brace has been handled
      consumed = pos + 1;
    }

    if (depth > 0) {
      throw new GlobParseException("Not enough close braces in: " + pattern);
    }

    // leftover text after the last group (or the entire pattern when there was
    // no group at all) forms the final Atom
    if (consumed != length) {
      nodes.add(new Atom(pattern.substring(consumed)));
    }

    return new GlobNodeSequence(nodes);
  }