static protected RangeToken getRange()

in jena-arq/src/main/java/org/apache/jena/ext/xerces_regex/RX_Token.java [735:967]


    /**
     * Looks up the character range registered under a Unicode general
     * category name, a block name (plain or in {@code IsBlockName} form), or
     * one of the POSIX-style class names, building the lookup tables on the
     * first call.
     *
     * <p>The emptiness check, the table construction and the lookup all run
     * while holding the monitor of {@code RX_Token.categories}. The tables are
     * filled entry by entry, so checking emptiness outside the monitor would
     * let a second thread either repeat the construction or read a table that
     * is only partially filled.
     *
     * @param name     the category, block, or class name to look up
     * @param positive {@code true} to read from {@code categories},
     *                 {@code false} to read from {@code categories2}, which
     *                 holds the complemented range for each name
     * @return the registered range, or {@code null} if {@code name} has no
     *         entry in the selected table
     * @throws RuntimeException if {@link Character#getType(char)} reports a
     *         category this class has no mapping for while the tables are
     *         being built
     */
    static protected RangeToken getRange(String name, boolean positive) {
        synchronized (RX_Token.categories) {
            if (RX_Token.categories.size() == 0) {
                RX_Token.initCategoryTables();
            }
            return positive ? (RangeToken)RX_Token.categories.get(name)
                : (RangeToken)RX_Token.categories2.get(name);
        }
    }

    /**
     * Fills {@code categories} and {@code categories2} with every general
     * category, block, and POSIX-style class. The caller must hold the
     * monitor of {@code RX_Token.categories}.
     */
    private static void initCategoryTables() {
        // Build all per-category ranges first: an unknown category throws
        // here, before anything has been put into the tables.
        RX_Token[] ranges = RX_Token.buildGeneralCategoryRanges();
        for (int i = 0;  i < ranges.length;  i ++) {
            if (RX_Token.categoryNames[i] != null) {
                RX_Token.putCategory(RX_Token.categoryNames[i], ranges[i]);
            }
        }
        RX_Token.registerBlocks();
        RX_Token.registerPosixClasses(ranges);
    }

    /**
     * Registers {@code range} under {@code name} in {@code categories} and
     * its complement under the same name in {@code categories2}.
     */
    private static void putCategory(String name, RX_Token range) {
        RX_Token.categories.put(name, range);
        RX_Token.categories2.put(name, RX_Token.complementRanges(range));
    }

    /**
     * Classifies every BMP code unit, adding it both to the range of its
     * specific category and to the range of the enclosing major category.
     * The result is indexed the same way as {@code categoryNames}.
     */
    private static RX_Token[] buildGeneralCategoryRanges() {
        RX_Token[] ranges = new RX_Token[RX_Token.categoryNames.length];
        for (int i = 0;  i < ranges.length;  i ++) {
            ranges[i] = RX_Token.createRange();
        }
        for (int ch = 0;  ch < 0x10000;  ch ++) {
            int type = Character.getType((char)ch);
            if (type == Character.START_PUNCTUATION ||
                type == Character.END_PUNCTUATION) {
                // Table of Pi values: reclassify opening quotes that were
                // reported as start/end punctuation.
                if (ch == 0x00AB || ch == 0x2018 || ch == 0x201B || ch == 0x201C ||
                    ch == 0x201F || ch == 0x2039) {
                    type = CHAR_INIT_QUOTE;
                }
                // Table of Pf values: same for closing quotes.
                if (ch == 0x00BB || ch == 0x2019 || ch == 0x201D || ch == 0x203A) {
                    type = CHAR_FINAL_QUOTE;
                }
            }
            ranges[type].addRange(ch, ch);
            int major = RX_Token.majorCategoryOf(type);
            ranges[major].addRange(ch, ch);
        }
        // Everything above the BMP is filed under "unassigned". This is the
        // only place the supplementary range is added to that category.
        ranges[Character.UNASSIGNED].addRange(0x10000, RX_Token.UTF16_MAX);
        return ranges;
    }

    /**
     * Maps a specific general category to the index of its major category
     * (letter, mark, number, separator, other, punctuation, symbol).
     *
     * @throws RuntimeException if {@code type} is not a known category
     */
    private static int majorCategoryOf(int type) {
        switch (type) {
          case Character.UPPERCASE_LETTER:
          case Character.LOWERCASE_LETTER:
          case Character.TITLECASE_LETTER:
          case Character.MODIFIER_LETTER:
          case Character.OTHER_LETTER:
            return CHAR_LETTER;
          case Character.NON_SPACING_MARK:
          case Character.COMBINING_SPACING_MARK:
          case Character.ENCLOSING_MARK:
            return CHAR_MARK;
          case Character.DECIMAL_DIGIT_NUMBER:
          case Character.LETTER_NUMBER:
          case Character.OTHER_NUMBER:
            return CHAR_NUMBER;
          case Character.SPACE_SEPARATOR:
          case Character.LINE_SEPARATOR:
          case Character.PARAGRAPH_SEPARATOR:
            return CHAR_SEPARATOR;
          case Character.CONTROL:
          case Character.FORMAT:
          case Character.SURROGATE:
          case Character.PRIVATE_USE:
          case Character.UNASSIGNED:
            return CHAR_OTHER;
          case Character.CONNECTOR_PUNCTUATION:
          case Character.DASH_PUNCTUATION:
          case Character.START_PUNCTUATION:
          case Character.END_PUNCTUATION:
          case CHAR_INIT_QUOTE:
          case CHAR_FINAL_QUOTE:
          case Character.OTHER_PUNCTUATION:
            return CHAR_PUNCTUATION;
          case Character.MATH_SYMBOL:
          case Character.CURRENCY_SYMBOL:
          case Character.MODIFIER_SYMBOL:
          case Character.OTHER_SYMBOL:
            return CHAR_SYMBOL;
          default:
            throw new RuntimeException(RX_Token.class.getName()+"#getRange(): Unknown Unicode category: "+type);
        }
    }

    /**
     * Registers every block under its plain name and adds an
     * {@code Is<BlockNameWithoutSpaces>} alias for it.
     */
    private static void registerBlocks() {
        //REVISIT: do we really need to support block names as in Unicode 3.1
        //         or we can just create all the names in IsBLOCKNAME format (XML Schema REC)?
        //
        for (int i = 0;  i < RX_Token.blockNames.length;  i ++) {
            RX_Token block = RX_Token.createRange();
            if (i < NONBMP_BLOCK_START) {
                // BMP blocks: start/end pairs stored as consecutive chars.
                int location = i * 2;
                int rstart = RX_Token.blockRanges.charAt(location);
                int rend = RX_Token.blockRanges.charAt(location + 1);
                block.addRange(rstart, rend);
            } else {
                // Non-BMP blocks: start/end pairs stored as consecutive ints.
                int location = (i - NONBMP_BLOCK_START) * 2;
                block.addRange(RX_Token.nonBMPBlockRanges[location],
                               RX_Token.nonBMPBlockRanges[location + 1]);
            }
            String n = RX_Token.blockNames[i];
            if (n.equals("Specials")) {
                block.addRange(0xfff0, 0xfffd);
            }
            if (n.equals("Private Use")) {
                block.addRange(0xF0000, 0xFFFFD);
                block.addRange(0x100000, 0x10FFFD);
            }
            RX_Token.putCategory(n, block);
            // Alias is "Is" followed by the block name with spaces removed.
            RX_Token.setAlias("Is" + n.replace(" ", ""), n, true);
        }
    }

    /**
     * Registers the non-XML-Schema names: ASSIGNED/UNASSIGNED/ALL, the
     * {@code IsXxx} classes, and their lower-case short aliases.
     *
     * @param ranges per-category ranges as returned by
     *               {@code buildGeneralCategoryRanges()}
     */
    private static void registerPosixClasses(RX_Token[] ranges) {
        // TR#18 1.2
        RX_Token.setAlias("ASSIGNED", "Cn", false);
        RX_Token.setAlias("UNASSIGNED", "Cn", true);
        RX_Token all = RX_Token.createRange();
        all.addRange(0, RX_Token.UTF16_MAX);
        RX_Token.putCategory("ALL", all);
        RX_Token.registerNonXS("ASSIGNED");
        RX_Token.registerNonXS("UNASSIGNED");
        RX_Token.registerNonXS("ALL");

        RX_Token isalpha = RX_Token.createRange();
        isalpha.mergeRanges(ranges[Character.UPPERCASE_LETTER]); // Lu
        isalpha.mergeRanges(ranges[Character.LOWERCASE_LETTER]); // Ll
        isalpha.mergeRanges(ranges[Character.OTHER_LETTER]); // Lo
        RX_Token.putCategory("IsAlpha", isalpha);
        RX_Token.registerNonXS("IsAlpha");

        RX_Token isalnum = RX_Token.createRange();
        isalnum.mergeRanges(isalpha);   // Lu Ll Lo
        isalnum.mergeRanges(ranges[Character.DECIMAL_DIGIT_NUMBER]); // Nd
        RX_Token.putCategory("IsAlnum", isalnum);
        RX_Token.registerNonXS("IsAlnum");

        RX_Token isspace = RX_Token.createRange();
        isspace.mergeRanges(RX_Token.token_spaces);
        isspace.mergeRanges(ranges[CHAR_SEPARATOR]); // Z
        RX_Token.putCategory("IsSpace", isspace);
        RX_Token.registerNonXS("IsSpace");

        RX_Token isword = RX_Token.createRange();
        isword.mergeRanges(isalnum);     // Lu Ll Lo Nd
        isword.addRange('_', '_');
        RX_Token.putCategory("IsWord", isword);
        RX_Token.registerNonXS("IsWord");

        RX_Token isascii = RX_Token.createRange();
        isascii.addRange(0, 127);
        RX_Token.putCategory("IsASCII", isascii);
        RX_Token.registerNonXS("IsASCII");

        // IsGraph is built as its negation (category C plus the space
        // character), so the complement is the positive entry here.
        RX_Token isnotgraph = RX_Token.createRange();
        isnotgraph.mergeRanges(ranges[CHAR_OTHER]);
        isnotgraph.addRange(' ', ' ');
        RX_Token.categories.put("IsGraph", RX_Token.complementRanges(isnotgraph));
        RX_Token.categories2.put("IsGraph", isnotgraph);
        RX_Token.registerNonXS("IsGraph");

        // isxdigit holds the hex digits themselves, so it is the positive
        // entry and its complement the negative one. These were previously
        // stored the wrong way round.
        RX_Token isxdigit = RX_Token.createRange();
        isxdigit.addRange('0', '9');
        isxdigit.addRange('A', 'F');
        isxdigit.addRange('a', 'f');
        RX_Token.putCategory("IsXDigit", isxdigit);
        RX_Token.registerNonXS("IsXDigit");

        RX_Token.setAlias("IsDigit", "Nd", true);
        RX_Token.setAlias("IsUpper", "Lu", true);
        RX_Token.setAlias("IsLower", "Ll", true);
        RX_Token.setAlias("IsCntrl", "C", true);
        RX_Token.setAlias("IsPrint", "C", false);   // printable = not in C
        RX_Token.setAlias("IsPunct", "P", true);
        RX_Token.registerNonXS("IsDigit");
        RX_Token.registerNonXS("IsUpper");
        RX_Token.registerNonXS("IsLower");
        RX_Token.registerNonXS("IsCntrl");
        RX_Token.registerNonXS("IsPrint");
        RX_Token.registerNonXS("IsPunct");

        // Short alias -> IsXxx class it stands for.
        String[][] shortAliases = {
            { "alpha",  "IsAlpha"  },
            { "alnum",  "IsAlnum"  },
            { "ascii",  "IsASCII"  },
            { "cntrl",  "IsCntrl"  },
            { "digit",  "IsDigit"  },
            { "graph",  "IsGraph"  },
            { "lower",  "IsLower"  },
            { "print",  "IsPrint"  },
            { "punct",  "IsPunct"  },
            { "space",  "IsSpace"  },
            { "upper",  "IsUpper"  },
            { "word",   "IsWord"   }, // Perl extension
            { "xdigit", "IsXDigit" },
        };
        for (int i = 0;  i < shortAliases.length;  i ++) {
            RX_Token.setAlias(shortAliases[i][0], shortAliases[i][1], true);
        }
        for (int i = 0;  i < shortAliases.length;  i ++) {
            RX_Token.registerNonXS(shortAliases[i][0]);
        }
    }