private static LangTagRFC5646 parser()

in jena-langtag/src/main/java/org/apache/jena/langtag/LangTagRFC5646.java [234:561]


    /**
     * Parse a language tag string following the RFC 5646 "langtag" grammar and
     * record where each component sits in the string.
     * <p>
     * The result does not hold copies of the subtags. For each component
     * (language, script, region, variant, extension, private use) a pair of
     * indexes into {@code string} is stored: the start index and the index one
     * past the end. A start index of {@code -1} is treated in this method as
     * "not set yet" (see the variant and extension handling).
     * <p>
     * Processing order:
     * <ol>
     * <li>grandfathered tags - the whole string is the language, except
     *     "en-GB-oed" which is split into language, region and variant;</li>
     * <li>a tag starting "x-" - the whole tag is private use;</li>
     * <li>language (with any 3-letter extended language subtags), then the
     *     optional script, optional region, any variants, any extensions and
     *     finally an optional private use section.</li>
     * </ol>
     * Ill-formed input is reported by calling {@code InternalLangTag.error}
     * (the code is written on the assumption that this call does not return
     * normally - presumably it throws; confirm against {@code InternalLangTag}).
     *
     * @param string the language tag to parse
     * @return the object holding the component offsets for {@code string}
     */
    private static LangTagRFC5646 parser(String string) {

        // A segment is a sequence of A2ZN characters separated by '-'.

        LangTagRFC5646 langtag = new LangTagRFC5646(string);
        final int N = string.length();
        // @formatter:off
        //         langtag       = language
        //                         ["-" script]
        //                         ["-" region]
        //                         *("-" variant)
        //                         *("-" extension)
        //                         ["-" privateuse]
        //
        //         language      = 2*3ALPHA            ; shortest ISO 639 code
        //                         ["-" extlang]       ; sometimes followed by
        //                                             ; extended language subtags
        //                       / 4ALPHA              ; or reserved for future use
        //                       / 5*8ALPHA            ; or registered language subtag
        //
        //         extlang       = 3ALPHA              ; selected ISO 639 codes
        //                         *2("-" 3ALPHA)      ; permanently reserved
        //
        //         script        = 4ALPHA              ; ISO 15924 code
        //
        //         region        = 2ALPHA              ; ISO 3166-1 code
        //                       / 3DIGIT              ; UN M.49 code
        //
        //         variant       = 5*8alphanum         ; registered variants
        //                       / (DIGIT 3alphanum)
        //
        //         extension     = singleton 1*("-" (2*8alphanum))
        //
        //                                             ; Single alphanumerics
        //                                             ; "x" reserved for private use
        //         singleton     = DIGIT               ; 0 - 9
        //                       / %x41-57             ; A - W
        //                       / %x59-5A             ; Y - Z
        //                       / %x61-77             ; a - w
        //                       / %x79-7A             ; y - z
        //
        //         privateuse    = "x" 1*("-" (1*8alphanum))
        // @formatter:on

        if ( N == 0 )
            InternalLangTag.error("Empty string");

        // -------------------
        // language      = (2*3ALPHA [ extlang ]); shortest ISO 639 code
        //               / 4ALPHA                ; reserved for future use
        //               / 5*8ALPHA              ; registered language subtag
        // extlang       = 3ALPHA              ; selected ISO 639 codes
        //                 *2("-" 3ALPHA)      ; permanently reserved

        // Grandfathered
        // Must check first because the whole string (except "en-GB-oed") is the "language"

        if ( grandfathered(string) ) {
            // Regular:
            // "each tag, in its entirety, represents a language or collection of languages."
            //
            // Irregular:
            // With the exception of "en-GB-oed", which is a
            // variant of "en-GB", each of them, in its entirety,
            // represents a language.
            //
            langtag.language0 = 0;
            langtag.language1 = N;
            langtag.isGrandfathered = true;
            // Exception.
            if ( string.equalsIgnoreCase("en-GB-oed") ) {
                // "oed" is "Oxford English Dictionary spelling"
                // Better is the replacement "en-GB-oxendict"
                // Fixed offsets: "en" = [0,2), "GB" = [3,5), "oed" = [6,N).
                langtag.language0 = 0;
                langtag.language1 = 2;
                langtag.region0 = 3;
                langtag.region1 = 5;
                // Non-standard variant.
                langtag.variant0 = 6;
                langtag.variant1 = N;
            }
            return langtag;
        }

        // -- language

        // idx is the start of the current segment, idx2 its finish as reported by
        // segmentNextFinish (tested below with "< 0" for "no further separator"),
        // and segLen the length of the current segment.
        int idx = 0;
        int idx2 = segmentNextFinish(string, N, idx);
        int segLen = segmentLength(N, idx, idx2);

        // Private use in the language position.
        if ( segLen == 1 ) {
            if ( string.startsWith("x-") || string.startsWith("X-") ) {
                /*
                The primary language subtag is the first subtag in a language tag and
                cannot be omitted, with two exceptions:

                o  The single-character subtag 'x' as the primary subtag indicates
                   that the language tag consists solely of subtags whose meaning is
                   defined by private agreement.  For example, in the tag "x-fr-CH",
                   the subtags 'fr' and 'CH' do not represent the French language or
                   the country of Switzerland (or any other value in the IANA
                   registry) unless there is a private agreement in place to do so.
                   See Section 4.6.
                */
                langtag.isPrivateUseLanguage = true;
                int idxPrivateUseStart = 0;
                int idxPrivateUseEnd = maybeSubtags(string, N, idxPrivateUseStart+segLen, 1, 8);
                langtag.privateuse0 = idxPrivateUseStart;
                langtag.privateuse1 = idxPrivateUseEnd;
                // Anything not consumed as 1*8 subtags makes the tag ill-formed.
                if ( langtag.privateuse1 < N )
                    InternalLangTag.error("Trailing characters in private langtag: '%s'", string.substring(langtag.privateuse1));
                return langtag;
            }
            // else
            InternalLangTag.error("Language part is 1 character: it must be 2-3 characters (4-8 reserved for future use), \"x-\", or a recognized grandfathered tag");
        }

        if ( segLen > 8 )
            InternalLangTag.error("Language too long (2-3 characters, 4-8 reserved for future use)");

        if ( idx2 < 0 ) {
            // language only.
            langtag.language0 = 0;
            langtag.language1 = N;
            InternalLangTag.checkAlpha(string, N, langtag.language0, langtag.language1);
            return langtag;
        }

        // The first segment is empty, i.e. the string starts with the separator.
        if ( idx == idx2 )
            InternalLangTag.error("Can not find the language subtag: '%s'", string);

        langtag.language0 = idx;

        if ( segLen == 2 || segLen == 3 ) {
            // -- Language extension subtags/
//            language      = 2*3ALPHA            ; shortest ISO 639 code
//                            ["-" extlang]
//            extlang       = 3ALPHA              ; selected ISO 639 codes
//                            *2("-" 3ALPHA)      ; permanently reserved
            int extStart = idx+segLen;
            InternalLangTag.checkAlpha(string, N, langtag.language0, extStart);
            // Extensions are 1 to 3 3ALPHA subtags
            int extEnd = maybeSubtags(string, N, extStart, 3, 3);
            if ( extEnd > extStart ) {
                // The language component now also covers the extlang subtags.
                idx2 = extEnd;
                // NOTE(review): langtag.language1 has not been assigned on this path
                // yet (it is set after this if/else), so the range passed here is
                // probably not the extlang span; "extEnd" looks like the intended
                // bound. Confirm how maybeSubtags treats digit segments (e.g. a
                // 3DIGIT region directly after the language) before changing it.
                InternalLangTag.checkAlphaMinus(string, N, extStart, langtag.language1);
            }
        } else if ( segLen >= 4 && segLen <= 8 ) {
            //                       / 4ALPHA              ; or reserved for future use
            //                       / 5*8ALPHA            ; or registered language subtag
            // Dubious.
            InternalLangTag.checkAlpha(string, N, langtag.language0, idx2);
        } else {
            InternalLangTag.error("Language too long (2-3 characters, 4-8 reserved for future use)");
        }

        langtag.language1 = idx2;
        // Info
        noteSegment("language", string, langtag.language0, langtag.language1);

        // Move on - next subtag
        idx = segmentNextStart(N, idx, idx2);
        idx2 = segmentNextFinish(string, N, idx);
        segLen = segmentLength(N, idx, idx2);
        // -- End langtag

        // ---- script
        // script        = 4ALPHA              ; ISO 15924 code
        if ( segLen == 4 && InternalLangTag.isAlpha(string.charAt(idx)) ) {
            // Script
            // Not a digit - which is a variant.
            // variant       = ... / (DIGIT 3alphanum)
            langtag.script0 = idx;
            langtag.script1 = idx+segLen;
            InternalLangTag.checkAlpha(string, N, langtag.script0, langtag.script1);
            noteSegment("script", string, langtag.script0, langtag.script1);

            // Move on.
            idx = segmentNextStart(N, idx, idx2);
            idx2 = segmentNextFinish(string, N, idx);
            segLen = segmentLength(N, idx, idx2);
        }
        // -- End script

        // ---- region
        // region        = 2ALPHA              ; ISO 3166-1 code
        //               / 3DIGIT              ; UN M.49 code
        if ( segLen == 2 || segLen == 3 ) {
            // Region
            langtag.region0 = idx;
            langtag.region1 = idx+segLen;
            if ( segLen == 2 )
                InternalLangTag.checkAlpha(string, N, langtag.region0, langtag.region1);
            else
                InternalLangTag.checkDigits(string, N, langtag.region0, langtag.region1);
            noteSegment("region", string, langtag.region0, langtag.region1);

            // Move on.
            idx = segmentNextStart(N, idx, idx2);
            idx2 = segmentNextFinish(string, N, idx);
            segLen = segmentLength(N, idx, idx2);
        }
        // -- End region

        // ---- variant
        // variant       = 5*8alphanum         ; registered variants
        //               / (DIGIT 3alphanum)
        // Consecutive variants are recorded as one span: variant0 is set by the
        // first one only, variant1 is advanced by each.
        for ( ;; ) {
            if ( segLen >= 5 && segLen <= 8) {
                // variant 5*8alphanum
                if ( langtag.variant0 == -1 )
                    langtag.variant0 = idx;
                langtag.variant1 = idx+segLen;
                InternalLangTag.checkAlphaNum(string, N, idx, langtag.variant1);
                noteSegment("variant", string, langtag.variant0, langtag.variant1);
                // Move on.
                idx = segmentNextStart(N, idx, idx2);
                idx2 = segmentNextFinish(string, N, idx);
                segLen = segmentLength(N, idx, idx2);
                continue;
            }

            if ( segLen == 4 ) {
                // variant
                // DIGIT 3alphanum
                char ch = string.charAt(idx);
                // A 4 character variant must start with a digit. Anything else at
                // this point (e.g. a second 4ALPHA segment) is not a variant: stop
                // without consuming it so the trailing-characters check at the end
                // of the method rejects the tag.
                if ( ch < '0' || ch > '9' )
                    break;
                if ( langtag.variant0 == -1 )
                    langtag.variant0 = idx;
                langtag.variant1 = idx+segLen;
                InternalLangTag.checkAlphaNum(string, N, idx, langtag.variant1);
                noteSegment("variant", string, langtag.variant0, langtag.variant1);
                // Move on.
                idx = segmentNextStart(N, idx, idx2);
                idx2 = segmentNextFinish(string, N, idx);
                segLen = segmentLength(N, idx, idx2);
                continue;
            }
            break;
        }
        // -- End variant

        // ---- extension and private use
        // extension     = singleton 1*("-" (2*8alphanum))
        // privateuse    = "x" 1*("-" (1*8alphanum))
        boolean inPrivateUseSubtag = false;
        // Singletons seen so far (lower case); allocated on first use.
        Set<Character> extSingletons = null;
        while ( segLen == 1 ) {
            char singleton = string.charAt(idx);
            if ( singleton == 'x' || singleton == 'X' ) {
                inPrivateUseSubtag = true;
                break;
            }
            // Language tags are case-insensitive, so "a" and "A" are the same
            // singleton for the purposes of the no-repeat rule.
            char singletonKey = Character.toLowerCase(singleton);
            if ( extSingletons == null )
                extSingletons = new HashSet<>();
            if ( ! extSingletons.add(singletonKey) )
                InternalLangTag.error("Duplicate extension singleton: '"+singleton+"'");

            // All extensions are recorded as one span starting at the first singleton.
            if ( langtag.extension0 == -1 )
                langtag.extension0 = idx;
            // Extension.
            // 2*8 alphanum
            int idxExtStart = idx+segLen;
            int idxEndExtra = maybeSubtags(string, N, idxExtStart, 2, 8);

            // Expecting at least one subtag.
            if ( idxExtStart == idxEndExtra )
                InternalLangTag.error("Ill-formed extension");

            if ( idxEndExtra > idxExtStart )
                idx2 = idxEndExtra;
            langtag.extension1 = idx2;
            InternalLangTag.checkAlphaNumMinus(string, N, langtag.extension0, langtag.extension1);

            noteSegment("extension", string, langtag.extension0, langtag.extension1);
            // Move on.
            idx = segmentNextStart(N, idx, idx2);
            idx2 = segmentNextFinish(string, N, idx);
            segLen = segmentLength(N, idx, idx2);
            if ( segLen == 0 )
                InternalLangTag.error("Ill-formed extension. Trailing dash.");
        }

        // ---- private use
        if ( inPrivateUseSubtag ) {
            langtag.privateuse0 = idx;
            // privateuse    = "x" 1*("-" (1*8alphanum))
            int idxPrivateUseStart = idx+segLen;
            int idxPrivateUseEnd = maybeSubtags(string, N, idxPrivateUseStart, 1, 8);

            // Expecting at least one subtag.
            if ( idxPrivateUseStart == idxPrivateUseEnd )
                InternalLangTag.error("Ill-formed private use component");

            if ( idxPrivateUseEnd > idxPrivateUseStart )
                idx2 = idxPrivateUseEnd;
            langtag.privateuse1 = idx2;
            InternalLangTag.checkAlphaNumMinus(string, N, langtag.privateuse0, langtag.privateuse1);

            noteSegment("private use", string, langtag.privateuse0, langtag.privateuse1);
            // Private use runs to end of string. But do checking.
            // Move on.
            idx = segmentNextStart(N, idx, idx2);
            idx2 = segmentNextFinish(string, N, idx);
            segLen = segmentLength(N, idx, idx2);
            if ( segLen == 0 )
                InternalLangTag.error("Ill-formed private use subtag. Trailing dash.");
        }

        // -- End extension and privateuse

        // Did we process everything? No segment: idx == -1 idx2 == -1  seglen == -1

        // A segment that none of the sections above consumed is left at idx.
        if ( idx != -1 && idx < N )
            InternalLangTag.error("Trailing characters: '%s'", string.substring(idx));
        if ( idx2 >= 0 )
            InternalLangTag.error("Bad string: '%s'", string);
        return langtag;
    }