in jena-langtag/src/main/java/org/apache/jena/langtag/LangTagRFC5646.java [234:561]
/**
 * Parse a language tag string against the RFC 5646 "langtag" grammar and record
 * the character offsets of each component in a new {@code LangTagRFC5646}.
 * <p>
 * The components are recognised in grammar order: grandfathered tags, a private-use
 * tag in the language position ("x-..."), then language (with optional extlang),
 * script, region, variants, extensions and finally the private-use section.
 * Each component is stored as a pair of offsets into {@code string}
 * (for example {@code language0}/{@code language1}).
 * <p>
 * Problems are reported by calling {@code InternalLangTag.error(...)}; the control
 * flow of this method assumes that call does not return normally.
 *
 * @param string the language tag to parse
 * @return the object holding the component offsets found in {@code string}
 */
private static LangTagRFC5646 parser(String string) {
    // A segment is a sequence of A2ZN characters separated by '-'.
    LangTagRFC5646 langtag = new LangTagRFC5646(string);
    final int N = string.length();

    // @formatter:off
    //     langtag       = language
    //                     ["-" script]
    //                     ["-" region]
    //                     *("-" variant)
    //                     *("-" extension)
    //                     ["-" privateuse]
    //
    //     language      = 2*3ALPHA            ; shortest ISO 639 code
    //                     ["-" extlang]       ; sometimes followed by
    //                                         ; extended language subtags
    //                   / 4ALPHA              ; or reserved for future use
    //                   / 5*8ALPHA            ; or registered language subtag
    //
    //     extlang       = 3ALPHA              ; selected ISO 639 codes
    //                     *2("-" 3ALPHA)      ; permanently reserved
    //
    //     script        = 4ALPHA              ; ISO 15924 code
    //
    //     region        = 2ALPHA              ; ISO 3166-1 code
    //                   / 3DIGIT              ; UN M.49 code
    //
    //     variant       = 5*8alphanum         ; registered variants
    //                   / (DIGIT 3alphanum)
    //
    //     extension     = singleton 1*("-" (2*8alphanum))
    //
    //                                         ; Single alphanumerics
    //                                         ; "x" reserved for private use
    //     singleton     = DIGIT               ; 0 - 9
    //                   / %x41-57             ; A - W
    //                   / %x59-5A             ; Y - Z
    //                   / %x61-77             ; a - w
    //                   / %x79-7A             ; y - z
    //
    //     privateuse    = "x" 1*("-" (1*8alphanum))
    // @formatter:on

    if ( N == 0 )
        InternalLangTag.error("Empty string");

    // -------------------
    // Grandfathered
    // Must check first because the whole string (except "en-GB-oed") is the "language"
    if ( grandfathered(string) ) {
        // Regular:
        // "each tag, in its entirety, represents a language or collection of languages."
        //
        // Irregular:
        // With the exception of "en-GB-oed", which is a
        // variant of "en-GB", each of them, in its entirety,
        // represents a language.
        langtag.language0 = 0;
        langtag.language1 = N;
        langtag.isGrandfathered = true;
        // Exception.
        if ( string.equalsIgnoreCase("en-GB-oed") ) {
            // "oed" is "Oxford English Dictionary spelling"
            // Better is the replacement "en-GB-oxendict"
            langtag.language0 = 0;
            langtag.language1 = 2;
            langtag.region0 = 3;
            langtag.region1 = 5;
            // Non-standard variant.
            langtag.variant0 = 6;
            langtag.variant1 = N;
        }
        return langtag;
    }

    // -- language
    int idx = 0;
    int idx2 = segmentNextFinish(string, N, idx);
    int segLen = segmentLength(N, idx, idx2);

    // Private use in the language position.
    if ( segLen == 1 ) {
        if ( string.startsWith("x-") || string.startsWith("X-") ) {
            /*
               The primary language subtag is the first subtag in a language tag and
               cannot be omitted, with two exceptions:

               o  The single-character subtag 'x' as the primary subtag indicates
                  that the language tag consists solely of subtags whose meaning is
                  defined by private agreement.  For example, in the tag "x-fr-CH",
                  the subtags 'fr' and 'CH' do not represent the French language or
                  the country of Switzerland (or any other value in the IANA
                  registry) unless there is a private agreement in place to do so.
                  See Section 4.6.
             */
            langtag.isPrivateUseLanguage = true;
            int idxPrivateUseStart = 0;
            int idxPrivateUseEnd = maybeSubtags(string, N, idxPrivateUseStart+segLen, 1, 8);
            langtag.privateuse0 = idxPrivateUseStart;
            langtag.privateuse1 = idxPrivateUseEnd;
            if ( langtag.privateuse1 < N )
                InternalLangTag.error("Trailing characters in private langtag: '%s'", string.substring(langtag.privateuse1));
            return langtag;
        }
        // else
        InternalLangTag.error("Language part is 1 character: it must be 2-3 characters (4-8 reserved for future use), \"x-\", or a recognized grandfathered tag");
    }

    if ( segLen > 8 )
        InternalLangTag.error("Language too long (2-3 characters, 4-8 reserved for future use)");

    if ( idx2 < 0 ) {
        // language only.
        langtag.language0 = 0;
        langtag.language1 = N;
        InternalLangTag.checkAlpha(string, N, langtag.language0, langtag.language1);
        return langtag;
    }

    if ( idx == idx2 )
        InternalLangTag.error("Can not find the language subtag: '%s'", string);

    langtag.language0 = idx;

    if ( segLen == 2 || segLen == 3 ) {
        // -- Language extension subtags
        //     language      = 2*3ALPHA            ; shortest ISO 639 code
        //                     ["-" extlang]
        //     extlang       = 3ALPHA              ; selected ISO 639 codes
        //                     *2("-" 3ALPHA)      ; permanently reserved
        int extStart = idx+segLen;
        InternalLangTag.checkAlpha(string, N, langtag.language0, extStart);
        // Extensions are 1 to 3 3ALPHA subtags
        int extEnd = maybeSubtags(string, N, extStart, 3, 3);
        if ( extEnd > extStart ) {
            idx2 = extEnd;
            // NOTE(review): langtag.language1 has not been assigned by this method at
            // this point, so the range checked here ends at whatever value the
            // constructor left in it - the intended end is presumably extEnd.
            // Left unchanged because the right fix depends on how maybeSubtags and
            // checkAlphaMinus treat the range; confirm before changing.
            InternalLangTag.checkAlphaMinus(string, N, extStart, langtag.language1);
        }
    } else if ( segLen >= 4 && segLen <= 8 ) {
        //     / 4ALPHA              ; or reserved for future use
        //     / 5*8ALPHA            ; or registered language subtag
        // Dubious.
        InternalLangTag.checkAlpha(string, N, langtag.language0, idx2);
    } else {
        InternalLangTag.error("Language too long (2-3 characters, 4-8 reserved for future use)");
    }

    langtag.language1 = idx2;
    // Info
    noteSegment("language", string, langtag.language0, langtag.language1);

    // Move on - next subtag
    idx = segmentNextStart(N, idx, idx2);
    idx2 = segmentNextFinish(string, N, idx);
    segLen = segmentLength(N, idx, idx2);
    // -- End language

    // ---- script
    // script        = 4ALPHA              ; ISO 15924 code
    if ( segLen == 4 && InternalLangTag.isAlpha(string.charAt(idx)) ) {
        // Script
        // Not a digit - which is a variant.
        //   variant = ... / (DIGIT 3alphanum)
        langtag.script0 = idx;
        langtag.script1 = idx+segLen;
        InternalLangTag.checkAlpha(string, N, langtag.script0, langtag.script1);
        noteSegment("script", string, langtag.script0, langtag.script1);

        // Move on.
        idx = segmentNextStart(N, idx, idx2);
        idx2 = segmentNextFinish(string, N, idx);
        segLen = segmentLength(N, idx, idx2);
    }
    // -- End script

    // ---- region
    // region        = 2ALPHA              ; ISO 3166-1 code
    //               / 3DIGIT              ; UN M.49 code
    if ( segLen == 2 || segLen == 3 ) {
        // Region
        langtag.region0 = idx;
        langtag.region1 = idx+segLen;
        if ( segLen == 2 )
            InternalLangTag.checkAlpha(string, N, langtag.region0, langtag.region1);
        else
            InternalLangTag.checkDigits(string, N, langtag.region0, langtag.region1);
        noteSegment("region", string, langtag.region0, langtag.region1);

        // Move on.
        idx = segmentNextStart(N, idx, idx2);
        idx2 = segmentNextFinish(string, N, idx);
        segLen = segmentLength(N, idx, idx2);
    }
    // -- End region

    // ---- variant
    // variant       = 5*8alphanum         ; registered variants
    //               / (DIGIT 3alphanum)
    for ( ;; ) {
        boolean isVariant;
        if ( segLen >= 5 && segLen <= 8 ) {
            // variant 5*8alphanum
            isVariant = true;
        } else if ( segLen == 4 ) {
            // variant DIGIT 3alphanum : a 4 character variant must start with a digit.
            char ch = string.charAt(idx);
            isVariant = ( ch >= '0' && ch <= '9' );
        } else {
            isVariant = false;
        }
        // A 4 character segment starting with a letter is not a variant.
        // Leave the loop with idx still on that segment so that it is
        // reported by the "Trailing characters" check at the end.
        if ( ! isVariant )
            break;

        // All variants are recorded as one span: first start to last finish.
        if ( langtag.variant0 == -1 )
            langtag.variant0 = idx;
        langtag.variant1 = idx+segLen;
        InternalLangTag.checkAlphaNum(string, N, idx, langtag.variant1);
        noteSegment("variant", string, langtag.variant0, langtag.variant1);

        // Move on.
        idx = segmentNextStart(N, idx, idx2);
        idx2 = segmentNextFinish(string, N, idx);
        segLen = segmentLength(N, idx, idx2);
    }
    // -- End variant

    // ---- extension and private use
    // extension     = singleton 1*("-" (2*8alphanum))
    // privateuse    = "x" 1*("-" (1*8alphanum))
    boolean inPrivateUseSubtag = false;
    // Singletons seen so far; allocated only when the first extension is found.
    Set<Character> extSingletons = null;
    while ( segLen == 1 ) {
        char singleton = string.charAt(idx);
        if ( singleton == 'x' || singleton == 'X' ) {
            inPrivateUseSubtag = true;
            break;
        }
        if ( extSingletons == null )
            extSingletons = new HashSet<>();
        // Language tags are case-insensitive, so 'a' and 'A' are the same singleton.
        boolean newEntry = extSingletons.add(Character.toLowerCase(singleton));
        if ( ! newEntry )
            InternalLangTag.error("Duplicate extension singleton: '"+singleton+"'");

        // All extensions are recorded as one span: first start to last finish.
        if ( langtag.extension0 == -1 )
            langtag.extension0 = idx;
        // Extension.
        // 2*8 alphanum
        int idxExtStart = idx+segLen;
        int idxEndExtra = maybeSubtags(string, N, idxExtStart, 2, 8);

        // Expecting at least one subtag.
        if ( idxExtStart == idxEndExtra )
            InternalLangTag.error("Ill-formed extension");

        if ( idxEndExtra > idxExtStart )
            idx2 = idxEndExtra;
        langtag.extension1 = idx2;
        InternalLangTag.checkAlphaNumMinus(string, N, langtag.extension0, langtag.extension1);
        noteSegment("extension", string, langtag.extension0, langtag.extension1);

        // Move on.
        idx = segmentNextStart(N, idx, idx2);
        idx2 = segmentNextFinish(string, N, idx);
        segLen = segmentLength(N, idx, idx2);
        if ( segLen == 0 )
            InternalLangTag.error("Ill-formed extension. Trailing dash.");
    }

    // ---- private use
    if ( inPrivateUseSubtag ) {
        langtag.privateuse0 = idx;
        // privateuse    = "x" 1*("-" (1*8alphanum))
        int idxPrivateUseStart = idx+segLen;
        int idxPrivateUseEnd = maybeSubtags(string, N, idxPrivateUseStart, 1, 8);

        // Expecting at least one subtag.
        if ( idxPrivateUseStart == idxPrivateUseEnd )
            InternalLangTag.error("Ill-formed private use component");

        if ( idxPrivateUseEnd > idxPrivateUseStart )
            idx2 = idxPrivateUseEnd;
        langtag.privateuse1 = idx2;
        InternalLangTag.checkAlphaNumMinus(string, N, langtag.privateuse0, langtag.privateuse1);
        noteSegment("private use", string, langtag.privateuse0, langtag.privateuse1);

        // Private use runs to end of string. But do checking.
        // Move on.
        idx = segmentNextStart(N, idx, idx2);
        idx2 = segmentNextFinish(string, N, idx);
        segLen = segmentLength(N, idx, idx2);
        if ( segLen == 0 )
            InternalLangTag.error("Ill-formed private use subtag. Trailing dash.");
    }
    // -- End extension and privateuse

    // Did we process everything? No segment: idx == -1 idx2 == -1 seglen == -1
    if ( idx != -1 && idx < N )
        InternalLangTag.error("Trailing characters: '%s'", string.substring(idx));
    if ( idx2 >= 0 )
        InternalLangTag.error("Bad string: '%s'", string);
    return langtag;
}