java/de/jflex/ucd_generator/ucd/UnicodeData.java (230 lines of code) (raw):

/*
 * Copyright (C) 2009-2013 Steve Rowe <sarowe@gmail.com>
 * Copyright (C) 2019-2020 Google, LLC.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 */
package de.jflex.ucd_generator.ucd;

import static com.google.common.base.Preconditions.checkState;
import static de.jflex.ucd_generator.ucd.PropertyNames.NORMALIZED_GENERAL_CATEGORY;
import static de.jflex.ucd_generator.ucd.PropertyNames.NORMALIZED_SCRIPT;
import static java.util.Arrays.asList;

import com.google.common.collect.ImmutableCollection;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMultimap;
import com.google.common.collect.ImmutableSortedMap;
import de.jflex.ucd.CodepointRange;
import de.jflex.ucd.Versions;
import de.jflex.version.Version;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.SortedSet;

/**
 * Collects, for the Unicode version given at construction, the property name aliases, the
 * property value aliases, the code point ranges of each property (value) and the caseless
 * matches.
 *
 * <p>Most methods are thin delegations to {@link PropertyNames}, {@link PropertyValues}, {@link
 * PropertyValueIntervals} and {@link CaselessMatches}.
 */
public class UnicodeData {

  private final PropertyNames propertyNames = new PropertyNames();
  private final PropertyValues propertyValues = new PropertyValues();

  /** Maps Unicode property values to the associated set of code point ranges. */
  private final PropertyValueIntervals propertyValueIntervals =
      new PropertyValueIntervals(propertyValues);

  private final CaselessMatches caselessMatches = new CaselessMatches();
  private int maximumCodePoint;
  private final Version version;

  /**
   * Creates an empty data holder.
   *
   * @param version the Unicode version this data belongs to.
   */
  public UnicodeData(Version version) {
    this.version = version;
  }

  /** Records the case mappings of {@code codePoint} in the caseless matches. */
  public void addCaselessMatches(
      int codePoint, String uppercaseMapping, String lowercaseMapping, String titlecaseMapping) {
    caselessMatches.addCaselessMatches(
        codePoint, uppercaseMapping, lowercaseMapping, titlecaseMapping);
  }

  /** Sets the maximum code point, which is used as upper bound when building {@code graph}. */
  public void maximumCodePoint(int maximumCodePoint) {
    this.maximumCodePoint = maximumCodePoint;
  }

  /** Returns the Unicode version given at construction. */
  public Version version() {
    return version;
  }

  /** Returns the canonical name that {@link PropertyNames} associates with the given alias. */
  public String getCanonicalPropertyName(String propertyAlias) {
    return propertyNames.getCanonicalPropertyName(propertyAlias);
  }

  /**
   * Registers {@code alias} as an alias of the property {@code normalizedLongName}.
   *
   * <p>Note that the underlying {@code putPropertyAlias} takes its arguments in the opposite
   * order (long name first).
   */
  public void addPropertyAlias(String alias, String normalizedLongName) {
    propertyNames.putPropertyAlias(normalizedLongName, alias);
  }

  /** Registers {@code aliases} for the given value of the given property. */
  public void addPropertyValueAliases(
      String normalizedPropertyName, String normalizedPropertyValue, Set<String> aliases) {
    propertyValues.addPropertyValueAliases(
        normalizedPropertyName, normalizedPropertyValue, aliases);
  }

  /** Returns the aliases known for the value {@code propValue} of property {@code propName}. */
  public Collection<String> getPropertyValueAliases(String propName, String propValue) {
    return propertyValues.getPropertyValueAliases(propName, propValue);
  }

  /**
   * Copies the value aliases of {@code sourceProperty} to {@code destProperty}, then registers
   * {@code destProperty} as an alias of itself.
   */
  public void copyPropertyValueAliases(String sourceProperty, String destProperty) {
    propertyValues.copyPropertyValueAliases(sourceProperty, destProperty);
    propertyNames.putPropertyAlias(destProperty, destProperty);
  }

  /** Returns the aliases known for the property {@code propName}. */
  public Collection<String> getPropertyAliases(String propName) {
    return propertyNames.getPropertyAliases(propName);
  }

  /**
   * Adds the range {@code [start, end]} to a binary property.
   *
   * @param propertyName name or alias of the property; it is canonicalized before being stored.
   */
  public void addBinaryPropertyInterval(String propertyName, int start, int end) {
    String canonicalName = propertyNames.getCanonicalPropertyName(propertyName);
    propertyValueIntervals.addBinaryPropertyInterval(canonicalName, start, end);
  }

  /** Same as {@link #addBinaryPropertyInterval(String, int, int)}, using the range bounds. */
  public void addBinaryPropertyInterval(String propertyName, CodepointRange interval) {
    addBinaryPropertyInterval(propertyName, interval.start(), interval.end());
  }

  /**
   * Adds the range {@code [start, end]} to the value {@code propValue} of an enumerated property.
   *
   * @param propName name or alias of the property; it is canonicalized before being stored.
   */
  public void addEnumPropertyInterval(String propName, String propValue, int start, int end) {
    String canonicalName = propertyNames.getCanonicalPropertyName(propName);
    propertyValueIntervals.addEnumPropertyInterval(canonicalName, propValue, start, end);
  }

  /** Returns the {@code usedBinaryProperties} set held by the property value intervals. */
  public Set<String> usedBinaryProperties() {
    return propertyValueIntervals.usedBinaryProperties;
  }

  /** Returns the used enumerated properties, as a multimap from property name to values. */
  public ImmutableMultimap<String, String> usedEnumeratedProperties() {
    return propertyValueIntervals.usedEnumeratedProperties();
  }

  /** Returns whether {@code category} is reported as a used enumerated property. */
  public boolean hasUsedEnumeratedProperty(String category) {
    return propertyValueIntervals.hasUsedEnumeratedProperty(category);
  }

  /**
   * Returns the code point ranges of a property.
   *
   * @param propName name or alias of the property; it is canonicalized before the lookup.
   */
  public ImmutableList<CodepointRange> getPropertyValueIntervals(String propName) {
    String canonicalName = propertyNames.getCanonicalPropertyName(propName);
    return propertyValueIntervals.getRanges(canonicalName);
  }

  /** Returns the maximum code point previously set with {@link #maximumCodePoint(int)}. */
  public int maximumCodePoint() {
    return maximumCodePoint;
  }

  /** Returns the code point ranges by property, sorted by the natural order of the keys. */
  public ImmutableSortedMap<String, CodepointRangeSet> intervals() {
    ImmutableSortedMap.Builder<String, CodepointRangeSet> byProperty =
        ImmutableSortedMap.naturalOrder();
    byProperty.putAll(propertyValueIntervals.asSortedMap());
    return byProperty.build();
  }

  /**
   * Returns the (alias, canonical name) pairs of all used properties and property values, in the
   * natural order of the aliases.
   */
  public ImmutableList<Map.Entry<String, String>> usedPropertyValueAliases() {
    return ImmutableList.copyOf(computeUsedPropertyValueAliases().entrySet());
  }

  /** Builds the sorted alias to canonical name map for used binary and enumerated properties. */
  private ImmutableSortedMap<String, String> computeUsedPropertyValueAliases() {
    ImmutableSortedMap.Builder<String, String> aliases = ImmutableSortedMap.naturalOrder();
    putBinaryPropertyAliases(aliases);
    putEnumPropertyAliases(aliases);
    return aliases.build();
  }

  /** Maps every alias of a used binary property (except its own name) to the property name. */
  private void putBinaryPropertyAliases(ImmutableSortedMap.Builder<String, String> aliases) {
    for (String property : usedBinaryProperties()) {
      for (String alias : getPropertyAliases(property)) {
        if (Objects.equals(alias, property)) {
          continue;
        }
        aliases.put(alias, property);
      }
    }
  }

  /**
   * Maps every {@code nameAlias=valueAlias} combination of the used enumerated properties to its
   * canonical form; the General Category value {@code lc} is always included.
   */
  private void putEnumPropertyAliases(ImmutableSortedMap.Builder<String, String> aliases) {
    ImmutableMultimap.Builder<String, String> enumProperties = ImmutableMultimap.builder();
    enumProperties.putAll(usedEnumeratedProperties());
    enumProperties.put(NORMALIZED_GENERAL_CATEGORY, "lc");
    ImmutableMultimap<String, String> usedEnumProperties = enumProperties.build();

    for (String propName : usedEnumProperties.keySet()) {
      boolean valueOnly = isValueOnlyAliasProperty(propName);
      for (String propValue : usedEnumProperties.get(propName)) {
        Collection<String> valueAliases = getPropertyValueAliases(propName, propValue);
        String canonicalValue;
        if (valueOnly) {
          // Add value-only aliases for General Category and Script properties.
          canonicalValue = propValue;
          for (String valueAlias : valueAliases) {
            if (!Objects.equals(valueAlias, propValue)) {
              aliases.put(valueAlias, propValue);
            }
          }
        } else {
          canonicalValue = propName + '=' + propValue;
        }
        for (String nameAlias : getPropertyAliases(propName)) {
          // NOTE(review): this check uses Version#equals, whereas the other version checks of
          // this class compare major.minor only -- confirm that this is intended.
          if (nameAlias.equals("blk") && version.equals(Versions.VERSION_3_2)) {
            // TODO(regisd) Can we remove this hack?
            // Ugly hack https://github.com/jflex-de/jflex/pull/828#issuecomment-749690037
            continue;
          }
          for (String valueAlias : valueAliases) {
            // Both property names and values have self-aliases; when generating
            // all possible alias combinations, exclude the one that is the same
            // as the full property name + full property value, unless the
            // property is General Category or Script.
            boolean selfAlias =
                Objects.equals(nameAlias, propName) && Objects.equals(valueAlias, propValue);
            if (valueOnly || !selfAlias) {
              aliases.put(nameAlias + '=' + valueAlias, canonicalValue);
            }
          }
        }
      }
    }
  }

  /** Returns whether {@code propName} is the normalized Script or General Category name. */
  private static boolean isValueOnlyAliasProperty(String propName) {
    return Objects.equals(propName, NORMALIZED_SCRIPT)
        || Objects.equals(propName, NORMALIZED_GENERAL_CATEGORY);
  }

  /** Returns the canonical name of the value {@code propValue} of property {@code propName}. */
  public String getCanonicalPropertyValueName(String propName, String propValue) {
    return propertyValues.getCanonicalValueName(propName, propValue);
  }

  /** Returns the maximum caseless match partition size, as computed by the caseless matches. */
  public int maxCaselessMatchPartitionSize() {
    return caselessMatches.maxCaselessMatchPartitionSize();
  }

  /** Returns the unique caseless match partitions, as computed by the caseless matches. */
  public ImmutableCollection<SortedSet<Integer>> uniqueCaselessMatchPartitions() {
    return caselessMatches.uniqueCaselessMatchPartitions();
  }

  /**
   * Adds the UTR#18 compatibility properties {@code xdigit}, {@code alnum}, {@code blank}, {@code
   * graph} and {@code print}.
   *
   * <p>The order of the steps matters: {@code print} is built from the {@code graph} and {@code
   * blank} ranges that are added just before it.
   */
  public void addCompatibilityProperties() {
    if (isSameMajorMinor(Versions.VERSION_1_1)) {
      addCompatibilityHexDigit_1_1();
      addCompatibilityAlphaNumeric_1_1();
    }

    // add xdigit
    // UTR#18: \p{xdigit} = [\p{gc=Decimal_Number}\p{Hex_Digit}]
    // \p{gc=Decimal_Number} = \p{Nd} (available in all versions)
    addCompatibilityProperty("xdigit", asList("nd", "hexdigit"));

    // add alnum
    // UTR#18: \p{alnum} = [\p{alpha}\p{digit}]
    // \p{alpha} = \p{Alphabetic} (available in all versions except 1.1)
    addCompatibilityProperty("alnum", asList("alphabetic", "nd"));

    // UTR#18: \p{blank} = [\p{Whitespace}
    //           -- [\N{LF} \N{VT} \N{FF} \N{CR} \N{NEL}
    //               \p{gc=Line_Separator} \p{gc=Paragraph_Separator}]]
    ImmutableList<CodepointRange> blank = createBlankSet();
    propertyValueIntervals.addAllRanges("blank", blank);

    // UTR#18: \p{graph} = [^\p{space}\p{gc=Control}\p{gc=Surrogate}\p{gc=Unassigned}]
    ImmutableList<CodepointRange> graph = createGraphSet();
    propertyValueIntervals.addAllRanges("graph", graph);

    // UTR#18: \p{print} = [\p{graph}\p{blank} -- \p{cntrl}]
    // \p{cntrl} = \p{gc=Control} = \p{gc=Cc} = \p{Cc}
    ImmutableList<CodepointRange> print = createPrintSet();
    propertyValueIntervals.addAllRanges("print", print);
  }

  /** Returns whether the version of this data has the same major.minor as {@code other}. */
  private boolean isSameMajorMinor(Version other) {
    return Version.MAJOR_MINOR_COMPARATOR.compare(version, other) == 0;
  }

  /**
   * Adds the hexadecimal letters to {@code xdigit} for Unicode 1.1, since the "Hex_Digit"
   * property was only introduced in Unicode 2.0.
   *
   * <p>Hex_Digit contains {@code 0-9 A-F}, fullwidth and halfwidth, upper and lowercase. <code>
   * \p{Nd}</code> contains all required digit forms, so there is no need to add them here.
   * Unicode 1.1 doesn't define HALFWIDTH latin letters (or digits).
   */
  private void addCompatibilityHexDigit_1_1() {
    ImmutableList<CodepointRange> hexLetters =
        ImmutableList.of(
            CodepointRange.create('A', 'F'),
            CodepointRange.create('a', 'f'),
            // FF21..FF26;FULLWIDTH LATIN CAPITAL LETTER A..F
            CodepointRange.create(0xFF21, 0xFF26),
            // FF41..FF46;FULLWIDTH LATIN SMALL LETTER A..F
            CodepointRange.create(0xFF41, 0xFF46));
    propertyValueIntervals.addAllRanges("xdigit", hexLetters);
  }

  /**
   * Adds to {@code newPropertyName} the union of the ranges of the given existing properties.
   *
   * @param newPropertyName the property receiving the ranges.
   * @param existingPropertyNames the properties whose ranges are merged.
   */
  private void addCompatibilityProperty(
      String newPropertyName, List<String> existingPropertyNames) {
    CodepointRangeSet.Builder union = CodepointRangeSet.builder();
    for (String existingName : existingPropertyNames) {
      union.addAllImmutable(propertyValueIntervals.getRanges(existingName));
    }
    propertyValueIntervals.addAllRanges(newPropertyName, union.build().ranges());
  }

  /**
   * For Unicode 1.1, substitute "Letter" (L) for "Alphabetic". <code>
   * \p{L} = [\p{Ll}\p{Lu}\p{Lt}\p{Lm}\p{Lo}]</code>
   */
  private void addCompatibilityAlphaNumeric_1_1() {
    addCompatibilityProperty("alnum", asList("ll", "lu", "lt", "lm", "lo"));
  }

  /** Builds {@code blank}: the whitespace ranges minus line-breaking characters. */
  private ImmutableList<CodepointRange> createBlankSet() {
    CodepointRangeSet.Builder blank = CodepointRangeSet.builder();
    blank.addAllImmutable(getWhitespaceRange());
    // Subtract: [\N{LF}\N{VT}\N{FF}\N{CR}] = [U+000A-U+000D]
    blank.substract(CodepointRange.create(0xA, 0xD));
    // Subtract: \N{NEL}
    blank.substract(CodepointRange.create(0x85, 0x85));
    // Subtract: \p{gc=Line_Separator}
    blank.substractAll(propertyValueIntervals.getRanges("zl"));
    // Subtract: \p{gc=Paragraph_Separator}
    blank.substractAll(propertyValueIntervals.getRanges("zp"));
    return blank.build().ranges();
  }

  /**
   * Builds {@code graph}: every code point up to {@link #maximumCodePoint()}, minus whitespace,
   * controls, unassigned code points and the range U+D800..U+DFFF.
   */
  private ImmutableList<CodepointRange> createGraphSet() {
    CodepointRangeSet.Builder graph = CodepointRangeSet.builder();
    graph.add(MutableCodepointRange.create(0x0, maximumCodePoint));
    graph.substractAll(getWhitespaceRange());
    // Subtract: \p{gc=Control}
    graph.substractAll(propertyValueIntervals.getRanges("cc"));
    // Subtract: \p{gc=Unassigned}
    graph.substractAll(propertyValueIntervals.getRanges("cn"));
    graph.substract(CodepointRange.create(0xD800, 0xDFFF));
    return graph.build().ranges();
  }

  /** Builds {@code print}: the {@code graph} and {@code blank} ranges, minus controls. */
  private ImmutableList<CodepointRange> createPrintSet() {
    CodepointRangeSet.Builder print = CodepointRangeSet.builder();
    print.addAllImmutable(propertyValueIntervals.getRanges("graph"));
    print.addAllImmutable(propertyValueIntervals.getRanges("blank"));
    // Subtract: \p{gc=Control}
    print.substractAll(propertyValueIntervals.getRanges("cc"));
    return print.build().ranges();
  }

  /**
   * Returns the ranges of the {@code whitespace} property or, when there are none, the ranges of
   * {@code zs}.
   *
   * @throws IllegalStateException if there are no {@code whitespace} ranges and the version is
   *     not Unicode 1.1.
   */
  private ImmutableList<CodepointRange> getWhitespaceRange() {
    ImmutableList<CodepointRange> whitespace = propertyValueIntervals.getRanges("whitespace");
    if (!whitespace.isEmpty()) {
      return whitespace;
    }
    checkState(
        isSameMajorMinor(Versions.VERSION_1_1),
        "No whitespace property in Unicode %s. Was PropList parsed?",
        version);
    // For Unicode 1.1, substitute "Space_separator" (Zs) for "Whitespace"
    return propertyValueIntervals.getRanges("zs");
  }

  /**
   * Returns whether {@code codepoint} belongs to the property {@code propName}.
   *
   * <p>The name is passed as is to the property value intervals, without canonicalization.
   */
  public boolean codePointInProperty(int codepoint, String propName) {
    return propertyValueIntervals.codePointInProperty(codepoint, propName);
  }

  /**
   * Workaround to remove incorrect codepoint in {@code block=arabicpresentationformsb} property, in
   * Unicode 2.0.
   *
   * <p>Character U+FEFF is assigned to two different blocks.
   *
   * <pre>{@code
   * FE70; FEFF; Arabic Presentation Forms-B
   * [...]
   * FEFF; FEFF; Specials
   * }</pre>
   *
   * <p>Since the single char in the second range (U+FEFF) is not an Arabic character, but rather
   * the zero-width no-break space char, the FE70..FEFF block should be shortened to exclude this
   * char. This reflects the correction made in all following Unicode versions.
   *
   * <p>This method does nothing for any other Unicode version.
   *
   * <p>See https://github.com/jflex-de/jflex/issues/835
   */
  public void hackUnicode_2_0() {
    if (!isSameMajorMinor(Versions.VERSION_2_0)) {
      return;
    }
    propertyValueIntervals.removeEnumPropertyPoint("Block", "arabicpresentationformsb", 0xfeff);
  }
}