java/de/jflex/ucd_generator/ucd/UnicodeData.java (230 lines of code) (raw):
/*
* Copyright (C) 2009-2013 Steve Rowe <sarowe@gmail.com>
* Copyright (C) 2019-2020 Google, LLC.
*
* SPDX-License-Identifier: BSD-3-Clause
*/
package de.jflex.ucd_generator.ucd;
import static com.google.common.base.Preconditions.checkState;
import static de.jflex.ucd_generator.ucd.PropertyNames.NORMALIZED_GENERAL_CATEGORY;
import static de.jflex.ucd_generator.ucd.PropertyNames.NORMALIZED_SCRIPT;
import static java.util.Arrays.asList;
import com.google.common.collect.ImmutableCollection;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMultimap;
import com.google.common.collect.ImmutableSortedMap;
import de.jflex.ucd.CodepointRange;
import de.jflex.ucd.Versions;
import de.jflex.version.Version;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.SortedSet;
public class UnicodeData {
/** Registry of property names and aliases; used to resolve names to their canonical form. */
private final PropertyNames propertyNames = new PropertyNames();
/** Registry of property values and their aliases, addressed by property name and value. */
private final PropertyValues propertyValues = new PropertyValues();
/** Maps Unicode property values to the associated set of code point ranges. */
private final PropertyValueIntervals propertyValueIntervals =
new PropertyValueIntervals(propertyValues);
/** Collects the case mappings passed to {@link #addCaselessMatches}. */
private final CaselessMatches caselessMatches = new CaselessMatches();
/**
 * Largest code point, as set through {@link #maximumCodePoint(int)}; it is 0 until that setter is
 * called. Used as the upper bound when building the "graph" compatibility set.
 */
private int maximumCodePoint;
/** Unicode version this instance was created for. */
private final Version version;
/**
 * Creates an instance with empty registries for the given Unicode version.
 *
 * @param version the Unicode version; it drives the version-specific workarounds in this class
 */
public UnicodeData(Version version) {
this.version = version;
}
/**
 * Forwards the case mappings of one code point, unchanged, to the caseless-match registry.
 *
 * @param codePoint the code point the mappings belong to
 * @param uppercaseMapping its uppercase mapping, in the textual form expected by {@link
 *     CaselessMatches#addCaselessMatches}
 * @param lowercaseMapping its lowercase mapping, same form
 * @param titlecaseMapping its titlecase mapping, same form
 */
public void addCaselessMatches(
int codePoint, String uppercaseMapping, String lowercaseMapping, String titlecaseMapping) {
caselessMatches.addCaselessMatches(
codePoint, uppercaseMapping, lowercaseMapping, titlecaseMapping);
}
/**
 * Sets the maximum code point. No validation is performed on the value.
 *
 * @param maximumCodePoint the value later returned by {@link #maximumCodePoint()}
 */
public void maximumCodePoint(int maximumCodePoint) {
this.maximumCodePoint = maximumCodePoint;
}
/** Returns the Unicode version given to the constructor. */
public Version version() {
return version;
}
/**
 * Resolves a property name or alias through the property-name registry.
 *
 * @param propertyAlias the name or alias to resolve
 * @return whatever {@link PropertyNames#getCanonicalPropertyName} returns for {@code
 *     propertyAlias}
 */
public String getCanonicalPropertyName(String propertyAlias) {
return propertyNames.getCanonicalPropertyName(propertyAlias);
}
/**
 * Registers {@code alias} as an alias of the property {@code normalizedLongName}.
 *
 * <p>Note the argument order: this method takes the alias first, whereas the underlying {@link
 * PropertyNames#putPropertyAlias} is called with the long name first.
 *
 * @param alias the alias to register
 * @param normalizedLongName the long property name the alias refers to
 */
public void addPropertyAlias(String alias, String normalizedLongName) {
propertyNames.putPropertyAlias(normalizedLongName, alias);
}
/**
 * Forwards a set of aliases for one property value to the property-value registry.
 *
 * @param normalizedPropertyName the property the value belongs to
 * @param normalizedPropertyValue the value being aliased
 * @param aliases the aliases to associate with that value
 */
public void addPropertyValueAliases(
String normalizedPropertyName, String normalizedPropertyValue, Set<String> aliases) {
propertyValues.addPropertyValueAliases(
normalizedPropertyName, normalizedPropertyValue, aliases);
}
/**
 * Looks up the aliases of a property value in the property-value registry.
 *
 * @param propName the property name, passed through as given (it is not canonicalized here)
 * @param propValue the property value
 * @return the aliases reported by {@link PropertyValues#getPropertyValueAliases}
 */
public Collection<String> getPropertyValueAliases(String propName, String propValue) {
return propertyValues.getPropertyValueAliases(propName, propValue);
}
/**
 * Copies the value aliases of {@code sourceProperty} to {@code destProperty} in the
 * property-value registry, then registers {@code destProperty} as an alias of itself in the
 * property-name registry.
 *
 * @param sourceProperty the property whose value aliases are copied
 * @param destProperty the property that receives them
 */
public void copyPropertyValueAliases(String sourceProperty, String destProperty) {
propertyValues.copyPropertyValueAliases(sourceProperty, destProperty);
// Self-alias, so that the destination property is known to the name registry.
propertyNames.putPropertyAlias(destProperty, destProperty);
}
/**
 * Looks up the aliases of a property name in the property-name registry.
 *
 * @param propName the property name, passed through as given
 * @return the aliases reported by {@link PropertyNames#getPropertyAliases}
 */
public Collection<String> getPropertyAliases(String propName) {
return propertyNames.getPropertyAliases(propName);
}
/**
 * Records a code point interval for a binary property.
 *
 * <p>The property name is resolved through the property-name registry before the interval is
 * handed to the interval registry.
 *
 * @param propertyName the name (or alias) of the binary property
 * @param start first bound of the interval, forwarded unchanged
 * @param end second bound of the interval, forwarded unchanged
 */
public void addBinaryPropertyInterval(String propertyName, int start, int end) {
  String canonicalName = propertyNames.getCanonicalPropertyName(propertyName);
  propertyValueIntervals.addBinaryPropertyInterval(canonicalName, start, end);
}
/**
 * Convenience overload: records {@code interval.start()} to {@code interval.end()} for the given
 * binary property via {@link #addBinaryPropertyInterval(String, int, int)}.
 *
 * @param propertyName the name (or alias) of the binary property
 * @param interval the range whose bounds are recorded
 */
public void addBinaryPropertyInterval(String propertyName, CodepointRange interval) {
addBinaryPropertyInterval(propertyName, interval.start(), interval.end());
}
/**
 * Records a code point interval for one value of an enumerated property.
 *
 * <p>Only the property name is resolved through the property-name registry; the value is passed
 * on exactly as given.
 *
 * @param propName the name (or alias) of the enumerated property
 * @param propValue the property value the interval belongs to
 * @param start first bound of the interval, forwarded unchanged
 * @param end second bound of the interval, forwarded unchanged
 */
public void addEnumPropertyInterval(String propName, String propValue, int start, int end) {
  String canonicalName = propertyNames.getCanonicalPropertyName(propName);
  propertyValueIntervals.addEnumPropertyInterval(canonicalName, propValue, start, end);
}
/**
 * Returns the interval registry's set of used binary property names.
 *
 * <p>The result is a read-only view backed by the registry's own set: later registrations remain
 * visible through it, but callers cannot modify the registry's internal state through the
 * returned reference (mutating calls throw {@link UnsupportedOperationException}).
 *
 * @return an unmodifiable view of the used binary property names
 */
public Set<String> usedBinaryProperties() {
  return Collections.unmodifiableSet(propertyValueIntervals.usedBinaryProperties);
}
/**
 * Returns the used enumerated properties as reported by the interval registry.
 *
 * <p>Within this class the multimap is read as property name to property values (see {@code
 * computeUsedPropertyValueAliases}).
 */
public ImmutableMultimap<String, String> usedEnumeratedProperties() {
return propertyValueIntervals.usedEnumeratedProperties();
}
/**
 * Asks the interval registry whether the given enumerated property is in use.
 *
 * @param category the name to look up, passed through as given (it is not canonicalized here)
 * @return the answer of {@link PropertyValueIntervals#hasUsedEnumeratedProperty}
 */
public boolean hasUsedEnumeratedProperty(String category) {
return propertyValueIntervals.hasUsedEnumeratedProperty(category);
}
/**
 * Returns the code point ranges recorded for a property.
 *
 * @param propName the property name or alias; it is resolved through the property-name registry
 *     before the lookup
 * @return the ranges held by the interval registry under the resolved name
 */
public ImmutableList<CodepointRange> getPropertyValueIntervals(String propName) {
  String canonicalName = propertyNames.getCanonicalPropertyName(propName);
  return propertyValueIntervals.getRanges(canonicalName);
}
/** Returns the value last set with {@link #maximumCodePoint(int)}, or 0 if it was never set. */
public int maximumCodePoint() {
return maximumCodePoint;
}
/**
 * Returns the code point ranges of every property, keyed by property.
 *
 * <p>The entries come from the interval registry and are re-collected into a map sorted by the
 * natural ordering of the keys.
 */
public ImmutableSortedMap<String, CodepointRangeSet> intervals() {
  ImmutableSortedMap.Builder<String, CodepointRangeSet> byProperty =
      ImmutableSortedMap.naturalOrder();
  byProperty.putAll(propertyValueIntervals.asSortedMap());
  return byProperty.build();
}
/**
 * Returns the alias-to-canonical-name pairs for all used properties and property values.
 *
 * <p>The list holds the entries of the sorted map built by {@code
 * computeUsedPropertyValueAliases()}, in that map's iteration order (sorted by alias).
 */
public ImmutableList<Map.Entry<String, String>> usedPropertyValueAliases() {
  ImmutableSortedMap<String, String> aliases = computeUsedPropertyValueAliases();
  return ImmutableList.copyOf(aliases.entrySet());
}
/**
 * Builds the map from every usable alias to its canonical name.
 *
 * <ul>
 *   <li>For each used binary property, every alias that differs from the property name maps to
 *       the property name.
 *   <li>For each used enumerated property value (plus General Category "lc", which is always
 *       added), every {@code nameAlias=valueAlias} combination maps to the canonical value. The
 *       canonical value is {@code propName=propValue}, or the bare {@code propValue} for General
 *       Category and Script, which additionally get value-only aliases.
 * </ul>
 *
 * @return the aliases, sorted by natural order of the alias
 */
private ImmutableSortedMap<String, String> computeUsedPropertyValueAliases() {
  ImmutableSortedMap.Builder<String, String> aliasToCanonical = ImmutableSortedMap.naturalOrder();

  // Binary properties: only aliases that differ from the property's own name are recorded.
  for (String binaryProperty : usedBinaryProperties()) {
    for (String nameAlias : getPropertyAliases(binaryProperty)) {
      if (Objects.equals(nameAlias, binaryProperty)) {
        continue;
      }
      aliasToCanonical.put(nameAlias, binaryProperty);
    }
  }

  // Enumerated properties in use, with the General Category value "lc" always appended.
  ImmutableMultimap.Builder<String, String> enumPropertiesBuilder = ImmutableMultimap.builder();
  enumPropertiesBuilder.putAll(usedEnumeratedProperties());
  enumPropertiesBuilder.put(NORMALIZED_GENERAL_CATEGORY, "lc");
  ImmutableMultimap<String, String> enumProperties = enumPropertiesBuilder.build();

  for (String propName : enumProperties.keySet()) {
    // General Category and Script values may be used without the "property=" prefix.
    boolean allowsValueOnly =
        Objects.equals(propName, NORMALIZED_SCRIPT)
            || Objects.equals(propName, NORMALIZED_GENERAL_CATEGORY);

    for (String propValue : enumProperties.get(propName)) {
      Collection<String> valueAliases = getPropertyValueAliases(propName, propValue);

      String canonicalValue;
      if (allowsValueOnly) {
        canonicalValue = propValue;
        // Add value-only aliases for General Category and Script properties.
        for (String valueAlias : valueAliases) {
          if (Objects.equals(valueAlias, propValue)) {
            continue;
          }
          aliasToCanonical.put(valueAlias, propValue);
        }
      } else {
        canonicalValue = propName + '=' + propValue;
      }

      for (String nameAlias : getPropertyAliases(propName)) {
        // TODO(regisd) Can we remove this hack?
        // Ugly hack https://github.com/jflex-de/jflex/pull/828#issuecomment-749690037
        // NOTE(review): this check uses Version.equals while the other version checks in this
        // class use MAJOR_MINOR_COMPARATOR -- confirm the difference is intended.
        if (nameAlias.equals("blk") && version.equals(Versions.VERSION_3_2)) {
          continue;
        }
        for (String valueAlias : valueAliases) {
          // Both property names and values have self-aliases. Among all alias combinations,
          // the one equal to full property name + full property value is left out, except for
          // General Category and Script where it is kept.
          boolean isFullSelfAlias =
              Objects.equals(nameAlias, propName) && Objects.equals(valueAlias, propValue);
          if (isFullSelfAlias && !allowsValueOnly) {
            continue;
          }
          aliasToCanonical.put(nameAlias + '=' + valueAlias, canonicalValue);
        }
      }
    }
  }
  return aliasToCanonical.build();
}
/**
 * Resolves a property value through the property-value registry.
 *
 * @param propName the property name, passed through as given
 * @param propValue the value or value alias to resolve
 * @return the result of {@link PropertyValues#getCanonicalValueName}
 */
public String getCanonicalPropertyValueName(String propName, String propValue) {
return propertyValues.getCanonicalValueName(propName, propValue);
}
/** Returns the maximum caseless-match partition size, as computed by the caseless registry. */
public int maxCaselessMatchPartitionSize() {
return caselessMatches.maxCaselessMatchPartitionSize();
}
/**
 * Returns the unique caseless-match partitions from the caseless registry; each partition is a
 * sorted set of integers (presumably code points -- confirm against {@code CaselessMatches}).
 */
public ImmutableCollection<SortedSet<Integer>> uniqueCaselessMatchPartitions() {
return caselessMatches.uniqueCaselessMatchPartitions();
}
/**
 * Registers the UTR#18 compatibility properties "xdigit", "alnum", "blank", "graph" and "print",
 * derived from the properties already present in the interval registry.
 *
 * <p>For Unicode 1.1, substitutes are registered first for the properties that version lacks.
 * The order of the steps matters: "print" is computed from the "graph" and "blank" ranges that
 * were registered just before it.
 */
public void addCompatibilityProperties() {
  boolean isUnicode_1_1 =
      Version.MAJOR_MINOR_COMPARATOR.compare(version, Versions.VERSION_1_1) == 0;
  if (isUnicode_1_1) {
    addCompatibilityHexDigit_1_1();
    addCompatibilityAlphaNumeric_1_1();
  }

  // xdigit -- UTR#18: \p{xdigit} = [\p{gc=Decimal_Number}\p{Hex_Digit}]
  //   \p{gc=Decimal_Number} = \p{Nd} (available in all versions)
  addCompatibilityProperty("xdigit", asList("nd", "hexdigit"));

  // alnum -- UTR#18: \p{alnum} = [\p{alpha}\p{digit}]
  //   \p{alpha} = \p{Alphabetic} (available in all versions except 1.1)
  addCompatibilityProperty("alnum", asList("alphabetic", "nd"));

  // blank -- UTR#18: \p{blank} = [\p{Whitespace}
  //                    -- [\N{LF} \N{VT} \N{FF} \N{CR} \N{NEL}
  //                        \p{gc=Line_Separator} \p{gc=Paragraph_Separator}]]
  ImmutableList<CodepointRange> blankRanges = createBlankSet();
  propertyValueIntervals.addAllRanges("blank", blankRanges);

  // graph -- UTR#18: \p{graph} = [^\p{space}\p{gc=Control}\p{gc=Surrogate}\p{gc=Unassigned}]
  ImmutableList<CodepointRange> graphRanges = createGraphSet();
  propertyValueIntervals.addAllRanges("graph", graphRanges);

  // print -- UTR#18: \p{print} = [\p{graph}\p{blank} -- \p{cntrl}]
  //   \p{cntrl} = \p{gc=Control} = \p{gc=Cc} = \p{Cc}
  ImmutableList<CodepointRange> printRanges = createPrintSet();
  propertyValueIntervals.addAllRanges("print", printRanges);
}
/**
 * Unicode 1.1 substitute for "Hex_Digit", a property that was only introduced in Unicode 2.0.
 *
 * <p>Hex_Digit contains {@code 0-9 A-F}, fullwidth and halfwidth, upper and lowercase. This
 * method adds only the letter ranges, directly to the "xdigit" compatibility property: the digit
 * forms are contributed by {@code \p{Nd}} when {@code addCompatibilityProperties()} completes
 * "xdigit". Unicode 1.1 doesn't define HALFWIDTH latin letters (or digits), so none are listed.
 */
private void addCompatibilityHexDigit_1_1() {
  CodepointRange asciiUppercase = CodepointRange.create('A', 'F');
  CodepointRange asciiLowercase = CodepointRange.create('a', 'f');
  // FF21..FF26;FULLWIDTH LATIN CAPITAL LETTER A..F
  CodepointRange fullwidthUppercase = CodepointRange.create(0xFF21, 0xFF26);
  // FF41..FF46;FULLWIDTH LATIN SMALL LETTER A..F
  CodepointRange fullwidthLowercase = CodepointRange.create(0xFF41, 0xFF46);

  ImmutableList<CodepointRange> hexLetters =
      ImmutableList.of(asciiUppercase, asciiLowercase, fullwidthUppercase, fullwidthLowercase);
  propertyValueIntervals.addAllRanges("xdigit", hexLetters);
}
/**
 * Registers ranges for {@code newPropertyName}, built by collecting the ranges of all the listed
 * existing properties into a single range set.
 *
 * @param newPropertyName the property to add the collected ranges to
 * @param existingPropertyNames the properties whose ranges are collected; the names are used as
 *     given for the registry lookup
 */
private void addCompatibilityProperty(
    String newPropertyName, List<String> existingPropertyNames) {
  CodepointRangeSet.Builder collected = CodepointRangeSet.builder();
  for (String sourceProperty : existingPropertyNames) {
    ImmutableList<CodepointRange> sourceRanges = propertyValueIntervals.getRanges(sourceProperty);
    collected.addAllImmutable(sourceRanges);
  }
  ImmutableList<CodepointRange> combinedRanges = collected.build().ranges();
  propertyValueIntervals.addAllRanges(newPropertyName, combinedRanges);
}
/**
 * For Unicode 1.1, substitute "Letter" (L) for "Alphabetic" when building "alnum".
 *
 * <p>{@code \p{L} = [\p{Ll}\p{Lu}\p{Lt}\p{Lm}\p{Lo}]}, so the ranges of these five general
 * categories are registered under "alnum".
 */
private void addCompatibilityAlphaNumeric_1_1() {
addCompatibilityProperty("alnum", asList("ll", "lu", "lt", "lm", "lo"));
}
/**
 * Computes the ranges of the "blank" compatibility property: the whitespace ranges minus the
 * line-breaking characters.
 *
 * @return whitespace without U+000A..U+000D, U+0085, {@code \p{Zl}} and {@code \p{Zp}}
 */
private ImmutableList<CodepointRange> createBlankSet() {
  CodepointRangeSet.Builder blank = CodepointRangeSet.builder();
  blank.addAllImmutable(getWhitespaceRange());

  // Remove [\N{LF}\N{VT}\N{FF}\N{CR}] = [U+000A-U+000D]
  CodepointRange lineFeedToCarriageReturn = CodepointRange.create(0xA, 0xD);
  blank.substract(lineFeedToCarriageReturn);

  // Remove \N{NEL}
  CodepointRange nextLine = CodepointRange.create(0x85, 0x85);
  blank.substract(nextLine);

  // Remove \p{gc=Line_Separator}, then \p{gc=Paragraph_Separator}
  blank.substractAll(propertyValueIntervals.getRanges("zl"));
  blank.substractAll(propertyValueIntervals.getRanges("zp"));

  return blank.build().ranges();
}
/**
 * Computes the ranges of the "graph" compatibility property.
 *
 * <p>Starts from the full range {@code 0..maximumCodePoint} and removes whitespace, {@code
 * \p{Cc}}, {@code \p{Cn}} and the surrogate range U+D800..U+DFFF.
 *
 * @return the remaining ranges
 */
private ImmutableList<CodepointRange> createGraphSet() {
  CodepointRangeSet.Builder graph = CodepointRangeSet.builder();

  // Everything ...
  graph.add(MutableCodepointRange.create(0x0, maximumCodePoint));

  // ... except whitespace, \p{gc=Control} and \p{gc=Unassigned} ...
  graph.substractAll(getWhitespaceRange());
  graph.substractAll(propertyValueIntervals.getRanges("cc"));
  graph.substractAll(propertyValueIntervals.getRanges("cn"));

  // ... and the surrogates, given here as a fixed range.
  CodepointRange surrogates = CodepointRange.create(0xD800, 0xDFFF);
  graph.substract(surrogates);

  return graph.build().ranges();
}
/**
 * Computes the ranges of the "print" compatibility property: "graph" plus "blank", minus {@code
 * \p{Cc}}.
 *
 * <p>Reads the "graph" and "blank" ranges from the interval registry, so both must have been
 * registered before this is called.
 *
 * @return the resulting ranges
 */
private ImmutableList<CodepointRange> createPrintSet() {
  ImmutableList<CodepointRange> graph = propertyValueIntervals.getRanges("graph");
  ImmutableList<CodepointRange> blank = propertyValueIntervals.getRanges("blank");
  ImmutableList<CodepointRange> control = propertyValueIntervals.getRanges("cc");

  CodepointRangeSet.Builder print = CodepointRangeSet.builder();
  print.addAllImmutable(graph);
  print.addAllImmutable(blank);
  print.substractAll(control); // \p{gc=Control}
  return print.build().ranges();
}
/**
 * Returns the ranges of the "whitespace" property, falling back to "zs" for Unicode 1.1.
 *
 * @return the "whitespace" ranges when there are any; otherwise the "zs" ranges
 * @throws IllegalStateException if there are no "whitespace" ranges and the version is not
 *     Unicode 1.1
 */
private ImmutableList<CodepointRange> getWhitespaceRange() {
  ImmutableList<CodepointRange> whitespace = propertyValueIntervals.getRanges("whitespace");
  if (!whitespace.isEmpty()) {
    return whitespace;
  }
  // Only Unicode 1.1 is expected to have no whitespace property.
  boolean isUnicode_1_1 =
      Version.MAJOR_MINOR_COMPARATOR.compare(version, Versions.VERSION_1_1) == 0;
  checkState(
      isUnicode_1_1, "No whitespace property in Unicode %s. Was PropList parsed?", version);
  // For Unicode 1.1, substitute "Space_separator" (Zs) for "Whitespace"
  return propertyValueIntervals.getRanges("zs");
}
/**
 * Asks the interval registry whether a code point belongs to a property.
 *
 * @param codepoint the code point to test
 * @param propName the property name, passed through as given (unlike {@link
 *     #getPropertyValueIntervals}, it is not canonicalized here)
 * @return the answer of {@link PropertyValueIntervals#codePointInProperty}
 */
public boolean codePointInProperty(int codepoint, String propName) {
return propertyValueIntervals.codePointInProperty(codepoint, propName);
}
/**
 * Workaround that removes an incorrect code point from the {@code block=arabicpresentationformsb}
 * property in Unicode 2.0. Does nothing for any other version.
 *
 * <p>In Unicode 2.0, character U+FEFF is assigned to two different blocks:
 *
 * <pre>{@code
 * FE70; FEFF; Arabic Presentation Forms-B
 * [...]
 * FEFF; FEFF; Specials
 * }</pre>
 *
 * <p>The single character of the second range (U+FEFF) is not an Arabic character but the
 * zero-width no-break space, so the FE70..FEFF block is shortened to exclude it. This reflects
 * the correction made in all following Unicode versions.
 *
 * <p>See https://github.com/jflex-de/jflex/issues/835
 */
public void hackUnicode_2_0() {
  boolean isUnicode_2_0 =
      Version.MAJOR_MINOR_COMPARATOR.compare(version, Versions.VERSION_2_0) == 0;
  if (!isUnicode_2_0) {
    return;
  }
  // NOTE(review): the property name is passed as "Block" (capitalised) while other lookups in
  // this class use lower-case normalized names -- confirm removeEnumPropertyPoint normalizes it.
  propertyValueIntervals.removeEnumPropertyPoint("Block", "arabicpresentationformsb", 0xfeff);
}
}