java/de/jflex/ucd_generator/scanner/UcdScanner.java (243 lines of code) (raw):
/*
* Copyright (C) 2020 Google, LLC.
* SPDX-License-Identifier: BSD-3-Clause
*/
package de.jflex.ucd_generator.scanner;
import static com.google.common.base.Preconditions.checkNotNull;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Sets;
import com.google.common.collect.Sets.SetView;
import com.google.common.io.Files;
import de.jflex.ucd.UcdFileType;
import de.jflex.ucd.UcdVersion;
import de.jflex.ucd.Versions;
import de.jflex.ucd_generator.ucd.UnicodeData;
import de.jflex.version.Version;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
public class UcdScanner {
private static final boolean DEBUG = false;
private final UcdVersion ucdVersion;
final UnicodeData unicodeData;
public UcdScanner(UcdVersion ucdVersion) {
this.ucdVersion = ucdVersion;
this.unicodeData = new UnicodeData(ucdVersion.version());
}
/** Scans all UCD data files. */
public UnicodeData scan() throws UcdScannerException {
try {
scanPropertyAliases();
scanPropertyValueAliases();
cloneScriptsToScriptExtensions();
scanUnicodeData();
scanPropList();
scanDerivedCoreProperties();
scanScripts();
scanScriptExtensions();
scanBlocks();
scanLineBreak();
scanGraphemeBreakProperty();
scanSentenceBreakProperty();
scanWordBreakProperty();
scanDerivedAge();
scanEmoji();
unicodeData.addCompatibilityProperties();
unicodeData.hackUnicode_2_0();
return unicodeData;
} catch (Throwable thr) {
String cause = (thr.getMessage() != null) ? thr.getMessage() : "Unknown error";
throw new UcdScannerException(
"Failed to emit Unicode properties for version " + ucdVersion.version() + " : " + cause,
thr);
}
}
void scanPropertyAliases() throws IOException {
File file = ucdVersion.getFile(UcdFileType.PropertyAliases);
if (file != null) {
assertFileExists(file);
PropertyAliasesScanner scanner =
new PropertyAliasesScanner(Files.newReader(file, StandardCharsets.UTF_8), unicodeData);
scanner.scan();
}
}
void scanPropertyValueAliases() throws IOException {
File file = ucdVersion.getFile(UcdFileType.PropertyValueAliases);
if (file != null) {
assertFileExists(file);
PropertyValueAliasesScanner scanner =
new PropertyValueAliasesScanner(
Files.newReader(file, StandardCharsets.UTF_8), unicodeData);
scanner.scan();
}
}
private void cloneScriptsToScriptExtensions() {
// Clone Script/sc property value aliases => Script_Extensions/scx
String scPropName = unicodeData.getCanonicalPropertyName("Script");
String scxPropName = unicodeData.getCanonicalPropertyName("Script_Extensions");
unicodeData.copyPropertyValueAliases(scPropName, scxPropName);
}
void scanUnicodeData() throws IOException {
File file = ucdVersion.getFile(UcdFileType.UnicodeData);
checkNotNull(file, "UnicodeData.txt not defined in UCD %s", ucdVersion);
assertFileExists(file);
UnicodeDataScanner scanner =
new UnicodeDataScanner(
Files.newReader(file, StandardCharsets.UTF_8), ucdVersion, unicodeData);
scanner.scan();
}
void scanPropList() throws IOException {
if (Version.MAJOR_MINOR_COMPARATOR.compare(ucdVersion.version(), Versions.VERSION_3_1) < 0) {
File file = ucdVersion.getFile(UcdFileType.PropList);
if (file != null) {
ArchaicPropListScanner scanner =
new ArchaicPropListScanner(Files.newReader(file, StandardCharsets.UTF_8), unicodeData);
scanner.scan();
}
} else {
scanBinaryProperties(unicodeData, ucdVersion.getFile(UcdFileType.PropList));
}
}
void scanDerivedCoreProperties() throws IOException {
scanBinaryProperties(unicodeData, ucdVersion.getFile(UcdFileType.DerivedCoreProperties));
}
void scanScripts() throws IOException {
scanEnumeratedProperty(
unicodeData,
ucdVersion.getFile(UcdFileType.Scripts),
/*defaultPropertyName=*/ "Script",
getDefaultScriptValue());
}
/**
* Returns the default Script property value.
*
* <ul>
* <li>From Unicode 5.0 onward, the default Script property value is "Unknown".
* <li>Prior to Unicode 5.0, the default Script property value is "Common".
* <li>Prior to Unicode 3.1, Scripts(-X.X.X).txt did not exist.
* </ul>
*/
private String getDefaultScriptValue() {
if (Version.MAJOR_MINOR_COMPARATOR.compare(ucdVersion.version(), Versions.VERSION_5_0) < 0) {
return "Common";
}
return "Unknown";
}
void scanScriptExtensions() throws IOException {
File file = ucdVersion.getFile(UcdFileType.ScriptExtensions);
if (file != null) {
assertFileExists(file);
ScriptExtensionsScanner scanner =
new ScriptExtensionsScanner(Files.newReader(file, StandardCharsets.UTF_8), unicodeData);
scanner.scan();
}
}
void scanBlocks() throws IOException {
if (Version.MAJOR_MINOR_COMPARATOR.compare(ucdVersion.version(), Versions.VERSION_3_1) < 0) {
File file = ucdVersion.getFile(UcdFileType.Blocks);
if (file != null) {
ArchaicBlocksScanner scanner =
new ArchaicBlocksScanner(Files.newReader(file, StandardCharsets.UTF_8), unicodeData);
scanner.scan();
}
} else {
scanEnumeratedProperty(
unicodeData,
ucdVersion.getFile(UcdFileType.Blocks),
/*defaultPropertyName=*/ "Block",
"No_Block");
}
}
void scanLineBreak() throws IOException {
if (Version.MAJOR_MINOR_COMPARATOR.compare(ucdVersion.version(), Versions.VERSION_3_1) < 0) {
File file = ucdVersion.getFile(UcdFileType.LineBreak);
if (file != null) {
assertFileExists(file);
ArchaicLineBreakScanner scanner =
new ArchaicLineBreakScanner(Files.newReader(file, StandardCharsets.UTF_8), unicodeData);
scanner.scan();
}
} else {
scanEnumeratedProperty(
unicodeData,
ucdVersion.getFile(UcdFileType.LineBreak),
/*defaultPropertyName=*/ "Line_Break",
"XX");
}
}
void scanGraphemeBreakProperty() throws IOException {
scanEnumeratedProperty(
unicodeData,
ucdVersion.getFile(UcdFileType.GraphemeBreakProperty),
"Grapheme_Cluster_Break",
"Other");
}
void scanSentenceBreakProperty() throws IOException {
scanEnumeratedProperty(
unicodeData,
ucdVersion.getFile(UcdFileType.SentenceBreakProperty),
"Sentence_Break",
"Other");
}
void scanWordBreakProperty() throws IOException {
scanEnumeratedProperty(
unicodeData, ucdVersion.getFile(UcdFileType.WordBreakProperty), "Word_break", "Other");
}
void scanDerivedAge() throws IOException {
File file = ucdVersion.getFile(UcdFileType.DerivedAge);
if (file != null) {
assertFileExists(file);
DerivedAgeScanner scanner =
new DerivedAgeScanner(
Files.newReader(file, StandardCharsets.UTF_8),
unicodeData,
"Age",
unicodeData.maximumCodePoint());
scanner.scan();
}
}
private void scanEmoji() throws IOException {
if (Version.MAJOR_MINOR_COMPARATOR.compare(ucdVersion.version(), Versions.VERSION_8_0) < 0) {
// Versions before 8.0 didn't have Emoji
return;
}
File file =
checkNotNull(
ucdVersion.getFile(UcdFileType.Emoji),
"Expected Emoji for version %s but known files are: %s",
ucdVersion.version(),
ucdVersion.files());
assertFileExists(file);
BinaryPropertiesFileScanner scanner =
new BinaryPropertiesFileScanner(Files.newReader(file, StandardCharsets.UTF_8), unicodeData);
scanner.scan();
}
/** Scans any binary properties file. */
private static void scanBinaryProperties(UnicodeData unicodeData, File file) throws IOException {
if (file != null) {
assertFileExists(file);
BinaryPropertiesFileScanner scanner =
new BinaryPropertiesFileScanner(
Files.newReader(file, StandardCharsets.UTF_8), unicodeData);
scanner.scan();
}
}
/** Scans any enumerated properties file. */
private static void scanEnumeratedProperty(
UnicodeData unicodeData, File file, String defaultPropertyName, String defaultPropertyValue)
throws IOException {
if (file != null) {
assertFileExists(file);
EnumeratedPropertyFileScanner scanner =
new EnumeratedPropertyFileScanner(
Files.newReader(file, StandardCharsets.UTF_8),
unicodeData,
defaultPropertyName,
defaultPropertyValue);
ImmutableSet<String> before =
DEBUG
? ImmutableSet.copyOf(ImmutableList.copyOf(unicodeData.intervals().keySet()))
: ImmutableSet.of();
scanner.scan();
if (DEBUG) {
SetView<String> diff =
Sets.difference(
ImmutableSet.copyOf(ImmutableList.copyOf(unicodeData.intervals().keySet())),
before);
System.out.println(diff);
}
}
}
private static void assertFileExists(File file) throws FileNotFoundException {
if (!file.isFile()) {
throw new FileNotFoundException(file.getAbsolutePath());
}
}
public UcdVersion ucdVersion() {
return ucdVersion;
}
}