java/de/jflex/ucd_generator/scanner/UcdScanner.java (243 lines of code) (raw):

/* * Copyright (C) 2020 Google, LLC. * SPDX-License-Identifier: BSD-3-Clause */ package de.jflex.ucd_generator.scanner; import static com.google.common.base.Preconditions.checkNotNull; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Sets; import com.google.common.collect.Sets.SetView; import com.google.common.io.Files; import de.jflex.ucd.UcdFileType; import de.jflex.ucd.UcdVersion; import de.jflex.ucd.Versions; import de.jflex.ucd_generator.ucd.UnicodeData; import de.jflex.version.Version; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.nio.charset.StandardCharsets; public class UcdScanner { private static final boolean DEBUG = false; private final UcdVersion ucdVersion; final UnicodeData unicodeData; public UcdScanner(UcdVersion ucdVersion) { this.ucdVersion = ucdVersion; this.unicodeData = new UnicodeData(ucdVersion.version()); } /** Scans all UCD data files. */ public UnicodeData scan() throws UcdScannerException { try { scanPropertyAliases(); scanPropertyValueAliases(); cloneScriptsToScriptExtensions(); scanUnicodeData(); scanPropList(); scanDerivedCoreProperties(); scanScripts(); scanScriptExtensions(); scanBlocks(); scanLineBreak(); scanGraphemeBreakProperty(); scanSentenceBreakProperty(); scanWordBreakProperty(); scanDerivedAge(); scanEmoji(); unicodeData.addCompatibilityProperties(); unicodeData.hackUnicode_2_0(); return unicodeData; } catch (Throwable thr) { String cause = (thr.getMessage() != null) ? thr.getMessage() : "Unknown error"; throw new UcdScannerException( "Failed to emit Unicode properties for version " + ucdVersion.version() + " : " + cause, thr); } } void scanPropertyAliases() throws IOException { File file = ucdVersion.getFile(UcdFileType.PropertyAliases); if (file != null) { assertFileExists(file); PropertyAliasesScanner scanner = new PropertyAliasesScanner(Files.newReader(file, StandardCharsets.UTF_8), unicodeData); scanner.scan(); } } void scanPropertyValueAliases() throws IOException { File file = ucdVersion.getFile(UcdFileType.PropertyValueAliases); if (file != null) { assertFileExists(file); PropertyValueAliasesScanner scanner = new PropertyValueAliasesScanner( Files.newReader(file, StandardCharsets.UTF_8), unicodeData); scanner.scan(); } } private void cloneScriptsToScriptExtensions() { // Clone Script/sc property value aliases => Script_Extensions/scx String scPropName = unicodeData.getCanonicalPropertyName("Script"); String scxPropName = unicodeData.getCanonicalPropertyName("Script_Extensions"); unicodeData.copyPropertyValueAliases(scPropName, scxPropName); } void scanUnicodeData() throws IOException { File file = ucdVersion.getFile(UcdFileType.UnicodeData); checkNotNull(file, "UnicodeData.txt not defined in UCD %s", ucdVersion); assertFileExists(file); UnicodeDataScanner scanner = new UnicodeDataScanner( Files.newReader(file, StandardCharsets.UTF_8), ucdVersion, unicodeData); scanner.scan(); } void scanPropList() throws IOException { if (Version.MAJOR_MINOR_COMPARATOR.compare(ucdVersion.version(), Versions.VERSION_3_1) < 0) { File file = ucdVersion.getFile(UcdFileType.PropList); if (file != null) { ArchaicPropListScanner scanner = new ArchaicPropListScanner(Files.newReader(file, StandardCharsets.UTF_8), unicodeData); scanner.scan(); } } else { scanBinaryProperties(unicodeData, ucdVersion.getFile(UcdFileType.PropList)); } } void scanDerivedCoreProperties() throws IOException { scanBinaryProperties(unicodeData, ucdVersion.getFile(UcdFileType.DerivedCoreProperties)); } void scanScripts() throws IOException { scanEnumeratedProperty( unicodeData, ucdVersion.getFile(UcdFileType.Scripts), /*defaultPropertyName=*/ "Script", getDefaultScriptValue()); } /** * Returns the default Script property value. * * <ul> * <li>From Unicode 5.0 onward, the default Script property value is "Unknown". * <li>Prior to Unicode 5.0, the default Script property value is "Common". * <li>Prior to Unicode 3.1, Scripts(-X.X.X).txt did not exist. * </ul> */ private String getDefaultScriptValue() { if (Version.MAJOR_MINOR_COMPARATOR.compare(ucdVersion.version(), Versions.VERSION_5_0) < 0) { return "Common"; } return "Unknown"; } void scanScriptExtensions() throws IOException { File file = ucdVersion.getFile(UcdFileType.ScriptExtensions); if (file != null) { assertFileExists(file); ScriptExtensionsScanner scanner = new ScriptExtensionsScanner(Files.newReader(file, StandardCharsets.UTF_8), unicodeData); scanner.scan(); } } void scanBlocks() throws IOException { if (Version.MAJOR_MINOR_COMPARATOR.compare(ucdVersion.version(), Versions.VERSION_3_1) < 0) { File file = ucdVersion.getFile(UcdFileType.Blocks); if (file != null) { ArchaicBlocksScanner scanner = new ArchaicBlocksScanner(Files.newReader(file, StandardCharsets.UTF_8), unicodeData); scanner.scan(); } } else { scanEnumeratedProperty( unicodeData, ucdVersion.getFile(UcdFileType.Blocks), /*defaultPropertyName=*/ "Block", "No_Block"); } } void scanLineBreak() throws IOException { if (Version.MAJOR_MINOR_COMPARATOR.compare(ucdVersion.version(), Versions.VERSION_3_1) < 0) { File file = ucdVersion.getFile(UcdFileType.LineBreak); if (file != null) { assertFileExists(file); ArchaicLineBreakScanner scanner = new ArchaicLineBreakScanner(Files.newReader(file, StandardCharsets.UTF_8), unicodeData); scanner.scan(); } } else { scanEnumeratedProperty( unicodeData, ucdVersion.getFile(UcdFileType.LineBreak), /*defaultPropertyName=*/ "Line_Break", "XX"); } } void scanGraphemeBreakProperty() throws IOException { scanEnumeratedProperty( unicodeData, ucdVersion.getFile(UcdFileType.GraphemeBreakProperty), "Grapheme_Cluster_Break", "Other"); } void scanSentenceBreakProperty() throws IOException { scanEnumeratedProperty( unicodeData, ucdVersion.getFile(UcdFileType.SentenceBreakProperty), "Sentence_Break", "Other"); } void scanWordBreakProperty() throws IOException { scanEnumeratedProperty( unicodeData, ucdVersion.getFile(UcdFileType.WordBreakProperty), "Word_break", "Other"); } void scanDerivedAge() throws IOException { File file = ucdVersion.getFile(UcdFileType.DerivedAge); if (file != null) { assertFileExists(file); DerivedAgeScanner scanner = new DerivedAgeScanner( Files.newReader(file, StandardCharsets.UTF_8), unicodeData, "Age", unicodeData.maximumCodePoint()); scanner.scan(); } } private void scanEmoji() throws IOException { if (Version.MAJOR_MINOR_COMPARATOR.compare(ucdVersion.version(), Versions.VERSION_8_0) < 0) { // Versions before 8.0 didn't have Emoji return; } File file = checkNotNull( ucdVersion.getFile(UcdFileType.Emoji), "Expected Emoji for version %s but known files are: %s", ucdVersion.version(), ucdVersion.files()); assertFileExists(file); BinaryPropertiesFileScanner scanner = new BinaryPropertiesFileScanner(Files.newReader(file, StandardCharsets.UTF_8), unicodeData); scanner.scan(); } /** Scans any binary properties file. */ private static void scanBinaryProperties(UnicodeData unicodeData, File file) throws IOException { if (file != null) { assertFileExists(file); BinaryPropertiesFileScanner scanner = new BinaryPropertiesFileScanner( Files.newReader(file, StandardCharsets.UTF_8), unicodeData); scanner.scan(); } } /** Scans any enumerated properties file. */ private static void scanEnumeratedProperty( UnicodeData unicodeData, File file, String defaultPropertyName, String defaultPropertyValue) throws IOException { if (file != null) { assertFileExists(file); EnumeratedPropertyFileScanner scanner = new EnumeratedPropertyFileScanner( Files.newReader(file, StandardCharsets.UTF_8), unicodeData, defaultPropertyName, defaultPropertyValue); ImmutableSet<String> before = DEBUG ? ImmutableSet.copyOf(ImmutableList.copyOf(unicodeData.intervals().keySet())) : ImmutableSet.of(); scanner.scan(); if (DEBUG) { SetView<String> diff = Sets.difference( ImmutableSet.copyOf(ImmutableList.copyOf(unicodeData.intervals().keySet())), before); System.out.println(diff); } } } private static void assertFileExists(File file) throws FileNotFoundException { if (!file.isFile()) { throw new FileNotFoundException(file.getAbsolutePath()); } } public UcdVersion ucdVersion() { return ucdVersion; } }