in DictionaryAnnotator/src/main/java/org/apache/uima/annotator/dict_annot/dictionary/impl/DictionaryCreator.java [208:371]
/**
 * Creates an XML dictionary file from a plain-text word list.
 *
 * Each line of the input file becomes one dictionary entry. Entries are
 * tokenized either with the tokenizer contained in the given PEAR package
 * or, when no tokenizer is specified, by splitting on the separator
 * character. The resulting dictionary XML is always written as UTF-8.
 *
 * @param inputFile     plain-text input file, one dictionary entry per line
 * @param encoding      character encoding of the input file
 * @param outputFile    dictionary XML output file (written in UTF-8)
 * @param language      optional language id written to the dictionary and
 *                      used as the CAS document language; may be null
 * @param tokenizerFile optional tokenizer PEAR package; may be null
 * @param tokenTypeStr  fully qualified name of the token annotation type
 *                      produced by the tokenizer; required when
 *                      tokenizerFile is set
 * @param separatorChar optional token separator; when set it is also used
 *                      as the multi-word separator in the output, otherwise
 *                      the defaults SEPARATOR_CHAR / OUTPUT_SEPARATOR_CHAR
 *                      apply
 * @return true when the dictionary was created successfully
 * @throws Exception if the input, output, or tokenizer files cannot be
 *                   accessed, or if the tokenizer cannot be created
 */
public static boolean createDictionary(String inputFile, String encoding,
      String outputFile, String language, String tokenizerFile,
      String tokenTypeStr, String separatorChar) throws Exception {
   String outputSeparatorChar = OUTPUT_SEPARATOR_CHAR;

   // check input file command line argument
   File inFile = new File(inputFile);
   if (!inFile.canRead()) {
      throw new Exception("Error: Input file " + inputFile
            + " cannot be read!");
   }

   // check tokenizer pear command line arguments and, if given, build the
   // tokenizer analysis engine from the PEAR package
   AnalysisEngine ae = null;
   Type tokenType = null;
   CAS cas = null;
   File tempDir = null;
   if (tokenizerFile != null) {
      // if a tokenizer is specified, check if the file can be read
      File pearFile = new File(tokenizerFile);
      if (!pearFile.canRead()) {
         throw new Exception("Error: Tokenizer file " + tokenizerFile
               + " cannot be read!");
      }
      if (tokenTypeStr == null) {
         throw new Exception("Error: Tokenizer tokenType not specified");
      }
      try {
         // create temp directory to install the PEAR package; register it
         // for deletion on exit as a fallback cleanup
         tempDir = new File(System.getProperty("java.io.tmpdir"),
               "~tokenizer_temp_install");
         tempDir.mkdir();
         tempDir.deleteOnExit();
         // Install PEAR package
         PackageBrowser instPear = PackageInstaller.installPackage(tempDir,
               pearFile, true);
         // Create analysis engine from the installed PEAR descriptor
         XMLInputSource in = new XMLInputSource(instPear
               .getComponentPearDescPath());
         ResourceSpecifier specifier = UIMAFramework.getXMLParser()
               .parseResourceSpecifier(in);
         ae = UIMAFramework.produceAnalysisEngine(specifier);
         // create a CAS and resolve the token type in its type system
         cas = ae.newCAS();
         tokenType = cas.getTypeSystem().getType(tokenTypeStr);
      } catch (Exception ex) {
         // wrap with context but preserve the original cause
         throw new Exception("Error creating tokenizer: " + ex.getMessage(),
               ex);
      }
   }

   // separator char: use the default, or the user-specified one (which is
   // then also used as the output separator)
   if (separatorChar == null) {
      separatorChar = SEPARATOR_CHAR;
   } else {
      outputSeparatorChar = separatorChar;
   }

   // initialize input and output files; closed in the finally block below
   // (BUGFIX: previously the streams leaked when processing threw)
   BufferedReader reader = null;
   BufferedWriter writer = null;
   try {
      reader = new BufferedReader(new InputStreamReader(new FileInputStream(
            inputFile), encoding));
      writer = new BufferedWriter(new OutputStreamWriter(
            new FileOutputStream(outputFile), "UTF-8"));

      // write dictionary XML lead in
      writer.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
      writer
            .write("<dictionary xmlns=\"http://incubator.apache.org/uima\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:schemaLocation=\"dictionary.xsd\">\n");
      writer.write("<typeCollection>\n");
      writer
            .write("<dictionaryMetaData caseNormalization=\"true\" multiWordEntries=\"true\" multiWordSeparator=\""
                  + outputSeparatorChar + "\"/>\n");
      if (language != null) {
         writer.write("<languageId>" + language + "</languageId>\n");
      }
      writer.write("<typeDescription>\n");
      writer.write("<typeName> ADD DICTIONARY OUTPUT TYPE HERE</typeName>\n");
      writer.write("</typeDescription>\n");
      writer.write("<entries>\n");

      // start adding dictionary entries, one per input line
      String line = reader.readLine();
      while (line != null) {
         // multi-word string buffer - contains the tokens for each entry
         StringBuffer multiWordString = new StringBuffer();
         if (ae != null) { // use tokenizer
            cas.setDocumentText(line);
            if (language != null) {
               cas.setDocumentLanguage(language);
            }
            // tokenize line
            ae.process(cas);
            // collect the covered text of each token annotation
            FSIterator it = cas.getAnnotationIndex(tokenType).iterator();
            while (it.hasNext()) {
               multiWordString.append(((AnnotationFS) it.next())
                     .getCoveredText());
               multiWordString.append(outputSeparatorChar);
            }
            cas.reset();
         } else { // use separator char
            StringTokenizer tokenizer = new StringTokenizer(line,
                  separatorChar);
            while (tokenizer.hasMoreTokens()) {
               multiWordString.append(tokenizer.nextToken());
               multiWordString.append(outputSeparatorChar);
            }
         }
         // trim string and remove the trailing separator char, if any
         String multiWordTokenString = multiWordString.toString().trim();
         if (multiWordTokenString.endsWith(outputSeparatorChar)) {
            multiWordTokenString = multiWordTokenString.substring(0,
                  multiWordTokenString.length() - outputSeparatorChar.length());
         }
         // replace XML entities so the key is well-formed XML text
         multiWordTokenString = replaceXMLEntities(multiWordTokenString);
         // write dictionary entry to XML
         writer.write("<entry>\n");
         writer.write("<key>" + multiWordTokenString + "</key>\n");
         writer.write("</entry>\n");
         // get next line
         line = reader.readLine();
      } // all dictionary lines are processed

      // write dictionary XML lead out
      writer.write("</entries>\n");
      writer.write("</typeCollection>\n");
      writer.write("</dictionary>\n");
   } finally {
      // release all resources even when processing failed
      if (reader != null) {
         try {
            reader.close();
         } catch (Exception ignored) {
            // don't mask an exception from the try block or writer.close()
         }
      }
      if (writer != null) {
         writer.close();
      }
      if (ae != null) {
         // BUGFIX: destroy the tokenizer engine (previously leaked)
         ae.destroy();
      }
      // try to delete the PEAR temp dir; anything deleteRecursive leaves
      // behind is registered for deletion on JVM exit
      if (tempDir != null) {
         FileUtils.deleteRecursive(tempDir);
         List files = FileUtils.getFiles(tempDir, true);
         if (files != null) {
            for (int i = 0; i < files.size(); i++) {
               ((File) files.get(i)).deleteOnExit();
            }
         }
      }
   }
   return true;
}