public static boolean createDictionary()

in DictionaryAnnotator/src/main/java/org/apache/uima/annotator/dict_annot/dictionary/impl/DictionaryCreator.java [208:371]


   /**
    * Creates a dictionary XML file from a plain-text input file that contains
    * one dictionary entry per line.
    * <p>
    * Every input line is split into tokens, either by running the tokenizer
    * installed from the PEAR file <code>tokenizerFile</code> or, when no
    * tokenizer is given, by splitting the line with a
    * <code>StringTokenizer</code> that uses <code>separatorChar</code> as its
    * delimiter set. The tokens are joined with the output separator, passed
    * through <code>replaceXMLEntities</code> and written as one
    * <code>&lt;entry&gt;</code> element. The output file is always written in
    * UTF-8.
    * <p>
    * The input reader, the output writer, the analysis engine and the
    * temporary PEAR installation directory are released on every exit path.
    * This cleanup is best-effort: a failure during cleanup is ignored so that
    * it can neither mask an earlier error nor fail a run whose dictionary was
    * already written completely.
    * 
    * @param inputFile
    *           path of the input file; must be readable
    * @param encoding
    *           name of the character encoding used to read the input file
    * @param outputFile
    *           path of the dictionary XML file to write
    * @param language
    *           optional language; if not <code>null</code> it is written as
    *           <code>&lt;languageId&gt;</code> and set as document language
    *           for the tokenizer
    * @param tokenizerFile
    *           optional path of a tokenizer PEAR file; if <code>null</code>
    *           the separator based tokenization is used
    * @param tokenTypeStr
    *           name of the token annotation type produced by the tokenizer;
    *           required when <code>tokenizerFile</code> is given
    * @param separatorChar
    *           optional separator; if <code>null</code>
    *           <code>SEPARATOR_CHAR</code> is used to split the input and
    *           <code>OUTPUT_SEPARATOR_CHAR</code> to join the tokens,
    *           otherwise the given value is used for both
    * 
    * @return always <code>true</code>; all failures are reported by exception
    * 
    * @throws Exception
    *            if the input file or the tokenizer file cannot be read, if
    *            the token type is missing or unknown to the tokenizer, if the
    *            tokenizer cannot be created, or if reading, tokenizing or
    *            writing fails
    */
   public static boolean createDictionary(String inputFile, String encoding,
         String outputFile, String language, String tokenizerFile,
         String tokenTypeStr, String separatorChar) throws Exception {

      String outputSeparatorChar = OUTPUT_SEPARATOR_CHAR;

      // check input file command line argument
      File inFile = new File(inputFile);
      if (!inFile.canRead()) {
         throw new Exception("Error: Input file " + inputFile
               + " cannot be read!");
      }

      // resources that have to be released in the finally block below
      AnalysisEngine ae = null;
      Type tokenType = null;
      CAS cas = null;
      File tempDir = null;
      BufferedReader reader = null;
      BufferedWriter writer = null;

      try {
         // check tokenizer pear command line arguments
         if (tokenizerFile != null) {
            // if a tokenizer is specified, check if the file can be read
            File pearFile = new File(tokenizerFile);
            if (!pearFile.canRead()) {
               throw new Exception("Error: Tokenizer file " + tokenizerFile
                     + " cannot be read!");
            }
            if (tokenTypeStr == null) {
               throw new Exception("Error: Tokenizer tokenType not specified");
            }
            try {
               // create temp directory to install PEAR
               tempDir = new File(System.getProperty("java.io.tmpdir"),
                     "~tokenizer_temp_install");
               tempDir.deleteOnExit();
               tempDir.mkdir();

               // Install PEAR package
               PackageBrowser instPear = PackageInstaller.installPackage(
                     tempDir, pearFile, true);

               // Create analysis engine from the installed PEAR package
               XMLInputSource in = new XMLInputSource(instPear
                     .getComponentPearDescPath());
               ResourceSpecifier specifier = UIMAFramework.getXMLParser()
                     .parseResourceSpecifier(in);
               ae = UIMAFramework.produceAnalysisEngine(specifier);

               // create CAS and initialize tokenType
               cas = ae.newCAS();
               tokenType = cas.getTypeSystem().getType(tokenTypeStr);

            } catch (Exception ex) {
               throw new Exception("Error creating tokenizer: "
                     + ex.getMessage(), ex);
            }

            // getType() returns null for an unknown type name; fail here
            // with a clear message instead of later at the annotation index
            if (tokenType == null) {
               throw new Exception("Error: Tokenizer tokenType "
                     + tokenTypeStr
                     + " is not defined in the tokenizer type system");
            }
         }

         // check separator char command line argument
         if (separatorChar == null) {
            // use default separator character
            separatorChar = SEPARATOR_CHAR;
         } else {
            // if set, use specified separator char also as output separator
            outputSeparatorChar = separatorChar;
         }

         // initialize input and output files
         reader = new BufferedReader(new InputStreamReader(
               new FileInputStream(inputFile), encoding));
         writer = new BufferedWriter(new OutputStreamWriter(
               new FileOutputStream(outputFile), "UTF-8"));

         // write dictionary XML lead in
         writer.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
         writer
               .write("<dictionary xmlns=\"http://incubator.apache.org/uima\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:schemaLocation=\"dictionary.xsd\">\n");
         writer.write("<typeCollection>\n");
         writer
               .write("<dictionaryMetaData caseNormalization=\"true\" multiWordEntries=\"true\" multiWordSeparator=\""
                     + outputSeparatorChar + "\"/>\n");
         if (language != null) {
            writer.write("<languageId>" + language + "</languageId>\n");
         }
         writer.write("<typeDescription>\n");
         writer
               .write("<typeName> ADD DICTIONARY OUTPUT TYPE HERE</typeName>\n");
         writer.write("</typeDescription>\n");
         writer.write("<entries>\n");

         // start adding dictionary entries
         String line = reader.readLine();
         while (line != null) {

            // multi word string buffer - contains the tokens for each entry
            StringBuffer multiWordString = new StringBuffer();

            // tokenize entry line
            if (ae != null) { // use tokenizer
               cas.setDocumentText(line);
               if (language != null) {
                  cas.setDocumentLanguage(language);
               }
               // tokenize line
               ae.process(cas);

               // read results
               FSIterator it = cas.getAnnotationIndex(tokenType).iterator();
               while (it.hasNext()) {
                  multiWordString.append(((AnnotationFS) it.next())
                        .getCoveredText());
                  multiWordString.append(outputSeparatorChar);
               }
               // make the CAS reusable for the next line
               cas.reset();

            } else { // use separator char

               StringTokenizer tokenizer = new StringTokenizer(line,
                     separatorChar);
               while (tokenizer.hasMoreTokens()) {
                  multiWordString.append(tokenizer.nextToken());
                  multiWordString.append(outputSeparatorChar);
               }
            }

            // trim string and remove separator char at the end
            String multiWordTokenString = multiWordString.toString().trim();
            if (multiWordTokenString.endsWith(outputSeparatorChar)) {
               int separatorLength = outputSeparatorChar.length();
               int length = multiWordTokenString.length();
               multiWordTokenString = multiWordTokenString.substring(0,
                     length - separatorLength);
            }

            // replace XML entities
            multiWordTokenString = replaceXMLEntities(multiWordTokenString);

            // write dictionary entry to XML
            writer.write("<entry>\n");
            writer.write("<key>" + multiWordTokenString + "</key>\n");
            writer.write("</entry>\n");

            // get next line
            line = reader.readLine();
         } // all dictionary lines are processed
         reader.close();

         // write dictionary XML lead out
         writer.write("</entries>\n");
         writer.write("</typeCollection>\n");
         writer.write("</dictionary>\n");

         // close explicitly on the success path so that a failing final
         // flush is reported to the caller instead of being ignored below
         writer.close();

      } finally {
         // Best-effort cleanup. Errors are ignored on purpose so they cannot
         // mask an exception thrown above; closing a stream that was already
         // closed on the success path has no effect.
         if (reader != null) {
            try {
               reader.close();
            } catch (Exception ignored) {
               // nothing useful can be done if closing the input fails
            }
         }
         if (writer != null) {
            try {
               writer.close();
            } catch (Exception ignored) {
               // on the success path the writer was already closed above
            }
         }

         // release the analysis engine before deleting its installation
         if (ae != null) {
            try {
               ae.destroy();
            } catch (Exception ignored) {
               // a failing destroy must not hide the primary result
            }
         }

         // try to delete PEAR temp dir
         if (tempDir != null) {
            try {
               FileUtils.deleteRecursive(tempDir);
               // files that could not be deleted now are scheduled for
               // deletion when the JVM exits
               List files = FileUtils.getFiles(tempDir, true);
               if (files != null) {
                  for (int i = 0; i < files.size(); i++) {
                     ((File) files.get(i)).deleteOnExit();
                  }
               }
            } catch (Exception ignored) {
               // leftover temp files are not worth failing the run for
            }
         }
      }

      return true;
   }