public static void main()

in community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java [146:284]


  public static void main(String[] args) throws IOException {

    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option inputOpt = obuilder.withLongName("dir").withRequired(true).withArgument(
        abuilder.withName("dir").withMinimum(1).withMaximum(1).create())
        .withDescription("The Lucene directory").withShortName("d").create();

    Option outputOpt = obuilder.withLongName("output").withRequired(true).withArgument(
        abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription("The output file")
        .withShortName("o").create();

    Option fieldOpt = obuilder.withLongName("field").withRequired(true).withArgument(
        abuilder.withName("field").withMinimum(1).withMaximum(1).create()).withDescription(
        "The field in the index").withShortName("f").create();

    Option idFieldOpt = obuilder.withLongName("idField").withRequired(false).withArgument(
        abuilder.withName("idField").withMinimum(1).withMaximum(1).create()).withDescription(
        "The field in the index containing the index.  If null, then the Lucene internal doc "
            + "id is used which is prone to error if the underlying index changes").create();

    Option dictOutOpt = obuilder.withLongName("dictOut").withRequired(true).withArgument(
        abuilder.withName("dictOut").withMinimum(1).withMaximum(1).create()).withDescription(
        "The output of the dictionary").withShortName("t").create();

    Option seqDictOutOpt = obuilder.withLongName("seqDictOut").withRequired(false).withArgument(
        abuilder.withName("seqDictOut").withMinimum(1).withMaximum(1).create()).withDescription(
        "The output of the dictionary as sequence file").withShortName("st").create();

    Option weightOpt = obuilder.withLongName("weight").withRequired(false).withArgument(
        abuilder.withName("weight").withMinimum(1).withMaximum(1).create()).withDescription(
        "The kind of weight to use. Currently TF or TFIDF").withShortName("w").create();

    Option delimiterOpt = obuilder.withLongName("delimiter").withRequired(false).withArgument(
        abuilder.withName("delimiter").withMinimum(1).withMaximum(1).create()).withDescription(
        "The delimiter for outputting the dictionary").withShortName("l").create();

    Option powerOpt = obuilder.withLongName("norm").withRequired(false).withArgument(
        abuilder.withName("norm").withMinimum(1).withMaximum(1).create()).withDescription(
        "The norm to use, expressed as either a double or \"INF\" if you want to use the Infinite norm.  "
            + "Must be greater or equal to 0.  The default is not to normalize").withShortName("n").create();

    Option maxOpt = obuilder.withLongName("max").withRequired(false).withArgument(
        abuilder.withName("max").withMinimum(1).withMaximum(1).create()).withDescription(
        "The maximum number of vectors to output.  If not specified, then it will loop over all docs")
        .withShortName("m").create();

    Option minDFOpt = obuilder.withLongName("minDF").withRequired(false).withArgument(
        abuilder.withName("minDF").withMinimum(1).withMaximum(1).create()).withDescription(
        "The minimum document frequency.  Default is 1").withShortName("md").create();

    Option maxDFPercentOpt = obuilder.withLongName("maxDFPercent").withRequired(false).withArgument(
        abuilder.withName("maxDFPercent").withMinimum(1).withMaximum(1).create()).withDescription(
        "The max percentage of docs for the DF.  Can be used to remove really high frequency terms."
            + "  Expressed as an integer between 0 and 100. Default is 99.").withShortName("x").create();

    Option maxPercentErrorDocsOpt = obuilder.withLongName("maxPercentErrorDocs").withRequired(false).withArgument(
        abuilder.withName("maxPercentErrorDocs").withMinimum(1).withMaximum(1).create()).withDescription(
        "The max percentage of docs that can have a null term vector. These are noise document and can occur if the "
            + "analyzer used strips out all terms in the target field. This percentage is expressed as a value "
            + "between 0 and 1. The default is 0.").withShortName("err").create();

    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
        .create();

    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(idFieldOpt).withOption(
        outputOpt).withOption(delimiterOpt).withOption(helpOpt).withOption(fieldOpt).withOption(maxOpt)
        .withOption(dictOutOpt).withOption(seqDictOutOpt).withOption(powerOpt).withOption(maxDFPercentOpt)
        .withOption(weightOpt).withOption(minDFOpt).withOption(maxPercentErrorDocsOpt).create();

    try {
      Parser parser = new Parser();
      parser.setGroup(group);
      CommandLine cmdLine = parser.parse(args);

      if (cmdLine.hasOption(helpOpt)) {

        CommandLineUtil.printHelp(group);
        return;
      }

      if (cmdLine.hasOption(inputOpt)) { // Lucene case
        Driver luceneDriver = new Driver();
        luceneDriver.setLuceneDir(cmdLine.getValue(inputOpt).toString());

        if (cmdLine.hasOption(maxOpt)) {
          luceneDriver.setMaxDocs(Long.parseLong(cmdLine.getValue(maxOpt).toString()));
        }

        if (cmdLine.hasOption(weightOpt)) {
          luceneDriver.setWeightType(cmdLine.getValue(weightOpt).toString());
        }

        luceneDriver.setField(cmdLine.getValue(fieldOpt).toString());

        if (cmdLine.hasOption(minDFOpt)) {
          luceneDriver.setMinDf(Integer.parseInt(cmdLine.getValue(minDFOpt).toString()));
        }

        if (cmdLine.hasOption(maxDFPercentOpt)) {
          luceneDriver.setMaxDFPercent(Integer.parseInt(cmdLine.getValue(maxDFPercentOpt).toString()));
        }

        if (cmdLine.hasOption(powerOpt)) {
          String power = cmdLine.getValue(powerOpt).toString();
          if ("INF".equals(power)) {
            luceneDriver.setNorm(Double.POSITIVE_INFINITY);
          } else {
            luceneDriver.setNorm(Double.parseDouble(power));
          }
        }

        if (cmdLine.hasOption(idFieldOpt)) {
          luceneDriver.setIdField(cmdLine.getValue(idFieldOpt).toString());
        }

        if (cmdLine.hasOption(maxPercentErrorDocsOpt)) {
          luceneDriver.setMaxPercentErrorDocs(Double.parseDouble(cmdLine.getValue(maxPercentErrorDocsOpt).toString()));
        }

        luceneDriver.setOutFile(cmdLine.getValue(outputOpt).toString());

        luceneDriver.setDelimiter(cmdLine.hasOption(delimiterOpt) ? cmdLine.getValue(delimiterOpt).toString() : "\t");

        luceneDriver.setDictOut(cmdLine.getValue(dictOutOpt).toString());

        if (cmdLine.hasOption(seqDictOutOpt)) {
          luceneDriver.setSeqDictOut(cmdLine.getValue(seqDictOutOpt).toString());
        }

        luceneDriver.dumpVectors();
      }
    } catch (OptionException e) {
      log.error("Exception", e);
      CommandLineUtil.printHelp(group);
    }
  }