public int run()

in community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java [62:260]


  public int run(String[] args) throws Exception {
    /**
     Option seqOpt = obuilder.withLongName("seqFile").withRequired(false).withArgument(
     abuilder.withName("seqFile").withMinimum(1).withMaximum(1).create()).withDescription(
     "The Sequence File containing the Vectors").withShortName("s").create();
     Option dirOpt = obuilder.withLongName("seqDirectory").withRequired(false).withArgument(
     abuilder.withName("seqDirectory").withMinimum(1).withMaximum(1).create())
     .withDescription("The directory containing Sequence File of Vectors")
     .withShortName("d").create();
     */
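    // Standard AbstractJob input/output options plus the dumper-specific flags.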
    addInputOption();
    addOutputOption();
    addOption("useKey", "u", "If the Key is a vector than dump that instead");
    addOption("printKey", "p", "Print out the key as well, delimited by tab (or the value if useKey is true");
    addOption("dictionary", "d", "The dictionary file.", false);
    addOption("dictionaryType", "dt", "The dictionary file type (text|seqfile)", false);
    addOption("csv", "c", "Output the Vector as CSV.  Otherwise it substitutes in the terms for vector cell entries");
    addOption("namesAsComments", "n", "If using CSV output, optionally add a comment line for each NamedVector "
        + "(if the vector is one) printing out the name");
    addOption("nameOnly", "N", "Use the name as the value for each NamedVector (skip other vectors)");
    addOption("sortVectors", "sort", "Sort output key/value pairs of the vector entries in abs magnitude "
        + "descending order");
    addOption("quiet", "q", "Print only file contents");
    addOption("sizeOnly", "sz", "Dump only the size of the vector");
    addOption("numItems", "ni", "Output at most <n> vecors", false);
    addOption("vectorSize", "vs", "Truncate vectors to <vs> length when dumping (most useful when in"
        + " conjunction with -sort", false);
    addOption(buildOption("filter", "fi", "Only dump out those vectors whose name matches the filter."
        + "  Multiple items may be specified by repeating the argument.", true, 1, Integer.MAX_VALUE, false, null));

    if (parseArguments(args, false, true) == null) {
      return -1;
    }

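    // Resolve the input into concrete sequence-file paths: list a directory
    // (skipping _logs and CRC entries), or glob-expand a file pattern.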
    Path[] pathArr;
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    Path input = getInputPath();
    FileStatus fileStatus = fs.getFileStatus(input);
    if (fileStatus.isDir()) {
      pathArr = FileUtil.stat2Paths(fs.listStatus(input, PathFilters.logsCRCFilter()));
    } else {
      FileStatus[] inputPaths = fs.globStatus(input);
      pathArr = new Path[inputPaths.length];
      int i = 0;
      for (FileStatus fstatus : inputPaths) {
        pathArr[i++] = fstatus.getPath();
      }
    }


    String dictionaryType = getOption("dictionaryType", "text");

    boolean sortVectors = hasOption("sortVectors");
    boolean quiet = hasOption("quiet");
    if (!quiet) {
      log.info("Sort? {}", sortVectors);
    }

    String[] dictionary = null;
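    // Optionally load a term dictionary so vector indices can be rendered as terms.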
    if (hasOption("dictionary")) {
      String dictFile = getOption("dictionary");
      switch (dictionaryType) {
        case "text":
          dictionary = VectorHelper.loadTermDictionary(new File(dictFile));
          break;
        case "sequencefile":
          dictionary = VectorHelper.loadTermDictionary(conf, dictFile);
          break;
        default:
          //TODO: support Lucene's FST as a dictionary type
          throw new IOException("Invalid dictionary type: " + dictionaryType);
      }
    }

    Set<String> filters;
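    // When filters are given, NamedVectors are dumped only if their name is in the set;
    // vectors without a name always pass.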
    if (hasOption("filter")) {
      filters = Sets.newHashSet(getOptions("filter"));
    } else {
      filters = null;
    }

    boolean useCSV = hasOption("csv");

    boolean sizeOnly = hasOption("sizeOnly");
    boolean nameOnly = hasOption("nameOnly");
    boolean namesAsComments = hasOption("namesAsComments");
    // "useKey" swaps the roles of key and value: the key is treated as the vector.
    boolean transposeKeyValue = hasOption("useKey");
    Writer writer;
    boolean shouldClose;
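    // Dump to the requested output file, or to stdout when none was given; only close
    // the writer if it was opened here.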
    File output = getOutputFile();
    if (output != null) {
      shouldClose = true;
      log.info("Output file: {}", output);
      Files.createParentDirs(output);
      writer = Files.newWriter(output, Charsets.UTF_8);
    } else {
      shouldClose = false;
      writer = new OutputStreamWriter(System.out, Charsets.UTF_8);
    }
    try {
      boolean printKey = hasOption("printKey");
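      // For CSV output with a dictionary, emit the terms as a commented header row.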
      if (useCSV && dictionary != null) {
        writer.write("#");
        for (int j = 0; j < dictionary.length; j++) {
          writer.write(dictionary[j]);
          if (j < dictionary.length - 1) {
            writer.write(',');
          }
        }
        writer.write('\n');
      }
      Long numItems = null;
      if (hasOption("numItems")) {
        numItems = Long.parseLong(getOption("numItems"));
        if (!quiet) {
          writer.append("#Max Items to dump: ").append(String.valueOf(numItems)).append('\n');
        }
      }
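      // vectorSize limits how many entries of each vector are printed (applied in the JSON path below).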
      int maxIndexesPerVector = hasOption("vectorSize")
          ? Integer.parseInt(getOption("vectorSize"))
          : Integer.MAX_VALUE;
      long itemCount = 0;
      int fileCount = 0;
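      // Walk each sequence file, stopping early once the optional numItems limit is reached.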
      for (Path path : pathArr) {
        if (numItems != null && numItems <= itemCount) {
          break;
        }
        if (!quiet) {
          log.info("Processing file '{}' ({}/{})", path, ++fileCount, pathArr.length);
        }
        SequenceFileIterable<Writable, Writable> iterable = new SequenceFileIterable<>(path, true, conf);
        Iterator<Pair<Writable, Writable>> iterator = iterable.iterator();
        long i = 0;
        while (iterator.hasNext() && (numItems == null || itemCount < numItems)) {
          Pair<Writable, Writable> record = iterator.next();
          Writable keyWritable = record.getFirst();
          Writable valueWritable = record.getSecond();
          if (printKey) {
            Writable notTheVectorWritable = transposeKeyValue ? valueWritable : keyWritable;
            writer.write(notTheVectorWritable.toString());
            writer.write('\t');
          }
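          // The vector side of the record is normally a VectorWritable; fall back to
          // WeightedPropertyVectorWritable when the direct cast fails.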
          Vector vector;
          try {
            vector = ((VectorWritable)
                (transposeKeyValue ? keyWritable : valueWritable)).get();
          } catch (ClassCastException e) {
            if ((transposeKeyValue ? keyWritable : valueWritable)
                instanceof WeightedPropertyVectorWritable) {
              vector =
                  ((WeightedPropertyVectorWritable)
                      (transposeKeyValue ? keyWritable : valueWritable)).getVector();
            } else {
              throw e;
            }
          }
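          // Apply the optional name filter, then dump the size, the name, a CSV row,
          // or a JSON representation of the vector.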
          if (filters == null
              || !(vector instanceof NamedVector)
              || filters.contains(((NamedVector) vector).getName())) {
            if (sizeOnly) {
              if (vector instanceof NamedVector) {
                writer.write(((NamedVector) vector).getName());
                writer.write(":");
              } else {
                writer.write(String.valueOf(i++));
                writer.write(":");
              }
              writer.write(String.valueOf(vector.size()));
              writer.write('\n');
            } else if (nameOnly) {
              if (vector instanceof NamedVector) {
                writer.write(((NamedVector) vector).getName());
                writer.write('\n');
              }
            } else {
              String fmtStr;
              if (useCSV) {
                fmtStr = VectorHelper.vectorToCSVString(vector, namesAsComments);
              } else {
                fmtStr = VectorHelper.vectorToJson(vector, dictionary, maxIndexesPerVector,
                    sortVectors);
              }
              writer.write(fmtStr);
              writer.write('\n');
            }
            itemCount++;
          }
        }
      }
      writer.flush();
    } finally {
      if (shouldClose) {
        Closeables.close(writer, false);
      }
    }

    return 0;
  }