public int run()

in src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java [575:718]


  /**
   * Command-line entry point: parses {@code args}, builds a
   * {@link CommonCrawlConfig} from the supplied options, creates the output
   * directory if it does not exist and invokes
   * {@code dump(...)} on a new {@link CommonCrawlDataDumper}.
   *
   * <p>The {@code -outputDir} and {@code -segment} options are mandatory. When
   * either is missing, or when {@code -help} is given, the usage text is
   * printed and the method returns without dumping anything.
   *
   * @param args
   *          raw command-line arguments
   * @return {@code 0} when the dump completed or the usage text was printed,
   *         {@code -1} when any exception was raised while parsing the
   *         arguments or dumping
   * @throws Exception
   *           declared for interface compatibility; exceptions raised inside
   *           the parse/dump section are caught, logged and turned into a
   *           {@code -1} return value
   */
  public int run(String[] args) throws Exception {
    Options options = createCommandLineOptions();

    CommandLineParser parser = new GnuParser();
    try {
      CommandLine line = parser.parse(options, args);

      // Usage is printed both on explicit request and when a mandatory
      // option is absent; the historical return code (0) is kept in both
      // cases so existing callers see no change.
      boolean missingMandatory = !line.hasOption("outputDir")
          || !line.hasOption("segment");
      if (line.hasOption("help") || missingMandatory) {
        HelpFormatter formatter = new HelpFormatter();
        formatter
            .printHelp(CommonCrawlDataDumper.class.getName(), options, true);
        return 0;
      }

      File outputDir = new File(line.getOptionValue("outputDir"));
      File segmentRootDir = new File(line.getOptionValue("segment"));
      String[] mimeTypes = line.getOptionValues("mimetype");
      boolean gzip = line.hasOption("gzip");
      boolean epochFilename = line.hasOption("epochFilename");

      String keyPrefix = line.getOptionValue("keyPrefix", "");
      boolean simpleDateFormat = line.hasOption("SimpleDateFormat");
      boolean jsonArray = line.hasOption("jsonArray");
      boolean reverseKey = line.hasOption("reverseKey");
      String extension = line.getOptionValue("extension", "");
      boolean warc = line.hasOption("warc");

      // The option is typed as Number, so the parsed value is not guaranteed
      // to be a Long (a value written with a decimal point is parsed as a
      // Double). Go through Number.longValue() instead of casting to Long,
      // which would fail with a ClassCastException for such input.
      long warcSize = 0;
      Object parsedWarcSize = line.getParsedOptionValue("warcSize");
      if (parsedWarcSize != null) {
        warcSize = ((Number) parsedWarcSize).longValue();
      }

      String linkdbPath = line.getOptionValue("linkdb");
      File linkdb = linkdbPath == null ? null : new File(linkdbPath);

      CommonCrawlConfig config = new CommonCrawlConfig();
      config.setKeyPrefix(keyPrefix);
      config.setSimpleDateFormat(simpleDateFormat);
      config.setJsonArray(jsonArray);
      config.setReverseKey(reverseKey);
      config.setCompressed(gzip);
      config.setWarcSize(warcSize);
      config.setOutputDir(line.getOptionValue("outputDir"));

      if (!outputDir.exists()) {
        LOG.warn("Output directory: [" + outputDir.getAbsolutePath()
            + "]: does not exist, creating it.");
        if (!outputDir.mkdirs())
          throw new Exception(
              "Unable to create: [" + outputDir.getAbsolutePath() + "]");
      }

      CommonCrawlDataDumper dumper = new CommonCrawlDataDumper(config);

      dumper.dump(outputDir, segmentRootDir, linkdb, gzip, mimeTypes, epochFilename,
          extension, warc);

    } catch (Exception e) {
      // Any failure (bad arguments, unparsable warcSize, directory creation,
      // dump errors) is logged and reported through the return code.
      LOG.error(CommonCrawlDataDumper.class.getName() + ": " + StringUtils
          .stringifyException(e));
      e.printStackTrace();
      return -1;
    }

    return 0;
  }

  /**
   * Builds the set of command-line options understood by
   * {@link #run(String[])}.
   *
   * @return a new {@link Options} instance containing every supported option
   */
  // OptionBuilder is a static builder that is used through chained calls,
  // which triggers the "static-access" warning on every chain.
  @SuppressWarnings("static-access")
  private static Options createCommandLineOptions() {
    Option helpOpt = new Option("h", "help", false, "show this help message.");
    // argument options
    Option outputOpt = OptionBuilder.withArgName("outputDir").hasArg()
        .withDescription(
            "output directory (which will be created) to host the CBOR data.")
        .create("outputDir");
    // WARC format
    Option warcOpt = new Option("warc", "export to a WARC file");

    Option segOpt = OptionBuilder.withArgName("segment").hasArgs()
        .withDescription("the segment or directory containing segments to use").create("segment");
    // mimetype and gzip options
    Option mimeOpt = OptionBuilder.isRequired(false).withArgName("mimetype")
        .hasArgs().withDescription(
            "an optional list of mimetypes to dump, excluding all others. Defaults to all.")
        .create("mimetype");
    Option gzipOpt = OptionBuilder.withArgName("gzip").hasArg(false)
        .withDescription(
            "an optional flag indicating whether to additionally gzip the data.")
        .create("gzip");
    Option keyPrefixOpt = OptionBuilder.withArgName("keyPrefix").hasArg(true)
        .withDescription("an optional prefix for key in the output format.")
        .create("keyPrefix");
    Option simpleDateFormatOpt = OptionBuilder.withArgName("SimpleDateFormat")
        .hasArg(false).withDescription(
            "an optional format for timestamp in GMT epoch milliseconds.")
        .create("SimpleDateFormat");
    Option epochFilenameOpt = OptionBuilder.withArgName("epochFilename")
        .hasArg(false)
        .withDescription("an optional format for output filename.")
        .create("epochFilename");
    Option jsonArrayOpt = OptionBuilder.withArgName("jsonArray").hasArg(false)
        .withDescription("an optional format for JSON output.")
        .create("jsonArray");
    Option reverseKeyOpt = OptionBuilder.withArgName("reverseKey").hasArg(false)
        .withDescription("an optional format for key value in JSON output.")
        .create("reverseKey");
    Option extensionOpt = OptionBuilder.withArgName("extension").hasArg(true)
        .withDescription("an optional file extension for output documents.")
        .create("extension");
    Option sizeOpt = OptionBuilder.withArgName("warcSize").hasArg(true)
        .withType(Number.class)
        .withDescription("an optional file size in bytes for the WARC file(s)")
        .create("warcSize");
    Option linkDbOpt = OptionBuilder.withArgName("linkdb").hasArg(true)
        .withDescription("an optional linkdb parameter to include inlinks in dump files")
        .isRequired(false)
        .create("linkdb");

    // Registration order is kept as it was so the generated usage text
    // does not change.
    Options options = new Options();
    options.addOption(helpOpt);
    options.addOption(outputOpt);
    options.addOption(segOpt);
    options.addOption(warcOpt);
    options.addOption(mimeOpt);
    options.addOption(gzipOpt);
    options.addOption(keyPrefixOpt);
    options.addOption(simpleDateFormatOpt);
    options.addOption(epochFilenameOpt);
    options.addOption(jsonArrayOpt);
    options.addOption(reverseKeyOpt);
    options.addOption(extensionOpt);
    options.addOption(sizeOpt);
    options.addOption(linkDbOpt);
    return options;
  }