public int run()

in src/java/org/apache/nutch/util/CrawlCompletionStats.java [69:188]


  /**
   * Parses the command line, then configures and runs the crawl completion
   * statistics MapReduce job and waits for it to finish.
   *
   * <p>Recognized options:
   * <ul>
   * <li>{@code -inputDirs} (required): comma separated list of crawldb
   * directories; the {@code current} sub-directory of each one is added as a
   * job input path</li>
   * <li>{@code -outputDir} (required): directory the results are written to</li>
   * <li>{@code -mode} (required): either {@code host} or {@code domain}</li>
   * <li>{@code -numReducers} (optional): number of reduce tasks, defaults to 1</li>
   * <li>{@code -help}: print the usage message</li>
   * </ul>
   *
   * @param args
   *          raw command line arguments
   * @return 0 if the job completed successfully, 1 if only the usage message
   *         was printed (help requested, a required option is missing, or an
   *         option value is invalid)
   * @throws Exception
   *           a {@link RuntimeException} if the job reports failure, or the
   *           exception raised while submitting / waiting for the job
   */
  public int run(String[] args) throws Exception {
    Option helpOpt = new Option("h", "help", false, "Show this message");
    @SuppressWarnings("static-access")
    Option inDirs = OptionBuilder
        .withArgName("inputDirs")
        .isRequired()
        .withDescription("Comma separated list of crawldb directories (e.g., \"./crawl1/crawldb,./crawl2/crawldb\")")
        .hasArgs()
        .create("inputDirs");
    @SuppressWarnings("static-access")
    Option outDir = OptionBuilder
        .withArgName("outputDir")
        .isRequired()
        .withDescription("Output directory where results should be dumped")
        .hasArgs()
        .create("outputDir");
    @SuppressWarnings("static-access")
    Option modeOpt = OptionBuilder
        .withArgName("mode")
        .isRequired()
        .withDescription("Set statistics gathering mode (by 'host' or by 'domain')")
        .hasArgs()
        .create("mode");
    @SuppressWarnings("static-access")
    Option numReducers = OptionBuilder
        .withArgName("numReducers")
        .withDescription("Optional number of reduce jobs to use. Defaults to 1")
        .hasArgs()
        .create("numReducers");

    Options options = new Options();
    options.addOption(helpOpt);
    options.addOption(inDirs);
    options.addOption(outDir);
    options.addOption(modeOpt);
    options.addOption(numReducers);

    CommandLineParser parser = new GnuParser();
    CommandLine cli;

    try {
      cli = parser.parse(options, args);
    } catch (MissingOptionException e) {
      new HelpFormatter().printHelp("CrawlCompletionStats", options, true);
      return 1;
    }

    if (cli.hasOption("help")) {
      new HelpFormatter().printHelp("CrawlCompletionStats", options, true);
      return 1;
    }

    String inputDir = cli.getOptionValue("inputDirs");
    String outputDir = cli.getOptionValue("outputDir");

    int numOfReducers = 1;
    if (cli.hasOption("numReducers")) {
      // Read the value from the parsed option, not from a fixed position in
      // args: options may be given in any order.
      String reducersValue = cli.getOptionValue("numReducers");
      try {
        numOfReducers = Integer.parseInt(reducersValue);
      } catch (NumberFormatException e) {
        LOG.error("CrawlCompletionStats: invalid value for numReducers: {}",
            reducersValue);
        new HelpFormatter().printHelp("CrawlCompletionStats", options, true);
        return 1;
      }
    }

    // Validate the mode before doing any work so that an unknown value is
    // reported as a usage error instead of running a job with mode 0.
    String modeName = cli.getOptionValue("mode");
    int mode;
    String jobName = "Nutch CrawlCompletionStats: ";
    if ("host".equals(modeName)) {
      jobName = jobName + "Host statistics";
      mode = MODE_HOST;
    } else if ("domain".equals(modeName)) {
      jobName = jobName + "Domain statistics";
      mode = MODE_DOMAIN;
    } else {
      LOG.error("CrawlCompletionStats: unknown mode '{}', expected 'host' or 'domain'",
          modeName);
      new HelpFormatter().printHelp("CrawlCompletionStats", options, true);
      return 1;
    }

    StopWatch stopWatch = new StopWatch();
    stopWatch.start();
    LOG.info("CrawlCompletionStats: starting");

    Configuration conf = getConf();
    conf.setInt("domain.statistics.mode", mode);
    conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    Job job = Job.getInstance(conf, jobName);
    job.setJarByClass(CrawlCompletionStats.class);

    // Each crawldb keeps its data under the "current" sub-directory.
    for (String inputDirSpec : inputDir.split(",")) {
      FileInputFormat.addInputPath(job, new Path(inputDirSpec, "current"));
    }

    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path(outputDir));
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    job.setMapperClass(CrawlCompletionStatsMapper.class);
    job.setReducerClass(CrawlCompletionStatsReducer.class);
    job.setCombinerClass(CrawlCompletionStatsCombiner.class);
    job.setNumReduceTasks(numOfReducers);

    try {
      boolean success = job.waitForCompletion(true);
      if (!success) {
        String message = NutchJob.getJobFailureLogMessage(jobName, job);
        LOG.error(message);
        // throw exception so that calling routine can exit with error
        throw new RuntimeException(message);
      }
    } catch (IOException | InterruptedException | ClassNotFoundException e) {
      LOG.error(jobName + " job failed");
      throw e;
    }

    stopWatch.stop();
    LOG.info("CrawlCompletionStats: finished, elapsed: {} ms", stopWatch.getTime(
        TimeUnit.MILLISECONDS));
    return 0;
  }