public int run()

in src/java/org/apache/nutch/tools/FreeGenerator.java [147:233]


  /**
   * Parses the command-line arguments, then configures and runs the
   * "Nutch FreeGenerator" job which reads the input directory and writes its
   * output below a newly named segment inside the segments directory.
   *
   * @param args
   *          <code>&lt;inputDir&gt; &lt;segmentsDir&gt; [-filter] [-normalize]
   *          [-numFetchers &lt;n&gt;]</code>
   * @return 0 if the job completed successfully; -1 if the arguments are
   *         invalid or if submitting/awaiting the job raised an
   *         {@link IOException}, {@link InterruptedException} or
   *         {@link ClassNotFoundException}
   * @throws Exception
   *           a {@link RuntimeException} is thrown if the job ran but reported
   *           failure
   */
  public int run(String[] args) throws Exception {
    if (args.length < 2) {
      System.err.println(
          "Usage: FreeGenerator <inputDir> <segmentsDir> [-filter] [-normalize] [-numFetchers <n>]");
      System.err
          .println("\tinputDir\tinput directory containing one or more input files.");
      System.err
          .println("\t        \tEach text file contains a list of URLs, one URL per line");
      System.err
          .println("\tsegmentsDir\toutput directory, where new segment will be created");
      System.err.println("\t-filter   \trun current URLFilters on input URLs");
      System.err
          .println("\t-normalize\trun current URLNormalizers on input URLs");
      System.err.println(
          "\t-numFetchers <n>\tnumber of generated fetch lists, determines number of fetcher tasks");
      return -1;
    }
    boolean filter = false;
    boolean normalize = false;
    // -1 means "not given on the command line"
    int numFetchers = -1;
    for (int i = 2; i < args.length; i++) {
      if ("-filter".equals(args[i])) {
        filter = true;
      } else if ("-normalize".equals(args[i])) {
        normalize = true;
      } else if ("-numFetchers".equals(args[i])) {
        // the option requires a value: guard against it being the last argument
        if (i + 1 >= args.length) {
          LOG.error("Missing value for argument: " + args[i] + ", exiting ...");
          return -1;
        }
        try {
          numFetchers = Integer.parseInt(args[i + 1]);
        } catch (NumberFormatException e) {
          LOG.error("Invalid value for argument " + args[i] + ": "
              + args[i + 1] + ", exiting ...");
          return -1;
        }
        // skip the value that was just consumed
        i++;
      } else {
        LOG.error("Unknown argument: " + args[i] + ", exiting ...");
        return -1;
      }
    }

    StopWatch stopWatch = new StopWatch();
    stopWatch.start();
    LOG.info("FreeGenerator: starting");

    Job job = Job.getInstance(getConf(), "Nutch FreeGenerator: " + args[0]);
    Configuration conf = job.getConfiguration();
    // pass the command-line switches on to the job via its configuration
    conf.setBoolean(FILTER_KEY, filter);
    conf.setBoolean(NORMALIZE_KEY, normalize);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    job.setInputFormatClass(TextInputFormat.class);
    job.setJarByClass(FG.class);
    job.setMapperClass(FG.FGMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Generator.SelectorEntry.class);
    job.setPartitionerClass(URLPartitioner.class);
    job.setReducerClass(FG.FGReducer.class);
    String segName = Generator.generateSegmentName();
    if (numFetchers == -1) {
      /* for politeness create exactly one partition per fetch task */
      // fall back to a single partition if the property is not set at all
      numFetchers = conf.getInt("mapreduce.job.maps", 1);
    }
    if ("local".equals(conf.get("mapreduce.framework.name"))
        && numFetchers != 1) {
      // override
      LOG.info(
          "FreeGenerator: running in local mode, generating exactly one partition.");
      numFetchers = 1;
    }
    job.setNumReduceTasks(numFetchers);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    job.setSortComparatorClass(Generator.HashComparator.class);
    // output goes to <segmentsDir>/<segName>/<CrawlDatum.GENERATE_DIR_NAME>
    FileOutputFormat.setOutputPath(job, new Path(args[1], new Path(segName,
        CrawlDatum.GENERATE_DIR_NAME)));
    try {
      boolean success = job.waitForCompletion(true);
      if (!success) {
        String message = NutchJob.getJobFailureLogMessage("FreeGenerator", job);
        LOG.error(message);
        // not caught below: propagates to the caller
        throw new RuntimeException(message);
      }
    } catch (IOException | InterruptedException | ClassNotFoundException e) {
      if (e instanceof InterruptedException) {
        // restore the interrupt flag so callers can still observe it
        Thread.currentThread().interrupt();
      }
      LOG.error("FAILED: " + StringUtils.stringifyException(e));
      return -1;
    }
    stopWatch.stop();
    LOG.info("FreeGenerator: finished, elapsed: {} ms", stopWatch.getTime(
        TimeUnit.MILLISECONDS));
    return 0;
  }