in src/java/org/apache/nutch/tools/FreeGenerator.java [147:233]
public int run(String[] args) throws Exception {
  // Command-line entry point: parses the arguments, configures a MapReduce job
  // that reads URL lists from <inputDir> and writes a newly named segment's
  // generate directory below <segmentsDir>, then waits for the job to finish.
  //
  // args[0]            input directory with text files, one URL per line
  // args[1]            directory below which the new segment is created
  // -filter            sets FILTER_KEY to true in the job configuration
  // -normalize         sets NORMALIZE_KEY to true in the job configuration
  // -numFetchers <n>   number of reduce tasks, i.e. generated fetch lists
  //
  // Returns 0 on success and -1 on a usage error or when the job could not be
  // run (I/O error, interruption, class loading problem). A job that ran but
  // reported failure raises a RuntimeException carrying the failure message.
  if (args.length < 2) {
    System.err.println(
        "Usage: FreeGenerator <inputDir> <segmentsDir> [-filter] [-normalize] [-numFetchers <n>]");
    System.err
        .println("\tinputDir\tinput directory containing one or more input files.");
    System.err
        .println("\t \tEach text file contains a list of URLs, one URL per line");
    System.err
        .println("\tsegmentsDir\toutput directory, where new segment will be created");
    System.err.println("\t-filter \trun current URLFilters on input URLs");
    System.err
        .println("\t-normalize\trun current URLNormalizers on input URLs");
    System.err.println(
        "\t-numFetchers <n>\tnumber of generated fetch lists, determines number of fetcher tasks");
    return -1;
  }

  boolean filter = false;
  boolean normalize = false;
  // -1 means "not given on the command line"; resolved from the configuration below.
  int numFetchers = -1;
  for (int i = 2; i < args.length; i++) {
    String arg = args[i];
    if ("-filter".equals(arg)) {
      filter = true;
    } else if ("-normalize".equals(arg)) {
      normalize = true;
    } else if ("-numFetchers".equals(arg)) {
      // The option needs a value; without this check a trailing
      // "-numFetchers" would read past the end of the argument array.
      if (i + 1 >= args.length) {
        LOG.error("Missing value for -numFetchers, exiting ...");
        return -1;
      }
      try {
        numFetchers = Integer.parseInt(args[++i]);
      } catch (NumberFormatException e) {
        LOG.error("Invalid value for -numFetchers: {}, exiting ...", args[i]);
        return -1;
      }
    } else {
      LOG.error("Unknown argument: {}, exiting ...", arg);
      return -1;
    }
  }

  StopWatch stopWatch = new StopWatch();
  stopWatch.start();
  LOG.info("FreeGenerator: starting");

  Job job = Job.getInstance(getConf(), "Nutch FreeGenerator: " + args[0]);
  Configuration conf = job.getConfiguration();
  conf.setBoolean(FILTER_KEY, filter);
  conf.setBoolean(NORMALIZE_KEY, normalize);

  FileInputFormat.addInputPath(job, new Path(args[0]));
  job.setInputFormatClass(TextInputFormat.class);
  job.setJarByClass(FG.class);
  job.setMapperClass(FG.FGMapper.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(Generator.SelectorEntry.class);
  job.setPartitionerClass(URLPartitioner.class);
  job.setReducerClass(FG.FGReducer.class);

  String segName = Generator.generateSegmentName();

  if (numFetchers == -1) {
    /* for politeness create exactly one partition per fetch task */
    // getInt() falls back to a single partition when the property is unset,
    // where parsing the raw (null) value would throw.
    numFetchers = conf.getInt("mapreduce.job.maps", 1);
  }
  if ("local".equals(conf.get("mapreduce.framework.name"))
      && numFetchers != 1) {
    // override
    LOG.info(
        "FreeGenerator: running in local mode, generating exactly one partition.");
    numFetchers = 1;
  }
  job.setNumReduceTasks(numFetchers);

  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(CrawlDatum.class);
  job.setSortComparatorClass(Generator.HashComparator.class);
  FileOutputFormat.setOutputPath(job, new Path(args[1], new Path(segName,
      CrawlDatum.GENERATE_DIR_NAME)));

  try {
    boolean success = job.waitForCompletion(true);
    if (!success) {
      String message = NutchJob.getJobFailureLogMessage("FreeGenerator", job);
      LOG.error(message);
      // Not caught below: a failed job surfaces to the caller as an exception.
      throw new RuntimeException(message);
    }
  } catch (InterruptedException e) {
    // Restore the interrupt flag so code further up the stack can see it.
    Thread.currentThread().interrupt();
    LOG.error("FAILED: {}", StringUtils.stringifyException(e));
    return -1;
  } catch (IOException | ClassNotFoundException e) {
    LOG.error("FAILED: {}", StringUtils.stringifyException(e));
    return -1;
  }

  stopWatch.stop();
  LOG.info("FreeGenerator: finished, elapsed: {} ms", stopWatch.getTime(
      TimeUnit.MILLISECONDS));
  return 0;
}