in src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java [575:718]
public int run(String[] args) throws Exception {
  /*
   * Command-line entry point.
   *
   * Builds the supported options, parses args, copies the parsed values
   * into a CommonCrawlConfig, creates the output directory when it does not
   * exist yet, and then delegates to dump(...).
   *
   * Returns 0 when the dump call completes or when only the usage text is
   * printed (-help given, or -outputDir / -segment missing); returns -1
   * when any exception is raised while parsing or dumping.
   */
  Option helpOpt = new Option("h", "help", false, "show this help message.");

  // argument options
  @SuppressWarnings("static-access")
  Option outputOpt = OptionBuilder.withArgName("outputDir").hasArg()
      .withDescription(
          "output directory (which will be created) to host the CBOR data.")
      .create("outputDir");

  // WARC format
  Option warcOpt = new Option("warc", "export to a WARC file");

  @SuppressWarnings("static-access")
  Option segOpt = OptionBuilder.withArgName("segment").hasArgs()
      .withDescription("the segment or directory containing segments to use")
      .create("segment");

  // create mimetype and gzip options
  @SuppressWarnings("static-access")
  Option mimeOpt = OptionBuilder.isRequired(false).withArgName("mimetype")
      .hasArgs().withDescription(
          "an optional list of mimetypes to dump, excluding all others. Defaults to all.")
      .create("mimetype");

  @SuppressWarnings("static-access")
  Option gzipOpt = OptionBuilder.withArgName("gzip").hasArg(false)
      .withDescription(
          "an optional flag indicating whether to additionally gzip the data.")
      .create("gzip");

  @SuppressWarnings("static-access")
  Option keyPrefixOpt = OptionBuilder.withArgName("keyPrefix").hasArg(true)
      .withDescription("an optional prefix for key in the output format.")
      .create("keyPrefix");

  @SuppressWarnings("static-access")
  Option simpleDateFormatOpt = OptionBuilder.withArgName("SimpleDateFormat")
      .hasArg(false).withDescription(
          "an optional format for timestamp in GMT epoch milliseconds.")
      .create("SimpleDateFormat");

  @SuppressWarnings("static-access")
  Option epochFilenameOpt = OptionBuilder.withArgName("epochFilename")
      .hasArg(false)
      .withDescription("an optional format for output filename.")
      .create("epochFilename");

  @SuppressWarnings("static-access")
  Option jsonArrayOpt = OptionBuilder.withArgName("jsonArray").hasArg(false)
      .withDescription("an optional format for JSON output.")
      .create("jsonArray");

  @SuppressWarnings("static-access")
  Option reverseKeyOpt = OptionBuilder.withArgName("reverseKey").hasArg(false)
      .withDescription("an optional format for key value in JSON output.")
      .create("reverseKey");

  @SuppressWarnings("static-access")
  Option extensionOpt = OptionBuilder.withArgName("extension").hasArg(true)
      .withDescription("an optional file extension for output documents.")
      .create("extension");

  // Declared as Number so the parsed value can be read back numerically.
  @SuppressWarnings("static-access")
  Option sizeOpt = OptionBuilder.withArgName("warcSize").hasArg(true)
      .withType(Number.class)
      .withDescription("an optional file size in bytes for the WARC file(s)")
      .create("warcSize");

  @SuppressWarnings("static-access")
  Option linkDbOpt = OptionBuilder.withArgName("linkdb").hasArg(true)
      .withDescription("an optional linkdb parameter to include inlinks in dump files")
      .isRequired(false)
      .create("linkdb");

  // create the options
  Options options = new Options();
  options.addOption(helpOpt);
  options.addOption(outputOpt);
  options.addOption(segOpt);
  // WARC, mimetype and gzip options
  options.addOption(warcOpt);
  options.addOption(mimeOpt);
  options.addOption(gzipOpt);
  // keyPrefix option
  options.addOption(keyPrefixOpt);
  // output formatting options
  options.addOption(simpleDateFormatOpt);
  options.addOption(epochFilenameOpt);
  options.addOption(jsonArrayOpt);
  options.addOption(reverseKeyOpt);
  options.addOption(extensionOpt);
  options.addOption(sizeOpt);
  options.addOption(linkDbOpt);

  CommandLineParser parser = new GnuParser();
  try {
    CommandLine line = parser.parse(options, args);

    // Print usage when help is requested or a mandatory option is absent.
    // NOTE(review): the exit code is 0 even when -outputDir / -segment are
    // missing; kept as-is because callers may depend on it.
    if (line.hasOption("help") || !line.hasOption("outputDir")
        || !line.hasOption("segment")) {
      HelpFormatter formatter = new HelpFormatter();
      formatter
          .printHelp(CommonCrawlDataDumper.class.getName(), options, true);
      return 0;
    }

    File outputDir = new File(line.getOptionValue("outputDir"));
    // Only the first -segment value is used here.
    File segmentRootDir = new File(line.getOptionValue("segment"));
    String[] mimeTypes = line.getOptionValues("mimetype");
    boolean gzip = line.hasOption("gzip");
    boolean epochFilename = line.hasOption("epochFilename");
    String keyPrefix = line.getOptionValue("keyPrefix", "");
    boolean simpleDateFormat = line.hasOption("SimpleDateFormat");
    boolean jsonArray = line.hasOption("jsonArray");
    boolean reverseKey = line.hasOption("reverseKey");
    String extension = line.getOptionValue("extension", "");
    boolean warc = line.hasOption("warc");

    // The option type is Number, so the parsed object is not guaranteed to
    // be a Long (a value containing '.' is handed back as a Double).
    // Convert through Number instead of casting to Long to avoid a
    // ClassCastException; a non-numeric value still fails in
    // getParsedOptionValue and is reported by the catch block below.
    long warcSize = 0;
    Object parsedWarcSize = line.getParsedOptionValue("warcSize");
    if (parsedWarcSize != null) {
      warcSize = ((Number) parsedWarcSize).longValue();
    }

    String linkdbPath = line.getOptionValue("linkdb");
    File linkdb = linkdbPath == null ? null : new File(linkdbPath);

    CommonCrawlConfig config = new CommonCrawlConfig();
    config.setKeyPrefix(keyPrefix);
    config.setSimpleDateFormat(simpleDateFormat);
    config.setJsonArray(jsonArray);
    config.setReverseKey(reverseKey);
    config.setCompressed(gzip);
    config.setWarcSize(warcSize);
    config.setOutputDir(line.getOptionValue("outputDir"));

    if (!outputDir.exists()) {
      LOG.warn("Output directory: [" + outputDir.getAbsolutePath()
          + "]: does not exist, creating it.");
      if (!outputDir.mkdirs()) {
        throw new Exception(
            "Unable to create: [" + outputDir.getAbsolutePath() + "]");
      }
    }

    CommonCrawlDataDumper dumper = new CommonCrawlDataDumper(config);
    dumper.dump(outputDir, segmentRootDir, linkdb, gzip, mimeTypes, epochFilename,
        extension, warc);
  } catch (Exception e) {
    // Any failure (bad arguments, directory creation, dump errors) is
    // logged, echoed to stderr, and reported as exit code -1.
    LOG.error(CommonCrawlDataDumper.class.getName() + ": " + StringUtils
        .stringifyException(e));
    e.printStackTrace();
    return -1;
  }
  return 0;
}