in src/java/org/apache/nutch/crawl/CrawlDbReader.java [1033:1151]
/**
 * Parses the command line and runs the requested CrawlDb read operations.
 *
 * <p>
 * The first argument is the CrawlDb location; the remaining arguments are
 * processed left to right, so several operations may be given in one call.
 * {@code -stats}, {@code -dump}, {@code -url} and {@code -topN} are handled
 * here. Any other argument is offered to {@code super.parseArgs}; if the
 * parent class consumed at least one argument, {@code super.run()} is invoked
 * after all arguments have been processed.
 *
 * <p>
 * An option whose required value is missing is reported on
 * {@code System.err} and causes a return value of {@code -1} instead of an
 * {@link ArrayIndexOutOfBoundsException}.
 *
 * @param args
 *          command-line arguments: {@code <crawldb>} followed by at least one
 *          option
 * @return {@code 0} on success, {@code -1} if the arguments are incomplete or
 *         not recognized, or the result of {@code super.run()} when the
 *         parent class consumed arguments
 * @throws NumberFormatException
 *           if a numeric option value ({@code -retry}, {@code -sample},
 *           {@code -topN} count or minimum score) cannot be parsed
 * @throws Exception
 *           propagated unchanged from the invoked jobs and from
 *           {@code super.run()}
 */
public int run(String[] args) throws IOException, InterruptedException,
    ClassNotFoundException, Exception {
  // The jobs are invoked on a separate reader instance; only the crawlDb
  // field is stored on this instance (see below).
  @SuppressWarnings("resource")
  CrawlDbReader dbr = new CrawlDbReader();
  if (args.length < 2) {
    System.err.println(
        "Usage: CrawlDbReader <crawldb> (-stats | -dump <out_dir> | -topN <nnnn> <out_dir> [<min>] | -url <url> | -listen <port>)");
    System.err
        .println("\t<crawldb>\tdirectory name where crawldb is located");
    System.err
        .println("\t-stats [-sort] \tprint overall statistics to System.out");
    System.err.println("\t\t[-sort]\tlist status sorted by host");
    System.err.println(
        "\t-dump <out_dir> [-format normal|csv|crawldb|json]\tdump the whole db to a text file in <out_dir>");
    System.err.println("\t\t[-format csv]\tdump in Csv format");
    System.err.println(
        "\t\t[-format normal]\tdump in standard format (default option)");
    System.err.println("\t\t[-format crawldb]\tdump as CrawlDB");
    System.err.println("\t\t[-format json]\tdump in JSON Lines format");
    System.err.println("\t\t[-regex <expr>]\tfilter records with expression");
    System.err.println("\t\t[-retry <num>]\tminimum retry count");
    System.err.println(
        "\t\t[-status <status>]\tfilter records by CrawlDatum status");
    System.err.println(
        "\t\t[-expr <expr>]\tJexl expression to execute for this record");
    System.err.println(
        "\t\t[-sample <fraction>]\tOnly process a random sample with this ratio");
    System.err
        .println("\t-url <url>\tprint information on <url> to System.out");
    System.err
        .println("\t-listen <port> [-keepClientCnxOpen]\tlisten on <port> for URLs and");
    System.err
        .println("\t\t\tsend information about <url> back");
    System.err.println(
        "\t-topN <nnnn> <out_dir> [<min>]\tdump top <nnnn> urls sorted by score to <out_dir>");
    System.err
        .println("\t\t[<min>]\tskip records with scores below this value.");
    System.err.println("\t\t\tThis can significantly improve performance.");
    return -1;
  }
  String param = null;
  String crawlDb = args[0];
  this.crawlDb = crawlDb;
  int numConsumed = 0;
  Configuration config = getConf();
  for (int i = 1; i < args.length; i++) {
    if (args[i].equals("-stats")) {
      boolean toSort = false;
      if (i < args.length - 1 && "-sort".equals(args[i + 1])) {
        toSort = true;
        i++;
      }
      dbr.processStatJob(crawlDb, config, toSort);
    } else if (args[i].equals("-dump")) {
      if (i + 1 >= args.length) {
        System.err.println("\nError: missing <out_dir> for -dump");
        return -1;
      }
      param = args[++i];
      String format = "normal";
      String regex = null;
      Integer retry = null;
      String status = null;
      String expr = null;
      Float sample = null;
      // Consume the dump options that directly follow <out_dir>, one
      // name/value pair at a time. Advancing i together with the pair keeps
      // the outer loop in sync, and stopping at the first argument that is
      // not a dump option leaves it for the outer loop to interpret.
      while (i + 1 < args.length) {
        String option = args[i + 1];
        boolean dumpOption = "-format".equals(option)
            || "-regex".equals(option) || "-retry".equals(option)
            || "-status".equals(option) || "-expr".equals(option)
            || "-sample".equals(option);
        if (!dumpOption) {
          break;
        }
        if (i + 2 >= args.length) {
          System.err.println("\nError: missing value for option " + option);
          return -1;
        }
        // The value is taken verbatim and never re-tested as an option name,
        // so a value that happens to look like an option is safe.
        String value = args[i + 2];
        if ("-format".equals(option)) {
          format = value;
        } else if ("-regex".equals(option)) {
          regex = value;
        } else if ("-retry".equals(option)) {
          retry = Integer.parseInt(value);
        } else if ("-status".equals(option)) {
          status = value;
        } else if ("-expr".equals(option)) {
          expr = value;
        } else {
          sample = Float.parseFloat(value);
        }
        i += 2;
      }
      dbr.processDumpJob(crawlDb, param, config, format, regex, status, retry,
          expr, sample);
    } else if (args[i].equals("-url")) {
      if (i + 1 >= args.length) {
        System.err.println("\nError: missing <url> for -url");
        return -1;
      }
      param = args[++i];
      StringBuilder output = new StringBuilder();
      dbr.readUrl(crawlDb, param, config, output);
      System.out.print(output);
    } else if (args[i].equals("-topN")) {
      if (i + 2 >= args.length) {
        System.err.println("\nError: -topN requires <nnnn> and <out_dir>");
        return -1;
      }
      param = args[++i];
      long topN = Long.parseLong(param);
      param = args[++i];
      float min = 0.0f;
      // Any argument following <out_dir> is taken as the optional minimum
      // score, so -topN is expected to be the last operation on the line.
      if (i < args.length - 1) {
        min = Float.parseFloat(args[++i]);
      }
      dbr.processTopNJob(crawlDb, topN, min, param, config);
    } else if ((numConsumed = super.parseArgs(args, i)) > 0) {
      // The parent class reports how many arguments it consumed; skip them
      // (the loop increment accounts for one).
      i += numConsumed - 1;
    } else {
      System.err.println("\nError: wrong argument " + args[i]);
      return -1;
    }
  }
  if (numConsumed > 0) {
    // Start listening
    return super.run();
  }
  return 0;
}