in src/java/org/apache/nutch/crawl/CrawlDbReader.java [1159:1292]
/**
 * Runs one query against the CrawlDb located at {@code crawlId + "/crawldb"}.
 *
 * <p>
 * The {@code type} argument (compared case-insensitively) selects the query:
 * <ul>
 * <li>{@code stats} - returns a map of statistics. Optional argument
 * {@code sort} ("true" to enable).</li>
 * <li>{@code dump} - runs the dump job into {@code out_dir} and returns a
 * {@link File} pointing at {@code out_dir/part-00000}. Optional arguments:
 * {@code format}, {@code regex}, {@code retry}, {@code status}, {@code expr},
 * {@code sample}.</li>
 * <li>{@code topN} - runs the topN job into {@code out_dir} and returns a
 * {@link File} pointing at {@code out_dir/part-00000}. Requires {@code nnn};
 * optional {@code min}.</li>
 * <li>{@code url} - looks up the entry for argument {@code url} and returns
 * its fields as a map.</li>
 * </ul>
 *
 * @param args
 *          query arguments, keyed as described above
 * @param conf
 *          configuration handed to the dump, topN and url queries
 * @param type
 *          query type; a null or unrecognised type yields an empty map
 * @param crawlId
 *          crawl identifier used as the parent path of the CrawlDb
 * @return a {@code Map<String, Object>} for {@code stats}, {@code url} and
 *         unrecognised types, or a {@link File} for {@code dump} and
 *         {@code topN}
 * @throws Exception
 *           if the underlying job or lookup fails, or a numeric argument
 *           cannot be parsed
 */
public Object query(Map<String, String> args, Configuration conf, String type,
    String crawlId) throws Exception {
  String crawlDb = crawlId + "/crawldb";
  // Literal-first comparisons so that a null type falls through to the
  // empty result instead of raising a NullPointerException.
  if ("stats".equalsIgnoreCase(type)) {
    return queryStats(args, crawlDb);
  }
  if ("dump".equalsIgnoreCase(type)) {
    return queryDump(args, conf, crawlDb);
  }
  if ("topN".equalsIgnoreCase(type)) {
    return queryTopN(args, conf, crawlDb);
  }
  if ("url".equalsIgnoreCase(type)) {
    return queryUrl(args, conf, crawlDb);
  }
  return new HashMap<String, Object>();
}

/** Collects the CrawlDb statistics into a map of printable values. */
private Map<String, Object> queryStats(Map<String, String> args,
    String crawlDb) throws Exception {
  Map<String, Object> results = new HashMap<>();
  boolean sort = Boolean.parseBoolean(args.get("sort"));
  // NOTE(review): this query builds a fresh NutchConfiguration instead of
  // using the configuration passed to query(), unlike the other query
  // types - confirm whether that is intentional.
  TreeMap<String, Writable> stats = processStatJobHelper(crawlDb,
      NutchConfiguration.create(), sort);

  // "T" holds the total count; it is reported separately as "totalUrls"
  // and must not show up again among the generic entries below.
  Writable total = stats.remove("T");
  long totalCnt = (total == null) ? 0L : ((LongWritable) total).get();
  results.put("totalUrls", String.valueOf(totalCnt));

  Map<String, Map<String, Object>> statusMap = new HashMap<>();
  for (Map.Entry<String, Writable> entry : stats.entrySet()) {
    String k = entry.getKey();
    Writable value = entry.getValue();
    if (value instanceof BytesWritable) {
      // Binary values have no printable representation here.
      continue;
    }
    long val = 0L;
    double fval = 0.0;
    if (value instanceof LongWritable) {
      val = ((LongWritable) value).get();
    } else if (value instanceof FloatWritable) {
      fval = ((FloatWritable) value).get();
    }

    if (k.equals("scn")) {
      results.put("minScore", String.valueOf(fval));
    } else if (k.equals("scx")) {
      results.put("maxScore", String.valueOf(fval));
    } else if (k.equals("sct")) {
      results.put("avgScore", String.valueOf(fval / totalCnt));
    } else if (k.startsWith("status")) {
      // Key layout: "status <code>" or "status <code> <host>".
      String[] st = k.split(" ");
      int code = Integer.parseInt(st[1]);
      String codeKey = String.valueOf(code);
      // Create the per-status map on demand so that a per-host entry never
      // depends on the plain status entry having been seen first.
      Map<String, Object> statusInfo = statusMap.get(codeKey);
      if (statusInfo == null) {
        statusInfo = new HashMap<>();
        statusMap.put(codeKey, statusInfo);
      }
      if (st.length > 2) {
        // Only this method stores "hostValues", always as Map<String, String>.
        @SuppressWarnings("unchecked")
        Map<String, String> hostValues = (Map<String, String>) statusInfo
            .get("hostValues");
        if (hostValues == null) {
          hostValues = new HashMap<>();
          statusInfo.put("hostValues", hostValues);
        }
        hostValues.put(st[2], String.valueOf(val));
      } else {
        statusInfo.put("statusValue", CrawlDatum.getStatusName((byte) code));
        statusInfo.put("count", String.valueOf(val));
      }
    } else {
      results.put(k, String.valueOf(val));
    }
  }
  results.put("status", statusMap);
  return results;
}

/** Runs the dump job and returns a handle on its first output part file. */
private File queryDump(Map<String, String> args, Configuration conf,
    String crawlDb) throws Exception {
  String output = args.get("out_dir");
  String format = args.get("format");
  if (format == null) {
    format = "normal";
  }
  String regex = args.get("regex");
  String status = args.get("status");
  String expr = args.get("expr");
  // A missing (or null) numeric argument is passed on as null.
  String retryArg = args.get("retry");
  Integer retry = (retryArg == null) ? null : Integer.valueOf(retryArg);
  String sampleArg = args.get("sample");
  Float sample = (sampleArg == null) ? null : Float.valueOf(sampleArg);

  processDumpJob(crawlDb, output, conf, format, regex, status, retry, expr,
      sample);
  return new File(output + "/part-00000");
}

/** Runs the topN job and returns a handle on its first output part file. */
private File queryTopN(Map<String, String> args, Configuration conf,
    String crawlDb) throws Exception {
  String output = args.get("out_dir");
  long topN = Long.parseLong(args.get("nnn"));
  String minArg = args.get("min");
  float min = (minArg == null) ? 0.0f : Float.parseFloat(minArg);

  processTopNJob(crawlDb, topN, min, output, conf);
  return new File(output + "/part-00000");
}

/** Looks up a single URL and exposes the fields of its entry as a map. */
private Map<String, Object> queryUrl(Map<String, String> args,
    Configuration conf, String crawlDb) throws Exception {
  Map<String, Object> results = new HashMap<>();
  String url = args.get("url");
  CrawlDatum res = get(crawlDb, url, conf);
  if (res == null) {
    // No entry came back from the lookup: report an empty result rather
    // than failing with a NullPointerException.
    return results;
  }
  results.put("status", res.getStatus());
  results.put("fetchTime", new Date(res.getFetchTime()));
  results.put("modifiedTime", new Date(res.getModifiedTime()));
  results.put("retriesSinceFetch", res.getRetriesSinceFetch());
  results.put("retryInterval", res.getFetchInterval());
  results.put("score", res.getScore());
  results.put("signature", StringUtil.toHexString(res.getSignature()));

  Map<String, String> metadata = new HashMap<>();
  if (res.getMetaData() != null) {
    for (Map.Entry<Writable, Writable> e : res.getMetaData().entrySet()) {
      metadata.put(String.valueOf(e.getKey()), String.valueOf(e.getValue()));
    }
  }
  results.put("metadata", metadata);
  return results;
}