in src/java/org/apache/nutch/crawl/CrawlDb.java [288:377]
/**
 * Runs a CrawlDb update driven by a generic argument map.
 *
 * <p>Recognized entries of {@code args}:
 * <ul>
 * <li>{@code "normalize"}, {@code "filter"}, {@code "force"}: the mere presence of the key
 * enables the option, the mapped value is not inspected;</li>
 * <li>{@code "noAdditions"}: presence of the key disables additions;</li>
 * <li>{@link Nutch#ARG_CRAWLDB}: the CrawlDb location, either a {@link Path} or any object
 * whose {@code toString()} is a path; when absent, {@code crawlId + "/crawldb"} is used;</li>
 * <li>{@link Nutch#ARG_SEGMENTDIR}: a directory whose sub-directories are all used as
 * segments;</li>
 * <li>{@link Nutch#ARG_SEGMENTS}: consulted only when {@link Nutch#ARG_SEGMENTDIR} is absent;
 * either a collection of segment paths or a single segment path.</li>
 * </ul>
 * When neither segment argument is given, the most recently modified entry of
 * {@code crawlId + "/segments"} is used.
 *
 * @param args
 *          tool arguments as described above
 * @param crawlId
 *          crawl identifier, used as the base directory for the default CrawlDb and segments
 *          locations
 * @return a map holding {@link Nutch#VAL_RESULT}: {@code "0"} when the update succeeded,
 *         {@code "-1"} when the update threw an exception (the exception is logged)
 * @throws IllegalArgumentException
 *           if no segment argument is given and the default segments directory is missing,
 *           unreadable or empty
 * @throws Exception
 *           if resolving the segments fails before the update is started
 */
public Map<String, Object> run(Map<String, Object> args, String crawlId) throws Exception {
  Map<String, Object> results = new HashMap<>();

  // Configuration supplies the defaults; a flag key present in args overrides them.
  boolean normalize = args.containsKey("normalize")
      || getConf().getBoolean(CrawlDbFilter.URL_NORMALIZING, false);
  boolean filter = args.containsKey("filter")
      || getConf().getBoolean(CrawlDbFilter.URL_FILTERING, false);
  boolean additionsAllowed = !args.containsKey("noAdditions")
      && getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED, true);
  boolean force = args.containsKey("force");

  Path crawlDb;
  if (args.containsKey(Nutch.ARG_CRAWLDB)) {
    Object crawldbPath = args.get(Nutch.ARG_CRAWLDB);
    crawlDb = (crawldbPath instanceof Path) ? (Path) crawldbPath
        : new Path(crawldbPath.toString());
  } else {
    crawlDb = new Path(crawlId + "/crawldb");
  }

  HashSet<Path> dirs = new HashSet<>();
  if (args.containsKey(Nutch.ARG_SEGMENTDIR)) {
    Object segDir = args.get(Nutch.ARG_SEGMENTDIR);
    Path segmentsDir = (segDir instanceof Path) ? (Path) segDir
        : new Path(segDir.toString());
    FileSystem fs = segmentsDir.getFileSystem(getConf());
    FileStatus[] paths = fs.listStatus(segmentsDir,
        HadoopFSUtil.getPassDirectoriesFilter(fs));
    dirs.addAll(Arrays.asList(HadoopFSUtil.getPaths(paths)));
  } else if (args.containsKey(Nutch.ARG_SEGMENTS)) {
    Object segments = args.get(Nutch.ARG_SEGMENTS);
    if (segments instanceof Iterable) {
      // Any collection of segment paths is accepted, not only ArrayList<String>;
      // iterating over Iterable<?> also avoids an unchecked cast.
      for (Object segment : (Iterable<?>) segments) {
        dirs.add((segment instanceof Path) ? (Path) segment
            : new Path(segment.toString()));
      }
    } else if (segments instanceof Path) {
      dirs.add((Path) segments);
    } else if (segments != null) {
      // A single segment given as a String (or similar) used to be silently dropped.
      dirs.add(new Path(segments.toString()));
    }
  } else {
    // NOTE(review): this default lookup uses java.io.File rather than the Hadoop
    // FileSystem used for ARG_SEGMENTDIR - confirm that is intended.
    File dir = new File(crawlId + "/segments");
    File[] candidates = dir.listFiles();
    // listFiles() returns null when the path is not a directory or cannot be read.
    if (candidates == null || candidates.length == 0) {
      throw new IllegalArgumentException(
          "No segments found in default segments directory: " + dir.getPath());
    }
    // Pick the most recently modified entry with a linear scan; the former
    // sort used a comparator that never returned a positive value.
    File newest = candidates[0];
    for (File candidate : candidates) {
      if (candidate.lastModified() > newest.lastModified()) {
        newest = candidate;
      }
    }
    dirs.add(new Path(newest.getPath()));
  }

  try {
    update(crawlDb, dirs.toArray(new Path[dirs.size()]), normalize,
        filter, additionsAllowed, force);
    results.put(Nutch.VAL_RESULT, Integer.toString(0));
    return results;
  } catch (Exception e) {
    LOG.error("CrawlDb update: " + StringUtils.stringifyException(e));
    results.put(Nutch.VAL_RESULT, Integer.toString(-1));
    return results;
  }
}