in src/java/org/apache/nutch/crawl/DeduplicationJob.java [170:235]
/**
 * Decides which of two documents with the same signature is the duplicate,
 * applying the configured comparison criteria ({@code compareOrder}) in
 * order until one of them produces a decision.
 *
 * Supported criteria: {@code score} (keep the higher score),
 * {@code fetchTime} (keep the more recently fetched one),
 * {@code httpsOverHttp} (prefer https:// when the URLs differ only in
 * protocol), {@code urlLength} (keep the shorter URL after
 * percent-decoding). Unknown criteria are silently skipped, as are
 * criteria for which the two documents tie.
 *
 * @param existingDoc the document already selected as "kept" so far
 * @param newDoc the candidate document with the same signature
 * @return the {@link CrawlDatum} to mark as duplicate, or {@code null}
 *         if no criterion could decide between the two
 */
protected CrawlDatum getDuplicate(CrawlDatum existingDoc, CrawlDatum newDoc) {
  for (String criterion : compareOrder) {
    switch (criterion) {
    case "score":
      // compare based on score: keep the higher-scored document
      // (note: if either score is NaN both comparisons are false and we
      // fall through to the next criterion)
      if (existingDoc.getScore() < newDoc.getScore()) {
        return existingDoc;
      } else if (existingDoc.getScore() > newDoc.getScore()) {
        // mark new one as duplicate
        return newDoc;
      }
      break;
    case "fetchTime":
      // same score? delete the one which is oldest
      if (existingDoc.getFetchTime() > newDoc.getFetchTime()) {
        // mark new one as duplicate
        return newDoc;
      } else if (existingDoc.getFetchTime() < newDoc.getFetchTime()) {
        // mark existing one as duplicate
        return existingDoc;
      }
      break;
    case "httpsOverHttp":
      // prefer https:// over http:// if URLs are identical except for the
      // protocol
      final String https = "https://";
      final String http = "http://";
      String url1 = existingDoc.getMetaData().get(urlKey).toString();
      String url2 = newDoc.getMetaData().get(urlKey).toString();
      if (url1.startsWith(https) && url2.startsWith(http)
          && url1.substring(https.length()).equals(url2.substring(http.length()))) {
        // existingDoc with https://, mark newDoc as duplicate
        return newDoc;
      } else if (url2.startsWith(https) && url1.startsWith(http)
          && url2.substring(https.length()).equals(url1.substring(http.length()))) {
        // newDoc with https://, mark existingDoc as duplicate
        return existingDoc;
      }
      break;
    case "urlLength":
      // keep the one which has the shortest URL,
      // normalized by decoding percent-encoded sequences
      String urlExisting = decodeOrKeep(existingDoc.getMetaData().get(urlKey).toString());
      String urlnewDoc = decodeOrKeep(newDoc.getMetaData().get(urlKey).toString());
      if (urlExisting.length() < urlnewDoc.length()) {
        // mark new one as duplicate
        return newDoc;
      } else if (urlExisting.length() > urlnewDoc.length()) {
        // mark existing one as duplicate
        return existingDoc;
      }
      break;
    }
  }
  return null; // no decision possible
}

/**
 * Percent-decodes a URL for length comparison, falling back to the
 * encoded form (and logging the failure) if decoding is not possible.
 *
 * @param url the possibly percent-encoded URL
 * @return the decoded URL, or the original string if decoding failed
 */
private String decodeOrKeep(String url) {
  try {
    return URLDecoder.decode(url, UTF_8);
  } catch (UnsupportedEncodingException | IllegalArgumentException e) {
    LOG.error("Error decoding: {}", url, e);
    // use the encoded URL
    return url;
  }
}