protected CrawlDatum getDuplicate()

in src/java/org/apache/nutch/crawl/DeduplicationJob.java [170:235]


    /**
     * Decides which of two {@link CrawlDatum} entries sharing the same
     * signature should be marked as the duplicate. The configured comparison
     * criteria ({@code compareOrder}) are applied one after another; the first
     * criterion that is not a tie determines the loser.
     *
     * @param existingDoc the datum currently held as the tentative original
     * @param newDoc the datum newly encountered for the same signature
     * @return the datum to mark as duplicate, or {@code null} when every
     *         configured criterion resulted in a tie
     */
    protected CrawlDatum getDuplicate(CrawlDatum existingDoc, CrawlDatum newDoc) {
      for (String criterion : compareOrder) {
        if ("score".equals(criterion)) {
          // higher score wins; the lower-scored datum is the duplicate
          float existingScore = existingDoc.getScore();
          float newScore = newDoc.getScore();
          if (existingScore < newScore) {
            return existingDoc;
          }
          if (existingScore > newScore) {
            return newDoc;
          }
        } else if ("fetchTime".equals(criterion)) {
          // more recent fetch wins; the older datum is the duplicate
          long existingTime = existingDoc.getFetchTime();
          long newTime = newDoc.getFetchTime();
          if (existingTime > newTime) {
            return newDoc;
          }
          if (existingTime < newTime) {
            return existingDoc;
          }
        } else if ("httpsOverHttp".equals(criterion)) {
          // https:// wins over http:// when the URLs are otherwise identical
          String existingUrl = existingDoc.getMetaData().get(urlKey).toString();
          String newUrl = newDoc.getMetaData().get(urlKey).toString();
          if (existingUrl.startsWith("https://") && newUrl.startsWith("http://")
              && existingUrl.substring(8).equals(newUrl.substring(7))) {
            return newDoc;
          }
          if (newUrl.startsWith("https://") && existingUrl.startsWith("http://")
              && newUrl.substring(8).equals(existingUrl.substring(7))) {
            return existingDoc;
          }
        } else if ("urlLength".equals(criterion)) {
          // shorter URL wins, comparing lengths after percent-decoding
          String existingUrl = decodeUrlOrKeep(
              existingDoc.getMetaData().get(urlKey).toString());
          String newUrl = decodeUrlOrKeep(
              newDoc.getMetaData().get(urlKey).toString());
          if (existingUrl.length() < newUrl.length()) {
            return newDoc;
          }
          if (existingUrl.length() > newUrl.length()) {
            return existingDoc;
          }
        }
      }
      // every configured criterion produced a tie
      return null;
    }

    /**
     * Percent-decodes {@code url}, falling back to the raw (still encoded)
     * string when decoding fails.
     */
    private String decodeUrlOrKeep(String url) {
      try {
        return URLDecoder.decode(url, UTF_8);
      } catch (UnsupportedEncodingException | IllegalArgumentException e) {
        LOG.error("Error decoding: {}", url, e);
        return url;
      }
    }