public void reduce()

in src/java/org/apache/nutch/scoring/webgraph/WebGraph.java [346:410]


      /**
       * Gathers the outlinks recorded for a single source url and writes out
       * those that carry the newest timestamp seen and pass the configured
       * page, domain, and host filters. If a delete marker set to true is found
       * among the values, nothing is written for this key.
       *
       * @param key
       *          the source url the values belong to
       * @param values
       *          wrapped values; LinkDatum and BooleanWritable payloads are
       *          handled, anything else is skipped
       * @param context
       *          used to update counters and to write the accepted outlinks
       * @throws IOException
       *           propagated from the calls made in this method
       * @throws InterruptedException
       *           propagated from the calls made in this method
       */
      public void reduce(Text key, Iterable<NutchWritable> values,
          Context context)
          throws IOException, InterruptedException {

        // first pass: keep a copy of every outlink and remember the newest
        // timestamp among them
        List<LinkDatum> collected = new ArrayList<>();
        long newest = 0L;
        for (NutchWritable wrapped : values) {
          final Writable payload = wrapped.get();

          if (payload instanceof LinkDatum) {
            LinkDatum link = (LinkDatum) payload;
            long linkTime = link.getTimestamp();
            // the first non-zero state always takes the incoming timestamp,
            // afterwards only a strictly newer one replaces it
            if (newest == 0L || linkTime > newest) {
              newest = linkTime;
            }
            // store a clone rather than the instance handed out by the iterator
            collected.add(WritableUtils.clone(link, conf));
            context.getCounter("WebGraph.outlinks", "added links").increment(1);
          } else if (payload instanceof BooleanWritable) {
            // a true marker means the page is gone: count it and emit nothing
            // at all for this key
            if (((BooleanWritable) payload).get()) {
              context.getCounter("WebGraph.outlinks", "removed links").increment(1);
              return;
            }
          }
        }

        // source side of every link handled below
        final String sourceUrl = key.toString();
        final String sourceDomain = URLUtil.getDomainName(sourceUrl);
        final String sourceHost = URLUtil.getHost(sourceUrl);

        // pages and domains already written for this key, consulted only when
        // the corresponding limit flag is on
        Set<String> seenPages = new HashSet<>();
        Set<String> seenDomains = new HashSet<>();

        // second pass: filter the collected outlinks and write the survivors
        for (LinkDatum outlink : collected) {

          String targetUrl = outlink.getUrl();
          String targetDomain = URLUtil.getDomainName(targetUrl);
          String targetHost = URLUtil.getHost(targetUrl);
          String targetPage = URLUtil.getPage(targetUrl);
          outlink.setLinkType(LinkDatum.OUTLINK);

          // only links carrying the newest timestamp are candidates
          if (outlink.getTimestamp() != newest) {
            continue;
          }
          // at most one link per target page when page limiting is on
          if (limitPages && seenPages.contains(targetPage)) {
            continue;
          }
          // at most one link per target domain when domain limiting is on
          if (limitDomains && seenDomains.contains(targetDomain)) {
            continue;
          }
          // drop links that stay on the source host when asked to
          if (ignoreHost && targetHost.equalsIgnoreCase(sourceHost)) {
            continue;
          }
          // drop links that stay on the source domain when asked to
          if (ignoreDomain && targetDomain.equalsIgnoreCase(sourceDomain)) {
            continue;
          }

          context.write(key, outlink);
          seenPages.add(targetPage);
          seenDomains.add(targetDomain);
        }
      }