public void processStatJob()

in src/java/org/apache/nutch/crawl/CrawlDbReader.java [700:800]


  /**
   * Computes CrawlDb statistics via {@code processStatJobHelper} and writes
   * them to the log at INFO level.
   *
   * <p>
   * The score quantiles that are reported default to a built-in list and can
   * be overridden with the property {@code db.stats.score.quantiles}. Entries
   * of that property which are not parseable as a floating point number or
   * which lie outside {@code [0.0, 1.0]} are skipped with a warning.
   * </p>
   *
   * <p>
   * Statistics are only printed if INFO logging is enabled; the statistics
   * themselves are computed in either case.
   * </p>
   *
   * @param crawlDb
   *          CrawlDb location, passed to {@code processStatJobHelper} and
   *          included in the log output
   * @param config
   *          configuration read for {@code db.stats.score.quantiles} and
   *          passed on to {@code processStatJobHelper}
   * @param sort
   *          passed through unchanged to {@code processStatJobHelper}
   * @throws IOException
   *           presumably raised by {@code processStatJobHelper}
   * @throws InterruptedException
   *           presumably raised by {@code processStatJobHelper}
   * @throws ClassNotFoundException
   *           presumably raised by {@code processStatJobHelper}
   */
  public void processStatJob(String crawlDb, Configuration config, boolean sort)
      throws IOException, InterruptedException, ClassNotFoundException {

    double quantiles[] = { .01, .05, .1, .2, .25, .3, .4, .5, .6, .7, .75, .8,
        .9, .95, .99 };
    // getStrings() may return null (property unset or without any values);
    // in that case the default quantiles above are kept.
    String[] configured = config.getStrings("db.stats.score.quantiles");
    if (configured != null) {
      List<Double> qs = new ArrayList<>();
      for (String s : configured) {
        try {
          double d = Double.parseDouble(s);
          if (d >= 0.0 && d <= 1.0) {
            qs.add(d);
          } else {
            LOG.warn(
                "Skipping quantile {} not in range in db.stats.score.quantiles",
                s);
          }
        } catch (NumberFormatException e) {
          LOG.warn(
              "Skipping bad floating point number {} in db.stats.score.quantiles: {}",
              s, e.getMessage());
        }
      }
      // Build and sort the array once, after all values have been parsed.
      quantiles = new double[qs.size()];
      int i = 0;
      for (double q : qs) {
        quantiles[i++] = q;
      }
      Arrays.sort(quantiles);
    }

    LOG.info("CrawlDb statistics start: {}", crawlDb);
    TreeMap<String, Writable> stats = processStatJobHelper(crawlDb, config,
        sort);

    if (LOG.isInfoEnabled()) {
      LOG.info("Statistics for CrawlDb: {}", crawlDb);
      // "T" holds the total count; it is taken out of the map so that it is
      // not printed a second time by the loop below.
      long total = 0;
      Writable totalVal = stats.remove("T");
      if (totalVal != null) {
        total = ((LongWritable) totalVal).get();
      }
      LOG.info("TOTAL urls:\t{}", total);
      for (Map.Entry<String, Writable> entry : stats.entrySet()) {
        String k = entry.getKey();
        long value = 0;
        double fvalue = 0.0;
        Writable val = entry.getValue();
        if (val instanceof LongWritable) {
          value = ((LongWritable) val).get();
        } else if (val instanceof FloatWritable) {
          fvalue = ((FloatWritable) val).get();
        }
        if (k.equals("scn")) {
          LOG.info("min score:\t{}", fvalue);
        } else if (k.equals("scx")) {
          LOG.info("max score:\t{}", fvalue);
        } else if (k.equals("sct")) {
          // floating point division: yields NaN/Infinity for total == 0
          LOG.info("avg score:\t{}", fvalue / total);
        } else if (k.equals("scNaN")) {
          LOG.info("score == NaN:\t{}", value);
        } else if (k.equals("ftn")) {
          // value is multiplied by 60 * 1000 to get the milliseconds
          // expected by Date, i.e. it is treated as minutes
          LOG.info("earliest fetch time:\t{}", new Date(1000L * 60 * value));
        } else if (k.equals("ftx")) {
          LOG.info("latest fetch time:\t{}", new Date(1000L * 60 * value));
        } else if (k.equals("ftt")) {
          // integer division: no average can be computed without items
          if (total > 0) {
            LOG.info("avg of fetch times:\t{}",
                new Date(1000L * 60 * (value / total)));
          }
        } else if (k.equals("fin")) {
          LOG.info("shortest fetch interval:\t{}",
              TimingUtil.secondsToDaysHMS(value));
        } else if (k.equals("fix")) {
          LOG.info("longest fetch interval:\t{}",
              TimingUtil.secondsToDaysHMS(value));
        } else if (k.equals("fit")) {
          // integer division: no average can be computed without items
          if (total > 0) {
            LOG.info("avg fetch interval:\t{}",
                TimingUtil.secondsToDaysHMS(value / total));
          }
        } else if (k.startsWith("status")) {
          // key layout: "status <code>" or "status <code> <detail>"
          String[] st = k.split(" ");
          int code = Integer.parseInt(st[1]);
          if (st.length > 2) {
            LOG.info("   {} :\t{}", st[2], val);
          } else {
            LOG.info("{} {} ({}):\t{}", st[0], code,
                CrawlDatum.getStatusName((byte) code), val);
          }
        } else if (k.equals("scd") && val instanceof BytesWritable) {
          // The backing array of a BytesWritable may be longer than the
          // valid data, so only the first getLength() bytes are wrapped.
          BytesWritable serialized = (BytesWritable) val;
          MergingDigest tdigest = MergingDigest.fromBytes(
              ByteBuffer.wrap(serialized.getBytes(), 0, serialized.getLength()));
          for (double q : quantiles) {
            LOG.info("score quantile {}:\t{}", q, tdigest.quantile(q));
          }
        } else {
          LOG.info("{}:\t{}", k, val);
        }
      }
    }
    LOG.info("CrawlDb statistics: done");
  }