public int run()

in src/java/org/apache/nutch/crawl/CrawlDbReader.java [1033:1151]


  /**
   * Command-line entry point: parses the arguments and dispatches to the
   * matching CrawlDb operation.
   *
   * <p>
   * <code>args[0]</code> is taken as the CrawlDb location; the remaining
   * arguments are processed left to right. Recognized commands are
   * <code>-stats</code>, <code>-dump</code>, <code>-url</code> and
   * <code>-topN</code>; any other argument is offered to
   * <code>super.parseArgs</code>. If the last call to
   * <code>super.parseArgs</code> consumed at least one argument, the method
   * finishes by delegating to <code>super.run()</code>.
   * </p>
   *
   * @param args
   *          command-line arguments, CrawlDb location first
   * @return <code>-1</code> if fewer than two arguments are given, if a
   *         command or option is missing a required value, or if an argument
   *         is not recognized; the result of <code>super.run()</code> when
   *         the superclass consumed arguments; <code>0</code> otherwise
   * @throws IOException
   *           propagated from the invoked operations
   * @throws InterruptedException
   *           propagated from the invoked operations
   * @throws ClassNotFoundException
   *           propagated from the invoked operations
   * @throws Exception
   *           any other failure, including a {@link NumberFormatException}
   *           for a malformed numeric value
   */
  public int run(String[] args) throws IOException, InterruptedException,
      ClassNotFoundException, Exception {
    @SuppressWarnings("resource")
    CrawlDbReader dbr = new CrawlDbReader();

    if (args.length < 2) {
      System.err.println(
          "Usage: CrawlDbReader <crawldb> (-stats | -dump <out_dir> | -topN <nnnn> <out_dir> [<min>] | -url <url> | -listen <port>)");
      System.err
          .println("\t<crawldb>\tdirectory name where crawldb is located");
      System.err
          .println("\t-stats [-sort] \tprint overall statistics to System.out");
      System.err.println("\t\t[-sort]\tlist status sorted by host");
      System.err.println(
          "\t-dump <out_dir> [-format normal|csv|crawldb|json]\tdump the whole db to a text file in <out_dir>");
      System.err.println("\t\t[-format csv]\tdump in Csv format");
      System.err.println(
          "\t\t[-format normal]\tdump in standard format (default option)");
      System.err.println("\t\t[-format crawldb]\tdump as CrawlDB");
      System.err.println("\t\t[-format json]\tdump in JSON Lines format");
      System.err.println("\t\t[-regex <expr>]\tfilter records with expression");
      System.err.println("\t\t[-retry <num>]\tminimum retry count");
      System.err.println(
          "\t\t[-status <status>]\tfilter records by CrawlDatum status");
      System.err.println(
          "\t\t[-expr <expr>]\tJexl expression to execute for this record");
      System.err.println(
          "\t\t[-sample <fraction>]\tOnly process a random sample with this ratio");
      System.err
          .println("\t-url <url>\tprint information on <url> to System.out");
      System.err
        .println("\t-listen <port> [-keepClientCnxOpen]\tlisten on <port> for URLs and");
      System.err
            .println("\t\t\tsend information about <url> back");
      System.err.println(
          "\t-topN <nnnn> <out_dir> [<min>]\tdump top <nnnn> urls sorted by score to <out_dir>");
      System.err
          .println("\t\t[<min>]\tskip records with scores below this value.");
      System.err.println("\t\t\tThis can significantly improve performance.");
      return -1;
    }
    String param = null;
    String crawlDb = args[0];
    this.crawlDb = crawlDb;
    // Result of the most recent super.parseArgs call; checked after the loop.
    int numConsumed = 0;
    Configuration config = getConf();

    for (int i = 1; i < args.length; i++) {
      if (args[i].equals("-stats")) {
        boolean toSort = false;
        if (i < args.length - 1 && "-sort".equals(args[i + 1])) {
          toSort = true;
          i++;
        }
        dbr.processStatJob(crawlDb, config, toSort);
      } else if (args[i].equals("-dump")) {
        // Report a missing <out_dir> instead of failing with an
        // ArrayIndexOutOfBoundsException.
        if (i + 1 >= args.length) {
          System.err.println("\nError: missing <out_dir> for -dump");
          return -1;
        }
        param = args[++i];
        String format = "normal";
        String regex = null;
        Integer retry = null;
        String status = null;
        String expr = null;
        Float sample = null;
        // Scan the remaining arguments for -dump options; arguments that are
        // not -dump options are skipped here and left to the outer loop.
        for (int j = i + 1; j < args.length; j++) {
          String opt = args[j];
          boolean dumpOption = opt.equals("-format") || opt.equals("-regex")
              || opt.equals("-retry") || opt.equals("-status")
              || opt.equals("-expr") || opt.equals("-sample");
          if (!dumpOption) {
            continue;
          }
          if (j + 1 >= args.length) {
            System.err.println("\nError: missing value for " + opt);
            return -1;
          }
          // Consume the option together with its value so that the value is
          // never re-interpreted as another option name.
          String value = args[++j];
          // Each option/value pair also advances the outer loop index.
          i = i + 2;
          if (opt.equals("-format")) {
            format = value;
          } else if (opt.equals("-regex")) {
            regex = value;
          } else if (opt.equals("-retry")) {
            retry = Integer.parseInt(value);
          } else if (opt.equals("-status")) {
            status = value;
          } else if (opt.equals("-expr")) {
            expr = value;
          } else {
            // Only -sample remains.
            sample = Float.parseFloat(value);
          }
        }
        dbr.processDumpJob(crawlDb, param, config, format, regex, status, retry,
            expr, sample);
      } else if (args[i].equals("-url")) {
        if (i + 1 >= args.length) {
          System.err.println("\nError: missing <url> for -url");
          return -1;
        }
        param = args[++i];
        StringBuilder output = new StringBuilder();
        dbr.readUrl(crawlDb, param, config, output);
        System.out.print(output);
      } else if (args[i].equals("-topN")) {
        // Both <nnnn> and <out_dir> are mandatory.
        if (i + 2 >= args.length) {
          System.err.println("\nError: -topN requires <nnnn> and <out_dir>");
          return -1;
        }
        param = args[++i];
        long topN = Long.parseLong(param);
        param = args[++i];
        float min = 0.0f;
        // Any further argument is taken as the optional <min> score.
        if (i < args.length - 1) {
          min = Float.parseFloat(args[++i]);
        }
        dbr.processTopNJob(crawlDb, topN, min, param, config);
      } else if ((numConsumed = super.parseArgs(args, i)) > 0) {
        // Skip the arguments the superclass consumed; the loop's own i++
        // accounts for the remaining one.
        i += numConsumed - 1;
      } else {
        System.err.println("\nError: wrong argument " + args[i]);
        return -1;
      }
    }

    if (numConsumed > 0) {
      // Start listening
      return super.run();
    }
    return 0;
  }