public static void main()

in webindex/modules/data/src/main/java/webindex/data/CalcSplits.java [44:70]


  /**
   * Command-line entry point: builds the index RDDs from the archives found under a data
   * directory and prints the splits that {@code IndexUtil.calculateSplits} returns for them.
   *
   * <p>Exactly one argument is expected. With any other argument count a usage message is
   * logged at error level and the JVM exits with status 1.
   *
   * @param args a single element: the data directory to read archives from
   */
  public static void main(String[] args) {
    if (args.length != 1) {
      log.error("Usage: CalcSplits <dataDir>");
      System.exit(1);
    }
    final String inputDir = args[0];
    IndexEnv.validateDataDir(inputDir);

    // try-with-resources closes the Spark context on both normal and exceptional exit.
    try (JavaSparkContext sparkContext =
        new JavaSparkContext(new SparkConf().setAppName("webindex-calcsplits"))) {

      // Created before any RDD is defined, matching the original construction order.
      IndexStats indexStats = new IndexStats(sparkContext);

      Configuration hadoopConf = new Configuration();
      final JavaPairRDD<Text, ArchiveReader> warcArchives = sparkContext.newAPIHadoopFile(
          inputDir, WARCFileInputFormat.class, Text.class, ArchiveReader.class, hadoopConf);

      // The page and URI RDDs each feed more than one later stage, so they stay named.
      JavaRDD<Page> pageRdd = IndexUtil.createPages(warcArchives);
      JavaPairRDD<String, UriInfo> uris = IndexUtil.createUriMap(pageRdd);
      JavaPairRDD<String, Long> domains = IndexUtil.createDomainMap(uris);
      JavaPairRDD<RowColumn, Bytes> index =
          IndexUtil.createAccumuloIndex(indexStats, pageRdd, uris, domains);

      // NOTE(review): 100 is presumably the requested number of splits -- confirm against
      // IndexUtil.calculateSplits.
      SortedSet<Text> splitPoints = IndexUtil.calculateSplits(index, 100);

      // The heading goes through the logger; the splits themselves go to stdout, one per line.
      log.info("Accumulo splits:");
      for (Text splitPoint : splitPoints) {
        System.out.println(splitPoint);
      }
    }
  }