in webindex/modules/data/src/main/java/webindex/data/CalcSplits.java [44:70]
/**
 * Command-line entry point: builds the index RDDs from the archives under a data directory and
 * prints the splits returned by {@code IndexUtil.calculateSplits} to standard output.
 *
 * <p>Exactly one argument is expected. With any other argument count a usage message is logged
 * and the JVM exits with status 1.
 *
 * @param args a single element: the data directory to read archives from
 */
public static void main(String[] args) {
  if (args.length != 1) {
    log.error("Usage: CalcSplits <dataDir>");
    System.exit(1);
  }

  final String dataDir = args[0];
  IndexEnv.validateDataDir(dataDir);

  // The context is AutoCloseable; try-with-resources closes it once the splits are printed
  // or if any stage of the pipeline throws.
  try (JavaSparkContext sc =
      new JavaSparkContext(new SparkConf().setAppName("webindex-calcsplits"))) {
    IndexStats indexStats = new IndexStats(sc);

    // Load the data directory through the WARC input format.
    final JavaPairRDD<Text, ArchiveReader> warcArchives = sc.newAPIHadoopFile(dataDir,
        WARCFileInputFormat.class, Text.class, ArchiveReader.class, new Configuration());

    // Each stage below is derived from the results of the previous ones.
    JavaRDD<Page> pageRdd = IndexUtil.createPages(warcArchives);
    JavaPairRDD<String, UriInfo> uris = IndexUtil.createUriMap(pageRdd);
    JavaPairRDD<String, Long> domains = IndexUtil.createDomainMap(uris);
    JavaPairRDD<RowColumn, Bytes> index =
        IndexUtil.createAccumuloIndex(indexStats, pageRdd, uris, domains);

    // NOTE(review): 100 is presumably the requested number of splits - confirm against
    // IndexUtil.calculateSplits.
    SortedSet<Text> splitPoints = IndexUtil.calculateSplits(index, 100);

    log.info("Accumulo splits:");
    // The splits go to stdout rather than the logger, one per line, in the set's order.
    for (Text splitPoint : splitPoints) {
      System.out.println(splitPoint);
    }
  }
}