in webindex/modules/data/src/main/java/webindex/data/Init.java [40:72]
/**
 * Command-line entry point: sets the Fluo table splits and, when a data directory is supplied,
 * builds the initial indexes from the WARC archives found there using Spark.
 *
 * <p>Usage: {@code Init [<dataDir>]}. Passing more than one argument logs the usage string and
 * exits with status 1. When no argument is given, only the table splits are set.
 *
 * @param args optional single element naming the directory of WARC data to load
 * @throws Exception propagated from configuration loading, environment setup, or the Spark job
 */
public static void main(String[] args) throws Exception {
  // At most one (optional) argument is accepted.
  if (args.length > 1) {
    log.error("Usage: Init [<dataDir>]");
    System.exit(1);
  }

  // Table splits are configured regardless of whether a data directory was given.
  IndexEnv env = new IndexEnv(WebIndexConfig.load());
  env.setFluoTableSplits();
  log.info("Initialized Fluo table splits");

  // Nothing further to do without a data directory.
  if (args.length != 1) {
    log.info("An init data dir was not specified");
    return;
  }

  String warcDir = args[0];
  // NOTE(review): failure behavior of validateDataDir is not visible here - confirm in IndexEnv.
  IndexEnv.validateDataDir(warcDir);

  SparkConf conf = new SparkConf();
  conf.setAppName("webindex-init");

  // The Spark context is closed automatically once initialization finishes or fails.
  try (JavaSparkContext sparkContext = new JavaSparkContext(conf)) {
    IndexStats indexStats = new IndexStats(sparkContext);

    // Read the data directory as WARC input, keyed by Text with ArchiveReader values.
    Configuration hadoopConf = new Configuration();
    JavaPairRDD<Text, ArchiveReader> warcArchives = sparkContext.newAPIHadoopFile(warcDir,
        WARCFileInputFormat.class, Text.class, ArchiveReader.class, hadoopConf);

    JavaRDD<Page> parsedPages = IndexUtil.createPages(warcArchives);
    env.initializeIndexes(sparkContext, parsedPages, indexStats);
    indexStats.print();
  }
}