in webindex/modules/data/src/main/java/webindex/data/TestParser.java [39:76]
/**
 * Entry point that reads every archive in a range of a paths file and builds a page from each
 * record, discarding the results. Page-building errors are ignored via
 * {@link ArchiveUtil#buildPageIgnoreErrors}.
 *
 * <p>The work is distributed with Spark: the selected paths are parallelized with one partition
 * per path, and each path is opened as a WARC archive at {@code CC_URL_PREFIX + path}.
 *
 * @param args exactly two arguments: the paths file and the range of paths to select from it
 * @throws Exception if configuration loading or Spark setup fails; failures while processing an
 *         individual archive are logged and do not propagate
 */
public static void main(String[] args) throws Exception {
  if (args.length != 2) {
    log.error("Usage: TestParser <pathsFile> <range>");
    System.exit(1);
  }
  final List<String> loadList = IndexEnv.getPathsRange(args[0], args[1]);
  if (loadList.isEmpty()) {
    log.error("No files to load given {} {}", args[0], args[1]);
    System.exit(1);
  }

  WebIndexConfig.load();

  SparkConf sparkConf = new SparkConf().setAppName("webindex-test-parser");
  try (JavaSparkContext ctx = new JavaSparkContext(sparkConf)) {
    log.info("Parsing {} files (Range {} of paths file {}) from AWS", loadList.size(), args[1],
        args[0]);

    // One partition per path, so each archive can be handled by its own task.
    JavaRDD<String> loadRDD = ctx.parallelize(loadList, loadList.size());

    // Copy the prefix into a local so the lambda captures a plain String.
    final String prefix = WebIndexConfig.CC_URL_PREFIX;

    loadRDD.foreachPartition(iter -> iter.forEachRemaining(path -> {
      String urlToCopy = prefix + path;
      log.info("Parsing {}", urlToCopy);
      // try-with-resources closes the reader (and its underlying stream) even when iteration
      // fails part-way; previously the reader was never closed.
      try (ArchiveReader reader = WARCReaderFactory.get(new URL(urlToCopy), 0)) {
        for (ArchiveRecord record : reader) {
          ArchiveUtil.buildPageIgnoreErrors(record);
        }
      } catch (Exception e) {
        // Best effort: a bad archive is logged and skipped so the remaining paths still run.
        log.error("Exception while processing {}", path, e);
      }
    }));
  }
}