public static void main()

in webindex/modules/data/src/main/java/webindex/data/TestParser.java [39:76]


  /**
   * Entry point that parses a range of archive files through Spark, without storing the results.
   *
   * <p>The paths to parse are obtained from {@code IndexEnv.getPathsRange(args[0], args[1])}.
   * Each path is appended to {@code WebIndexConfig.CC_URL_PREFIX}, opened as a WARC archive, and
   * every record in it is passed to {@code ArchiveUtil.buildPageIgnoreErrors}. The built pages
   * are discarded. A failure while processing one path is logged and does not stop the
   * remaining paths.
   *
   * <p>The process exits with status 1 when the argument count is wrong or when the requested
   * range yields no paths.
   *
   * @param args exactly two values: the paths file and the range to select from it
   * @throws Exception if configuration loading or the Spark job itself fails
   */
  public static void main(String[] args) throws Exception {

    if (args.length != 2) {
      log.error("Usage: TestParser <pathsFile> <range>");
      System.exit(1);
    }
    final List<String> loadList = IndexEnv.getPathsRange(args[0], args[1]);
    if (loadList.isEmpty()) {
      log.error("No files to load given {} {}", args[0], args[1]);
      System.exit(1);
    }

    WebIndexConfig.load();

    SparkConf sparkConf = new SparkConf().setAppName("webindex-test-parser");
    try (JavaSparkContext ctx = new JavaSparkContext(sparkConf)) {

      log.info("Parsing {} files (Range {} of paths file {}) from AWS", loadList.size(), args[1],
          args[0]);

      // The second argument asks Spark for as many slices as there are paths.
      JavaRDD<String> loadRDD = ctx.parallelize(loadList, loadList.size());

      // Read the prefix once here so the lambda below captures the value itself.
      // NOTE(review): presumably needed because WebIndexConfig.load() only runs in this JVM and
      // not on the executors - confirm.
      final String prefix = WebIndexConfig.CC_URL_PREFIX;

      loadRDD.foreachPartition(iter -> iter.forEachRemaining(path -> {
        String urlToCopy = prefix + path;
        log.info("Parsing {}", urlToCopy);
        // try-with-resources closes the reader (and the stream behind it) after the last record
        // and also when a record fails; previously the reader was never closed.
        try (ArchiveReader reader = WARCReaderFactory.get(new URL(urlToCopy), 0)) {
          for (ArchiveRecord record : reader) {
            ArchiveUtil.buildPageIgnoreErrors(record);
          }
        } catch (Exception e) {
          // Best effort: log the failure for this path and continue with the next one.
          log.error("Exception while processing {}", path, e);
        }
      }));
    }
  }