public static void main()

in webindex/modules/data/src/main/java/webindex/data/LoadS3.java [47:104]


  /**
   * Loads the archive files selected by a paths file and a range into Fluo, using Spark to
   * spread the work.
   *
   * <p>The list of paths is parallelized with one Spark partition per path. Each partition opens
   * its own Fluo client and loader executor, reads the archive at
   * {@code WebIndexConfig.CC_URL_PREFIX + path} as a WARC stream, and submits a
   * {@code PageLoader.updatePage} loader for every page that has at least one outbound link. A
   * failure while processing one path is logged and does not stop the remaining paths.
   *
   * <p>The process exits with status 1 when the argument count is wrong or when the given paths
   * file and range select no files.
   *
   * @param args exactly two values: {@code <pathsFile>} and {@code <range>}, both passed to
   *        {@code IndexEnv.getPathsRange}
   * @throws Exception if configuration loading or the Spark job fails
   */
  public static void main(String[] args) throws Exception {

    if (args.length != 2) {
      log.error("Usage: LoadS3 <pathsFile> <range>");
      System.exit(1);
    }
    final List<String> loadList = IndexEnv.getPathsRange(args[0], args[1]);
    if (loadList.isEmpty()) {
      log.error("No files to load given {} {}", args[0], args[1]);
      System.exit(1);
    }

    final WebIndexConfig webIndexConfig = WebIndexConfig.load();

    // Copied into final locals so the Spark closure below captures plain values
    // rather than the config object itself.
    final int rateLimit = webIndexConfig.getLoadRateLimit();
    final String appName = webIndexConfig.fluoApp;

    SparkConf sparkConf = new SparkConf().setAppName("webindex-load-s3");
    try (JavaSparkContext ctx = new JavaSparkContext(sparkConf)) {

      log.info("Loading {} files (Range {} of paths file {}) from AWS", loadList.size(), args[1],
          args[0]);

      // One slice per path, so every archive file is handled in its own partition.
      JavaRDD<String> loadRDD = ctx.parallelize(loadList, loadList.size());

      final String prefix = WebIndexConfig.CC_URL_PREFIX;

      loadRDD.foreachPartition(iter -> {
        final FluoConfiguration fluoConfig =
            new FluoConfiguration(new File("fluo-conn.properties"));
        fluoConfig.setApplicationName(appName);
        // A non-positive rate limit disables throttling; the limiter is created per partition.
        final RateLimiter rateLimiter = rateLimit > 0 ? RateLimiter.create(rateLimit) : null;
        try (FluoClient client = FluoFactory.newClient(fluoConfig);
            LoaderExecutor le = client.newLoaderExecutor()) {
          iter.forEachRemaining(path -> {
            String urlToCopy = prefix + path;
            log.info("Loading {} to Fluo", urlToCopy);
            // try-with-resources closes the reader (and the stream behind it) even when
            // iteration fails part-way; previously the reader was never closed.
            try (ArchiveReader reader = WARCReaderFactory.get(new URL(urlToCopy), 0)) {
              for (ArchiveRecord record : reader) {
                Page page = ArchiveUtil.buildPageIgnoreErrors(record);
                int linkCount = page.getOutboundLinks().size();
                // Pages without outbound links are skipped entirely.
                if (linkCount > 0) {
                  log.info("Loading page {} with {} links", page.getUrl(), linkCount);
                  if (rateLimiter != null) {
                    rateLimiter.acquire();
                  }
                  le.execute(PageLoader.updatePage(page));
                }
              }
            } catch (Exception e) {
              // Best effort: log the failing path and continue with the next one.
              log.error("Exception while processing {}", path, e);
            }
          });
        }
      });
    }
  }