public static void main()

in webindex/modules/data/src/main/java/webindex/data/Copy.java [49:105]


  /**
   * Command-line entry point that copies a range of files listed in a paths file into a
   * destination directory on HDFS using a Spark job.
   *
   * <p>Expected arguments, as printed by the usage message: {@code <pathsFile> <range> <dest>}.
   * The process exits with status 1 when the argument count is wrong or when the requested
   * range yields no paths. The destination directory is created if it does not exist.
   *
   * <p>Each path is fetched from {@code WebIndexConfig.CC_URL_PREFIX + path} and written to
   * {@code <dest>/<filename>}. A file that already exists at the destination is skipped and
   * reported as an error. A failure while copying one file is logged and does not stop the
   * remaining files; the partially written destination file is removed so that a later run does
   * not mistake it for a completed copy.
   *
   * @param args {@code args[0]} paths file, {@code args[1]} range, {@code args[2]} destination
   *     directory
   * @throws Exception if configuration loading, Spark setup, or destination directory setup
   *     fails
   */
  public static void main(String[] args) throws Exception {

    if (args.length != 3) {
      log.error("Usage: Copy <pathsFile> <range> <dest>");
      System.exit(1);
    }
    // Resolved up front and captured by the partition closure below, which uses it to obtain
    // its own FileSystem handle.
    final String hadoopConfDir = IndexEnv.getHadoopConfDir();
    final List<String> copyList = IndexEnv.getPathsRange(args[0], args[1]);
    if (copyList.isEmpty()) {
      log.error("No files to copy given {} {}", args[0], args[1]);
      System.exit(1);
    }

    WebIndexConfig webIndexConfig = WebIndexConfig.load();

    SparkConf sparkConf = new SparkConf().setAppName("webindex-copy");
    try (JavaSparkContext ctx = new JavaSparkContext(sparkConf)) {

      FileSystem hdfs = FileSystem.get(ctx.hadoopConfiguration());
      Path destPath = new Path(args[2]);
      if (!hdfs.exists(destPath)) {
        hdfs.mkdirs(destPath);
      }

      final String prefix = WebIndexConfig.CC_URL_PREFIX;
      final String destDir = destPath.toString();

      log.info("Copying {} files (Range {} of paths file {}) from AWS to HDFS {}", copyList.size(),
          args[1], args[0], destDir);

      JavaRDD<String> copyRDD = ctx.parallelize(copyList, webIndexConfig.getNumExecutorInstances());

      copyRDD.foreachPartition(iter -> {
        FileSystem fs = IndexEnv.getHDFS(hadoopConfDir);
        iter.forEachRemaining(ccPath -> {
          try {
            Path dfsPath = new Path(destDir + "/" + getFilename(ccPath));
            if (fs.exists(dfsPath)) {
              log.error("File {} exists in HDFS and should have been previously filtered",
                  dfsPath.getName());
            } else {
              String urlToCopy = prefix + ccPath;
              log.info("Starting copy of {} to {}", urlToCopy, destDir);
              try (OutputStream out = fs.create(dfsPath);
                  BufferedInputStream in =
                      new BufferedInputStream(new URL(urlToCopy).openStream())) {
                IOUtils.copy(in, out);
              } catch (IOException copyFailure) {
                // Both streams are already closed here. Remove the truncated file so a later
                // run does not see it via fs.exists() and skip it as if it were complete.
                try {
                  fs.delete(dfsPath, false);
                } catch (IOException cleanupFailure) {
                  copyFailure.addSuppressed(cleanupFailure);
                }
                throw copyFailure;
              }
              log.info("Created {}", dfsPath.getName());
            }
          } catch (IOException e) {
            // Best effort: report the failure for this file and continue with the rest.
            log.error("Exception while copying {}", ccPath, e);
          }
        });
      });
    }
  }