in webindex/modules/data/src/main/java/webindex/data/Copy.java [49:105]
public static void main(String[] args) throws Exception {
  if (args.length != 3) {
    log.error("Usage: Copy <pathsFile> <range> <dest>");
    System.exit(1);
  }
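
  // Resolve the Hadoop config dir and the slice of the paths file (selected by <range>) to copy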
  final String hadoopConfDir = IndexEnv.getHadoopConfDir();
  final List<String> copyList = IndexEnv.getPathsRange(args[0], args[1]);
  if (copyList.isEmpty()) {
    log.error("No files to copy given {} {}", args[0], args[1]);
    System.exit(1);
  }
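
  // Load the webindex configuration and run the copy as a Spark job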
  WebIndexConfig webIndexConfig = WebIndexConfig.load();

  SparkConf sparkConf = new SparkConf().setAppName("webindex-copy");
  try (JavaSparkContext ctx = new JavaSparkContext(sparkConf)) {
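    // Create the HDFS destination directory if it does not already exist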
    FileSystem hdfs = FileSystem.get(ctx.hadoopConfiguration());
    Path destPath = new Path(args[2]);
    if (!hdfs.exists(destPath)) {
      hdfs.mkdirs(destPath);
    }

    log.info("Copying {} files (Range {} of paths file {}) from AWS to HDFS {}", copyList.size(),
        args[1], args[0], destPath.toString());
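
    // Distribute the path list across executors so the copies run in parallel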
    JavaRDD<String> copyRDD = ctx.parallelize(copyList, webIndexConfig.getNumExecutorInstances());

    final String prefix = WebIndexConfig.CC_URL_PREFIX;
    final String destDir = destPath.toString();
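
    // Copy each partition on an executor; the closure only needs the String values captured
    // above (prefix, destDir, hadoopConfDir)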
    copyRDD.foreachPartition(iter -> {
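      // Each partition gets its own HDFS client, created on the executor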
      FileSystem fs = IndexEnv.getHDFS(hadoopConfDir);
      iter.forEachRemaining(ccPath -> {
        try {
          Path dfsPath = new Path(destDir + "/" + getFilename(ccPath));
          if (fs.exists(dfsPath)) {
            log.error("File {} exists in HDFS and should have been previously filtered",
                dfsPath.getName());
          } else {
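            // Stream the source URL directly into a new file in HDFS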
            String urlToCopy = prefix + ccPath;
            log.info("Starting copy of {} to {}", urlToCopy, destDir);
            try (OutputStream out = fs.create(dfsPath);
                BufferedInputStream in =
                    new BufferedInputStream(new URL(urlToCopy).openStream())) {
              IOUtils.copy(in, out);
            }
            log.info("Created {}", dfsPath.getName());
          }
        } catch (IOException e) {
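          // Log and continue so a single failed copy does not abort the rest of the partition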
log.error("Exception while copying {}", ccPath, e);
}
});
});
}
}