in src/java/org/apache/nutch/tools/FileDumper.java [133:298]
/**
 * Dumps the content of the records stored in the segments below
 * {@code segmentRootDir} as individual files below {@code outputDir}.
 * <p>
 * Every readable sub-directory of {@code segmentRootDir} is treated as a
 * segment. For each segment, every readable part directory below its
 * {@link Content#DIR_NAME} directory is read record by record. The MIME type
 * of each record is detected with Tika from the raw bytes and counted; records
 * whose type is accepted are written to disk unless {@code mimeTypeStats} is
 * set. Files that already exist are never overwritten. After each segment a
 * JSON file named after the segment with the suffix
 * {@code _filenameToUrl.json} is written into {@code outputDir}; it maps every
 * output path computed for that segment to its URL.
 *
 * @param outputDir
 *          directory that receives the dumped files and the mapping files
 * @param segmentRootDir
 *          directory whose readable sub-directories are the segments to dump
 * @param mimeTypes
 *          MIME types to dump; {@code null} accepts every detected type
 * @param flatDir
 *          if {@code true} files are written directly into {@code outputDir};
 *          otherwise (and only when {@code reverseURLDump} is {@code false})
 *          into the directory returned by
 *          {@code DumpFileUtil.createTwoLevelsDirectory} for the URL's MD5
 * @param mimeTypeStats
 *          if {@code true} no content file is written; only the type counts
 *          are collected and additionally printed to standard output
 * @param reverseURLDump
 *          if {@code true} the output path is derived from the reversed URL
 *          and the upper-cased SHA-256 hex digest of the URL
 * @throws Exception
 *           if a part cannot be read, a directory needed for a reversed-URL
 *           path cannot be created, or a mapping file cannot be written
 */
public void dump(File outputDir, File segmentRootDir, String[] mimeTypes, boolean
    flatDir, boolean mimeTypeStats, boolean reverseURLDump) throws Exception {
  if (mimeTypes == null) {
    LOG.info("Accepting all mimetypes.");
  }
  // total file counts
  Map<String, Integer> typeCounts = new HashMap<>();
  // filtered file counts
  Map<String, Integer> filteredCounts = new HashMap<>();
  Configuration conf = NutchConfiguration.create();
  // One detector for the whole run instead of a new Tika facade per record.
  Tika tika = new Tika();

  File[] segmentDirs = segmentRootDir.listFiles(dir -> dir.canRead() && dir.isDirectory());
  if (segmentDirs == null) {
    LOG.error("No segment directories found in ["
        + segmentRootDir.getAbsolutePath() + "]");
    return;
  }

  for (File segment : segmentDirs) {
    LOG.info("Processing segment: [" + segment.getAbsolutePath() + "]");
    Map<String, String> filenameToUrl = new HashMap<>();
    File segmentDir = new File(segment.getAbsolutePath(), Content.DIR_NAME);
    File[] partDirs = segmentDir.listFiles(dir -> dir.canRead() && dir.isDirectory());
    if (partDirs == null) {
      LOG.warn("Skipping Corrupt Segment: [{}]", segment.getAbsolutePath());
      continue;
    }

    for (File partDir : partDirs) {
      String segmentPath = partDir + "/data";
      Path dataPath = new Path(segmentPath);
      if (!new File(dataPath.toString()).exists()) {
        LOG.warn("Skipping segment: [" + segmentPath
            + "]: no data directory present");
        continue;
      }

      // NOTE(review): fs is never used directly; it is kept only so that the
      // FileSystem is still closed after each part as before - confirm whether
      // closing it here is actually wanted.
      // The reader is a managed resource so it is closed even when reading or
      // dumping a record throws.
      try (FileSystem fs = FileSystem.get(conf);
          SequenceFile.Reader reader = new SequenceFile.Reader(conf,
              SequenceFile.Reader.file(dataPath))) {
        Writable key = (Writable) reader.getKeyClass().getConstructor().newInstance();
        while (reader.next(key)) {
          Content content = new Content();
          reader.getCurrentValue(content);
          String url = key.toString();

          boolean accepted = detectAndCountType(tika, url, content, mimeTypes,
              typeCounts, filteredCounts);
          if (!accepted || mimeTypeStats) {
            // Rejected type, or statistics-only run: nothing to write.
            continue;
          }

          String outputFullPath = buildOutputPath(outputDir, url, flatDir, reverseURLDump);
          if (outputFullPath == null) {
            continue;
          }
          filenameToUrl.put(outputFullPath, url);

          File outputFile = new File(outputFullPath);
          if (outputFile.exists()) {
            LOG.info("Skipping writing: [" + outputFullPath
                + "]: file already exists");
            continue;
          }

          LOG.info("Writing: [" + outputFullPath + "]");
          // Writing is best effort: a failure (including one raised while
          // flushing/closing the stream) is logged and the dump continues.
          try (FileOutputStream output = new FileOutputStream(outputFile)) {
            IOUtils.write(content.getContent(), output);
          } catch (Exception e) {
            LOG.warn("Write Error: [" + outputFullPath + "]", e);
          }
        }
      }
    }

    //save filenameToUrl in a json file for each segment there is one mapping file
    String filenameToUrlFilePath = String.format("%s/%s_filenameToUrl.json",
        outputDir.getAbsolutePath(), segment.getName());
    new ObjectMapper().writeValue(new File(filenameToUrlFilePath), filenameToUrl);
  }

  LOG.info("Dumper File Stats: "
      + DumpFileUtil.displayFileTypes(typeCounts, filteredCounts));
  if (mimeTypeStats) {
    System.out.println("Dumper File Stats: "
        + DumpFileUtil.displayFileTypes(typeCounts, filteredCounts));
  }
}

/**
 * Detects the MIME type of a record, updates the counters and reports whether
 * the record passes the MIME type filter.
 *
 * @param tika
 *          detector used on the raw content bytes
 * @param url
 *          URL of the record, used only in the warning message
 * @param content
 *          record whose bytes are inspected
 * @param mimeTypes
 *          accepted MIME types; {@code null} accepts every detected type
 * @param typeCounts
 *          counter updated for every detected type
 * @param filteredCounts
 *          counter updated only for accepted types
 * @return {@code true} if a type was detected and accepted; {@code false} if
 *         it was rejected, could not be detected, or detection failed
 */
private boolean detectAndCountType(Tika tika, String url, Content content,
    String[] mimeTypes, Map<String, Integer> typeCounts,
    Map<String, Integer> filteredCounts) {
  try {
    String mimeType = tika.detect(content.getContent());
    collectStats(typeCounts, mimeType);
    if (mimeType != null
        && (mimeTypes == null || Arrays.asList(mimeTypes).contains(mimeType))) {
      collectStats(filteredCounts, mimeType);
      return true;
    }
  } catch (Exception e) {
    LOG.warn("Tika is unable to detect type for: [" + url + "]", e);
  }
  return false;
}

/**
 * Computes the path of the file a record is dumped to, creating the parent
 * directory when a reversed-URL layout is requested.
 *
 * @param outputDir
 *          root directory of the dump
 * @param url
 *          URL of the record
 * @param flatDir
 *          whether files go directly into {@code outputDir}
 * @param reverseURLDump
 *          whether the path is derived from the reversed URL
 * @return the output path, or {@code null} if no target directory is
 *         available (the directory string is null or empty)
 * @throws Exception
 *           if the directory for a reversed-URL path does not exist and cannot
 *           be created, or the URL cannot be reversed
 */
private String buildOutputPath(File outputDir, String url, boolean flatDir,
    boolean reverseURLDump) throws Exception {
  String md5Ofurl = DumpFileUtil.getUrlMD5(url);
  String fullDir = outputDir.getAbsolutePath();
  if (!flatDir && !reverseURLDump) {
    fullDir = DumpFileUtil.createTwoLevelsDirectory(fullDir, md5Ofurl);
  }
  if (Strings.isNullOrEmpty(fullDir)) {
    return null;
  }

  if (!reverseURLDump) {
    String baseName = FilenameUtils.getBaseName(url);
    String extension = FilenameUtils.getExtension(url);
    if (Strings.isNullOrEmpty(extension)) {
      // URLs without an extension are stored as html.
      extension = "html";
    }
    return String.format("%s/%s", fullDir,
        DumpFileUtil.createFileName(md5Ofurl, baseName, extension));
  }

  // Only the part of the reversed URL before the first ':' is used; its dots
  // become directory separators and the file itself is named by the digest.
  String[] reversedURL = TableUtil.reverseUrl(url).split(":");
  reversedURL[0] = reversedURL[0].replace('.', '/');
  String reversedURLPath = reversedURL[0] + "/" + DigestUtils.sha256Hex(url).toUpperCase();
  String outputFullPath = String.format("%s/%s", fullDir, reversedURLPath);

  // Create the nested directory structure (everything but the trailing file
  // name) if it doesn't already exist. Fail only when the creation itself
  // fails; a stray ';' after the mkdirs() check used to make this throw
  // unconditionally for every new directory.
  File fullOutputDir = new File(outputFullPath).getParentFile();
  if (!fullOutputDir.exists() && !fullOutputDir.mkdirs()) {
    throw new Exception("Unable to create: ["
        + fullOutputDir.getAbsolutePath() + "]");
  }
  return outputFullPath;
}