in src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java [159:260]
/**
 * Updates the crawl datum for one record, parses its content and writes the
 * datum, the content and every resulting parse to the given context.
 *
 * <p>
 * The status and fetch time are always set on the datum, and the protocol
 * status is stored in the datum metadata when it is not null. All remaining
 * work (parsing and writing) only happens when {@code content} is not null;
 * nothing is written otherwise.
 * </p>
 *
 * <p>
 * Failures while passing scores or parsing are logged as warnings and do not
 * abort processing. An {@link IOException} raised while writing is logged as
 * an error and swallowed.
 * </p>
 *
 * @param context
 *          the context the datum, content and parses are written to
 * @param segmentName
 *          the segment name stored in the content and parse metadata
 * @param key
 *          the key (url) the datum and content are written under
 * @param datum
 *          the crawl datum to update and write
 * @param content
 *          the content to parse and write, may be null
 * @param pstatus
 *          the protocol status to store in the datum metadata, may be null
 * @param status
 *          the status value set on the datum
 * @return the parse status of the parse stored under {@code content.getUrl()},
 *         or null if there is no content, no parse result, or no parse for
 *         that url
 * @throws InterruptedException
 *           propagated from the underlying calls (not handled here)
 */
private ParseStatus output(Context context,
    String segmentName, Text key, CrawlDatum datum, Content content,
    ProtocolStatus pstatus, int status) throws InterruptedException {

  // set the fetch status and the fetch time
  datum.setStatus(status);
  datum.setFetchTime(System.currentTimeMillis());
  if (pstatus != null) {
    datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus);
  }

  ParseResult parseResult = null;
  if (content != null) {
    Metadata metadata = content.getMetadata();

    // add segment to metadata
    metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName);

    // add score to content metadata so that ParseSegment can pick it up.
    try {
      scfilters.passScoreBeforeParsing(key, datum, content);
    } catch (Exception e) {
      if (LOG.isWarnEnabled()) {
        LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
      }
    }

    try {
      // parse the content
      parseResult = parseUtil.parse(content);
    } catch (Exception e) {
      // parseResult stays null; handled by the fallback signature below
      LOG.warn("Error parsing: " + key + ": "
          + StringUtils.stringifyException(e));
    }

    // set the content signature: without a parse result the signature is
    // calculated once against an empty parse
    if (parseResult == null) {
      byte[] signature = SignatureFactory.getSignature(conf).calculate(
          content, new ParseStatus().getEmptyParse(conf));
      datum.setSignature(signature);
    }

    try {
      // NOTE(review): the datum is written before the loop below may call
      // datum.setSignature(...); whether the written record reflects that
      // later update depends on context.write semantics - confirm.
      context.write(key, new NutchWritable(datum));
      context.write(key, new NutchWritable(content));
      if (parseResult != null) {
        for (Entry<Text, Parse> entry : parseResult) {
          Text url = entry.getKey();
          Parse parse = entry.getValue();
          ParseStatus parseStatus = parse.getData().getStatus();

          // replace an unsuccessful parse with an empty one
          if (!parseStatus.isSuccess()) {
            LOG.warn("Error parsing: " + key + ": " + parseStatus);
            parse = parseStatus.getEmptyParse(conf);
          }

          // Calculate page signature.
          byte[] signature = SignatureFactory.getSignature(conf)
              .calculate(content, parse);

          // Ensure segment name and score are in parseData metadata
          parse.getData().getContentMeta()
              .set(Nutch.SEGMENT_NAME_KEY, segmentName);
          parse.getData().getContentMeta()
              .set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));

          // Pass fetch time to content meta
          parse.getData().getContentMeta()
              .set(Nutch.FETCH_TIME_KEY, Long.toString(datum.getFetchTime()));

          // only the parse stored under the record's own key updates the datum
          if (url.equals(key)) {
            datum.setSignature(signature);
          }

          try {
            scfilters.passScoreAfterParsing(url, content, parse);
          } catch (Exception e) {
            if (LOG.isWarnEnabled()) {
              LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
            }
          }

          context.write(url, new NutchWritable(new ParseImpl(new ParseText(
              parse.getText()), parse.getData(), parse.isCanonical())));
        }
      }
    } catch (IOException e) {
      if (LOG.isErrorEnabled()) {
        LOG.error("ArcSegmentCreator caught:"
            + StringUtils.stringifyException(e));
      }
    }

    // return the parse status of the content's own url, if present
    if (parseResult != null && !parseResult.isEmpty()) {
      Parse p = parseResult.get(content.getUrl());
      if (p != null) {
        return p.getData().getStatus();
      }
    }
  }
  return null;
}