in src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java [354:443]
/**
 * Fetches {@code url} and translates the HTTP response into a
 * {@link ProtocolOutput}.
 *
 * <p>
 * The request is issued through {@code getResponse(URL, CrawlDatum, boolean)}.
 * The HTTP status code is always stored in the datum's metadata under
 * {@code Nutch.PROTOCOL_STATUS_CODE_KEY}; when {@code this.responseTime} is
 * set, the elapsed time of the request in milliseconds is stored under
 * {@code RESPONSE_TIME} as well. The response body (or {@code EMPTY_CONTENT}
 * if the body is {@code null}) is wrapped in a {@link Content} together with
 * the response headers.
 * </p>
 *
 * <p>
 * Status code mapping:
 * </p>
 * <ul>
 * <li>200: output with the content only (no explicit status)</li>
 * <li>302, 303, 307: {@code ProtocolStatus.TEMP_MOVED} with the resolved
 * Location</li>
 * <li>304: {@code ProtocolStatus.NOTMODIFIED}</li>
 * <li>any other 3xx: {@code ProtocolStatus.MOVED} with the resolved
 * Location</li>
 * <li>400 and 410: {@code ProtocolStatus.GONE}</li>
 * <li>401: {@code ProtocolStatus.ACCESS_DENIED}</li>
 * <li>404: {@code ProtocolStatus.NOTFOUND}</li>
 * <li>everything else: {@code ProtocolStatus.EXCEPTION}</li>
 * </ul>
 *
 * @param url
 *          the URL to fetch
 * @param datum
 *          crawl datum passed to the request and updated with response
 *          metadata
 * @return the protocol output; if anything is thrown while fetching or
 *         processing the response, an output with {@code null} content and a
 *         {@link ProtocolStatus} built from the throwable
 */
public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
  final String urlString = url.toString();
  try {
    final URL requestUrl = new URL(urlString);

    // Issue the request, timing it so the duration can be recorded.
    final long requestStart = System.currentTimeMillis();
    final Response response = getResponse(requestUrl, datum, false);
    if (this.responseTime) {
      final int elapsedMillis = (int) (System.currentTimeMillis()
          - requestStart);
      datum.getMetaData().put(RESPONSE_TIME, new IntWritable(elapsedMillis));
    }

    final int code = response.getCode();
    final Text codeAsText = new Text(Integer.toString(code));
    datum.getMetaData().put(Nutch.PROTOCOL_STATUS_CODE_KEY, codeAsText);

    // The content object is built for every status code, not only for 200.
    byte[] body = response.getContent();
    if (body == null) {
      body = EMPTY_CONTENT;
    }
    final Content fetched = new Content(requestUrl.toString(),
        requestUrl.toString(), body, response.getHeader("Content-Type"),
        response.getHeaders(), this.mimeTypes);

    // Successful fetch: hand the content back as is.
    if (code == 200) {
      return new ProtocolOutput(fetched);
    }

    // Redirect family (3xx).
    if (code >= 300 && code < 400) {
      String location = response.getHeader("Location");
      if (location == null) {
        // some broken servers, such as MS IIS, use lowercase header name...
        location = response.getHeader("location");
      }
      if (location == null) {
        // no Location header at all: resolve the empty string instead
        location = "";
      }
      final URL target = new URL(requestUrl, location);

      final int redirectStatus;
      if (code == 302 || code == 303 || code == 307) {
        // found / see other / temporary redirect
        redirectStatus = ProtocolStatus.TEMP_MOVED;
      } else if (code == 304) {
        // not modified
        redirectStatus = ProtocolStatus.NOTMODIFIED;
      } else {
        // 300 (multiple choices), 301 (moved permanently), 305 (use proxy)
        // and every remaining 3xx code
        redirectStatus = ProtocolStatus.MOVED;
      }

      // following the redirect is left to the higher layer
      return new ProtocolOutput(fetched,
          new ProtocolStatus(redirectStatus, target));
    }

    // Everything that is neither success nor redirect.
    switch (code) {
    case 400: // bad request, mark as GONE
      if (this.logger.isTraceEnabled()) {
        this.logger.trace("400 Bad request: {}", requestUrl);
      }
      return new ProtocolOutput(fetched,
          new ProtocolStatus(ProtocolStatus.GONE, requestUrl));
    case 401: // requires authorization, but no valid auth provided
      if (this.logger.isTraceEnabled()) {
        this.logger.trace("401 Authentication Required");
      }
      return new ProtocolOutput(fetched, new ProtocolStatus(
          ProtocolStatus.ACCESS_DENIED,
          "Authentication required: " + urlString));
    case 404: // not found
      return new ProtocolOutput(fetched,
          new ProtocolStatus(ProtocolStatus.NOTFOUND, requestUrl));
    case 410: // permanently GONE
      return new ProtocolOutput(fetched, new ProtocolStatus(
          ProtocolStatus.GONE, "Http: " + code + " url=" + requestUrl));
    default:
      return new ProtocolOutput(fetched, new ProtocolStatus(
          ProtocolStatus.EXCEPTION,
          "Http code=" + code + ", url=" + requestUrl));
    }
  } catch (Throwable e) {
    // Log only the exception class name when debug logging is off and
    // logUtil.logShort(e) returns true; otherwise log the full stack trace.
    final boolean classNameOnly = !this.logger.isDebugEnabled()
        && this.logUtil.logShort(e);
    if (classNameOnly) {
      this.logger.error("Failed to get protocol output: {}",
          e.getClass().getName());
    } else {
      this.logger.error("Failed to get protocol output", e);
    }
    return new ProtocolOutput(null, new ProtocolStatus(e));
  }
}