in src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java [76:343]
public HttpResponse(HttpBase http, URL url, CrawlDatum datum)
throws ProtocolException, IOException {
this.http = http;
this.url = url;
Scheme scheme = null;
if ("http".equals(url.getProtocol())) {
scheme = Scheme.HTTP;
} else if ("https".equals(url.getProtocol())) {
scheme = Scheme.HTTPS;
} else {
throw new HttpException("Unknown scheme (not http/https) for url:" + url);
}
if (Http.LOG.isTraceEnabled()) {
Http.LOG.trace("fetching " + url);
}
String path = url.getFile();
if (!path.startsWith("/")) {
path = "/" + path;
}
// some servers will redirect a request with a host line like
// "Host: <hostname>:80" to "http://<hpstname>/<orig_path>"- they
// don't want the :80...
String host = url.getHost();
int port;
String portString;
if (url.getPort() == -1) {
if (scheme == Scheme.HTTP) {
port = 80;
} else {
port = 443;
}
portString = "";
} else {
port = url.getPort();
portString = ":" + port;
}
Socket socket = null;
try {
socket = new Socket(); // create the socket
socket.setSoTimeout(http.getTimeout());
// connect
String sockHost = http.useProxy(url) ? http.getProxyHost() : host;
int sockPort = http.useProxy(url) ? http.getProxyPort() : port;
InetSocketAddress sockAddr = new InetSocketAddress(sockHost, sockPort);
socket.connect(sockAddr, http.getTimeout());
if (scheme == Scheme.HTTPS) {
SSLSocket sslsocket = null;
try {
sslsocket = getSSLSocket(socket, sockHost, sockPort);
sslsocket.startHandshake();
} catch (Exception e) {
Http.LOG.debug("SSL connection to {} failed with: {}", url,
e.getMessage());
if ("handshake alert: unrecognized_name".equals(e.getMessage())) {
try {
// Reconnect, see NUTCH-2447
socket = new Socket();
socket.setSoTimeout(http.getTimeout());
socket.connect(sockAddr, http.getTimeout());
sslsocket = getSSLSocket(socket, "", sockPort);
sslsocket.startHandshake();
} catch (Exception ex) {
String msg = "SSL reconnect to " + url + " failed with: "
+ e.getMessage();
throw new HttpException(msg);
}
} else {
String msg = "SSL connect to " + url + " failed with: "
+ e.getMessage();
throw new HttpException(msg, e);
}
}
socket = sslsocket;
}
if (http.isStoreIPAddress()) {
headers.add("_ip_", sockAddr.getAddress().getHostAddress());
}
// make request
OutputStream req = socket.getOutputStream();
StringBuffer reqStr = new StringBuffer("GET ");
if (http.useProxy(url)) {
reqStr.append(url.getProtocol() + "://" + host + portString + path);
} else {
reqStr.append(path);
}
if (http.getUseHttp11()) {
reqStr.append(" HTTP/1.1\r\n");
} else {
reqStr.append(" HTTP/1.0\r\n");
}
reqStr.append("Host: ");
reqStr.append(host);
reqStr.append(portString);
reqStr.append("\r\n");
reqStr.append("Accept-Encoding: x-gzip, gzip, deflate\r\n");
String userAgent = http.getUserAgent();
if ((userAgent == null) || (userAgent.length() == 0)) {
if (Http.LOG.isErrorEnabled()) {
Http.LOG.error("User-agent is not set!");
}
} else {
reqStr.append("User-Agent: ");
reqStr.append(userAgent);
reqStr.append("\r\n");
}
String acceptLanguage = http.getAcceptLanguage();
if (!acceptLanguage.isEmpty()) {
reqStr.append("Accept-Language: ");
reqStr.append(acceptLanguage);
reqStr.append("\r\n");
}
String acceptCharset = http.getAcceptCharset();
if (!acceptCharset.isEmpty()) {
reqStr.append("Accept-Charset: ");
reqStr.append(acceptCharset);
reqStr.append("\r\n");
}
String accept = http.getAccept();
if (!accept.isEmpty()) {
reqStr.append("Accept: ");
reqStr.append(accept);
reqStr.append("\r\n");
}
if (http.isCookieEnabled()) {
String cookie = null;
if (datum.getMetaData().containsKey(HttpBase.COOKIE)) {
cookie = ((Text)datum.getMetaData().get(HttpBase.COOKIE)).toString();
}
if (cookie == null) {
cookie = http.getCookie(url);
}
if (cookie != null) {
reqStr.append("Cookie: ");
reqStr.append(cookie);
reqStr.append("\r\n");
}
}
if (http.isIfModifiedSinceEnabled() && datum.getModifiedTime() > 0) {
reqStr.append(HttpHeaders.IF_MODIFIED_SINCE + ": "
+ HttpDateFormat.toString(datum.getModifiedTime()));
reqStr.append("\r\n");
}
// "signal that this connection will be closed after completion of the
// response", see https://tools.ietf.org/html/rfc7230#section-6.1
reqStr.append("Connection: close\r\n");
reqStr.append("\r\n");
// store the request in the metadata?
if (http.isStoreHttpRequest()) {
headers.add(Response.REQUEST, reqStr.toString());
}
byte[] reqBytes = reqStr.toString().getBytes();
req.write(reqBytes);
req.flush();
PushbackInputStream in = // process response
new PushbackInputStream(
new BufferedInputStream(socket.getInputStream(),
Http.BUFFER_SIZE), Http.BUFFER_SIZE);
StringBuffer line = new StringBuffer();
StringBuffer lineSeparator = new StringBuffer();
// store the http headers verbatim
if (http.isStoreHttpHeaders()) {
httpHeaders = new StringBuffer();
}
headers.add(FETCH_TIME, Long.toString(System.currentTimeMillis()));
boolean haveSeenNonContinueStatus = false;
while (!haveSeenNonContinueStatus) {
// parse status code line
try {
this.code = parseStatusLine(in, line, lineSeparator);
} catch(HttpException e) {
Http.LOG.warn("Missing or invalid HTTP status line", e);
Http.LOG.warn("No HTTP header, assuming HTTP/0.9 for {}", getUrl());
this.code = 200;
in.unread(lineSeparator.toString().getBytes(StandardCharsets.ISO_8859_1));
in.unread(line.toString().getBytes(StandardCharsets.ISO_8859_1));
break;
}
if (httpHeaders != null)
httpHeaders.append(line).append("\r\n");
// parse headers
parseHeaders(in, line, httpHeaders);
haveSeenNonContinueStatus = code != 100; // 100 is "Continue"
}
try {
String transferEncoding = getHeader(Response.TRANSFER_ENCODING);
if (transferEncoding != null
&& "chunked".equalsIgnoreCase(transferEncoding.trim())) {
readChunkedContent(in, line);
} else {
readPlainContent(in);
}
String contentEncoding = getHeader(Response.CONTENT_ENCODING);
if ("gzip".equals(contentEncoding)
|| "x-gzip".equals(contentEncoding)) {
content = http.processGzipEncoded(content, url);
} else if ("deflate".equals(contentEncoding)) {
content = http.processDeflateEncoded(content, url);
} else {
if (Http.LOG.isTraceEnabled()) {
Http.LOG.trace("fetched " + content.length + " bytes from " + url);
}
}
if (httpHeaders != null) {
httpHeaders.append("\r\n");
headers.add(Response.RESPONSE_HEADERS, httpHeaders.toString());
}
} catch (IOException | HttpException e) {
// Headers parsing went fine, but an error occurred while trying to read
// the body of the request (the body may be malformed)
if (code != 200) {
Http.LOG.warn(
"Ignored exception while reading payload of response with status code "
+ code + ":",
e);
content = null;
if (httpHeaders != null) {
httpHeaders.append("\r\n");
headers.add(Response.RESPONSE_HEADERS, httpHeaders.toString());
}
} else {
// If the page is a "200 OK" response, we do not want to go further
// with processing the invalid payload.
throw e;
}
}
} finally {
if (socket != null)
socket.close();
}
}