in src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java [83:301]
/**
 * Fetches {@code url} over a raw socket and fills in this response's status
 * code, headers and content.
 *
 * <p>
 * The constructor connects to the target host (or to the configured proxy
 * when {@code http.useProxy(url)} is true), sends an HTTP/1.0 {@code GET}
 * request, skips any {@code 100 Continue} responses and parses the final
 * status line and headers. A body whose {@code Content-Type} contains
 * {@code text/html} or {@code application/xhtml} is handed to
 * {@code readContentFromHtmlUnit(url)}; every other body, including one
 * without a {@code Content-Type} header, is read from the socket (chunked or
 * plain) and passed through gzip/deflate decoding when the
 * {@code Content-Encoding} header asks for it. The socket is always closed
 * before the constructor returns.
 *
 * @param http
 *          protocol plugin supplying timeouts, proxy, TLS and request-header
 *          settings as well as the configuration
 * @param url
 *          URL to fetch; its protocol must be {@code http} or {@code https}
 * @param datum
 *          crawl datum whose modified time feeds the
 *          {@code If-Modified-Since} header when that feature is enabled
 * @throws ProtocolException
 *           if setting up the TLS context fails; an {@code HttpException}
 *           is thrown when the URL scheme is neither http nor https
 * @throws IOException
 *           if connecting, writing the request or reading the response fails
 */
public HttpResponse(HttpBase http, URL url, CrawlDatum datum)
    throws ProtocolException, IOException {

  this.http = http;
  this.url = url;
  this.orig = url.toString();
  this.base = url.toString();

  final Scheme scheme;
  if ("http".equals(url.getProtocol())) {
    scheme = Scheme.HTTP;
  } else if ("https".equals(url.getProtocol())) {
    scheme = Scheme.HTTPS;
  } else {
    throw new HttpException("Unknown scheme (not http/https) for url:" + url);
  }

  if (Http.LOG.isTraceEnabled()) {
    Http.LOG.trace("fetching " + url);
  }

  String path = "".equals(url.getFile()) ? "/" : url.getFile();

  // Some servers will redirect a request with a host line like
  // "Host: <hostname>:80" to "http://<hostname>/<orig_path>" - they
  // don't want the :80. The port is therefore only sent when the URL
  // names one explicitly.
  String host = url.getHost();
  int port;
  String portString;
  if (url.getPort() == -1) {
    port = (scheme == Scheme.HTTP) ? 80 : 443;
    portString = "";
  } else {
    port = url.getPort();
    portString = ":" + port;
  }

  Socket socket = null;
  try {
    socket = new Socket(); // create the socket
    socket.setSoTimeout(http.getTimeout());

    // Connect either directly to the origin server or to the proxy. The
    // proxy decision is evaluated once and reused for the request line.
    final boolean viaProxy = http.useProxy(url);
    String sockHost = viaProxy ? http.getProxyHost() : host;
    int sockPort = viaProxy ? http.getProxyPort() : port;
    InetSocketAddress sockAddr = new InetSocketAddress(sockHost, sockPort);
    socket.connect(sockAddr, http.getTimeout());

    if (scheme == Scheme.HTTPS) {
      // Optionally skip TLS/SSL certificate validation
      SSLSocketFactory factory;
      if (http.isTlsCheckCertificates()) {
        factory = (SSLSocketFactory) SSLSocketFactory.getDefault();
      } else {
        SSLContext sslContext = SSLContext.getInstance("TLS");
        sslContext.init(null,
            new TrustManager[] { new DummyX509TrustManager(null) }, null);
        factory = sslContext.getSocketFactory();
      }

      // NOTE(review): when a proxy is used, TLS is layered on the proxy
      // connection with the proxy's host/port and no CONNECT tunnel is
      // requested - confirm HTTPS via proxy is meant to work this way.
      SSLSocket sslsocket = (SSLSocket) factory.createSocket(socket, sockHost,
          sockPort, true);
      sslsocket.setUseClientMode(true);

      // Get the protocols and ciphers supported by this JVM
      Set<String> protocols = new HashSet<String>(
          Arrays.asList(sslsocket.getSupportedProtocols()));
      Set<String> ciphers = new HashSet<String>(
          Arrays.asList(sslsocket.getSupportedCipherSuites()));

      // Intersect with preferred protocols and ciphers
      protocols.retainAll(http.getTlsPreferredProtocols());
      ciphers.retainAll(http.getTlsPreferredCipherSuites());

      sslsocket.setEnabledProtocols(
          protocols.toArray(new String[protocols.size()]));
      sslsocket.setEnabledCipherSuites(
          ciphers.toArray(new String[ciphers.size()]));

      sslsocket.startHandshake();
      // From here on the finally block closes the TLS socket, which was
      // created with autoClose=true on top of the plain socket.
      socket = sslsocket;
    }

    this.conf = http.getConf();
    // The socket is connected at this point, so the address is resolved.
    if (conf.getBoolean("store.ip.address", false)) {
      headers.add("_ip_", sockAddr.getAddress().getHostAddress());
    }

    // make request
    OutputStream req = socket.getOutputStream();

    StringBuilder reqStr = new StringBuilder("GET ");
    if (viaProxy) {
      // a proxy needs the absolute URL in the request line
      reqStr.append(url.getProtocol() + "://" + host + portString + path);
    } else {
      reqStr.append(path);
    }
    reqStr.append(" HTTP/1.0\r\n");

    reqStr.append("Host: ");
    reqStr.append(host);
    reqStr.append(portString);
    reqStr.append("\r\n");

    reqStr.append("Accept-Encoding: x-gzip, gzip, deflate\r\n");

    String userAgent = http.getUserAgent();
    if ((userAgent == null) || (userAgent.length() == 0)) {
      if (Http.LOG.isErrorEnabled()) {
        Http.LOG.error("User-agent is not set!");
      }
    } else {
      reqStr.append("User-Agent: ");
      reqStr.append(userAgent);
      reqStr.append("\r\n");
    }

    reqStr.append("Accept-Language: ");
    reqStr.append(this.http.getAcceptLanguage());
    reqStr.append("\r\n");

    reqStr.append("Accept: ");
    reqStr.append(this.http.getAccept());
    reqStr.append("\r\n");

    if (http.isIfModifiedSinceEnabled() && datum.getModifiedTime() > 0) {
      reqStr.append("If-Modified-Since: "
          + HttpDateFormat.toString(datum.getModifiedTime()));
      reqStr.append("\r\n");
    }
    reqStr.append("\r\n"); // blank line terminates the header section

    // store the request in the metadata?
    if (conf.getBoolean("store.http.request", false)) {
      headers.add(Response.REQUEST, reqStr.toString());
    }

    // Encode with an explicit charset so the bytes on the wire do not
    // depend on the platform default encoding.
    byte[] reqBytes = reqStr.toString()
        .getBytes(java.nio.charset.StandardCharsets.UTF_8);

    req.write(reqBytes);
    req.flush();

    PushbackInputStream in = // process response
        new PushbackInputStream(new BufferedInputStream(
            socket.getInputStream(), Http.BUFFER_SIZE), Http.BUFFER_SIZE);

    StringBuffer line = new StringBuffer();

    // store the http headers verbatim
    if (conf.getBoolean("store.http.headers", false)) {
      httpHeaders = new StringBuffer();
    }

    headers.add("nutch.fetch.time",
        Long.toString(System.currentTimeMillis()));

    boolean haveSeenNonContinueStatus = false;
    while (!haveSeenNonContinueStatus) {
      // parse status code line
      this.code = parseStatusLine(in, line);
      if (httpHeaders != null) {
        httpHeaders.append(line).append("\n");
      }
      // parse headers
      parseHeaders(in, line, httpHeaders);
      haveSeenNonContinueStatus = code != 100; // 100 is "Continue"
    }

    // Get Content type header
    String contentType = getHeader(Response.CONTENT_TYPE);

    // Handle with HtmlUnit only if the content type is HTML or XHTML. A
    // response without a Content-Type header is treated as non-HTML so that
    // its body is still read instead of being silently dropped.
    boolean isHtml = contentType != null
        && (contentType.contains("text/html")
            || contentType.contains("application/xhtml"));

    if (isHtml) {
      readContentFromHtmlUnit(url);
    } else {
      String transferEncoding = getHeader(Response.TRANSFER_ENCODING);
      if (transferEncoding != null
          && "chunked".equalsIgnoreCase(transferEncoding.trim())) {
        readChunkedContent(in, line);
      } else {
        readPlainContent(in);
      }

      String contentEncoding = getHeader(Response.CONTENT_ENCODING);
      if ("gzip".equals(contentEncoding)
          || "x-gzip".equals(contentEncoding)) {
        content = http.processGzipEncoded(content, url);
      } else if ("deflate".equals(contentEncoding)) {
        content = http.processDeflateEncoded(content, url);
      } else {
        if (Http.LOG.isTraceEnabled()) {
          Http.LOG
              .trace("fetched " + content.length + " bytes from " + url);
        }
      }
    }

    // Record the verbatim response headers whenever they were collected,
    // independent of which content type the server announced.
    if (httpHeaders != null) {
      headers.add(Response.RESPONSE_HEADERS, httpHeaders.toString());
    }

  } catch (KeyManagementException | NoSuchAlgorithmException
      | KeyStoreException e) {
    // failures while building the certificate-ignoring TLS context
    throw new ProtocolException(e);
  } finally {
    if (socket != null) {
      socket.close();
    }
  }
}