in core/src/main/java/org/apache/stormcrawler/protocol/httpclient/HttpProtocol.java [157:256]
/**
 * Fetches {@code url} with an HTTP GET (or HEAD when the metadata key {@code http.method.head}
 * parses as {@code true}) and returns whatever the response handler produces.
 *
 * <p>When a proxy manager is configured, a proxy is obtained from it for every call and the
 * route planner (plus basic proxy credentials, when the proxy has a username) is installed on
 * the client builder before the client is built.
 *
 * <p>Metadata keys read here: {@code http.method.head}, the last-modified header name, {@code
 * etag} (under the protocol metadata prefix), {@code http.accept}, {@code
 * http.accept.language} and {@code http.content.limit}.
 *
 * @param url the URL to fetch
 * @param md metadata driving per-request options; when {@code null} no per-request headers,
 *     content limit or cookies are applied
 * @return the value returned by the response handler for this request
 * @throws Exception if the proxy port is not a valid integer or if executing the request fails
 */
public ProtocolResponse getProtocolOutput(String url, Metadata md) throws Exception {
    LOG.debug("HTTP connection manager stats {}", CONNECTION_MANAGER.getTotalStats());

    // start from the global request config; replaced below when a dynamic proxy is used
    RequestConfig reqConfig = requestConfig;

    // conditionally add a dynamic proxy
    // NOTE(review): builder and requestConfigBuilder are declared outside this method and are
    // mutated on every call - confirm this is safe if the method can run concurrently.
    if (proxyManager != null) {
        // retrieve proxy from proxy manager
        SCProxy prox = proxyManager.getProxy(md);
        String proxyAddress = prox.getAddress();
        // parse the port once, before touching the builders, so a malformed port does not
        // leave them half configured
        int proxyPort = Integer.parseInt(prox.getPort());

        // conditionally configure proxy authentication
        if (StringUtils.isNotBlank(prox.getUsername())) {
            List<String> authSchemes = new ArrayList<>();
            // Can make configurable and add more in future
            authSchemes.add(AuthSchemes.BASIC);
            requestConfigBuilder.setProxyPreferredAuthSchemes(authSchemes);

            BasicCredentialsProvider basicAuthCreds = new BasicCredentialsProvider();
            basicAuthCreds.setCredentials(
                    new AuthScope(proxyAddress, proxyPort),
                    new UsernamePasswordCredentials(prox.getUsername(), prox.getPassword()));
            builder.setDefaultCredentialsProvider(basicAuthCreds);
        }

        HttpHost proxy = new HttpHost(proxyAddress, proxyPort);
        builder.setRoutePlanner(new DefaultProxyRoutePlanner(proxy));

        // save start time for debugging speed impact of request config build
        long buildStart = System.currentTimeMillis();

        // set request config to new configuration with dynamic proxy
        reqConfig = requestConfigBuilder.build();

        LOG.debug(
                "time to build http request config with proxy: {}ms",
                System.currentTimeMillis() - buildStart);
        LOG.debug("fetching with {}", prox);
    }

    // Choose the HTTP method BEFORE adding any header: the request object used to be swapped
    // from GET to HEAD after addHeadersToRequest() had run, which discarded those headers.
    boolean useHead = md != null && Boolean.parseBoolean(md.getFirstValue("http.method.head"));
    HttpRequestBase request = useHead ? new HttpHead(url) : new HttpGet(url);

    ResponseHandler<ProtocolResponse> responseHandler = this;

    if (md != null) {
        addHeadersToRequest(request, md);

        // conditional request headers built from previously stored metadata
        String lastModified = md.getFirstValue(HttpHeaders.LAST_MODIFIED);
        if (StringUtils.isNotBlank(lastModified)) {
            request.addHeader("If-Modified-Since", HttpHeaders.formatHttpDate(lastModified));
        }

        String ifNoneMatch = md.getFirstValue("etag", protocolMDprefix);
        if (StringUtils.isNotBlank(ifNoneMatch)) {
            request.addHeader("If-None-Match", ifNoneMatch);
        }

        // setHeader (not addHeader): overrides any Accept / Accept-Language already present
        String accept = md.getFirstValue("http.accept");
        if (StringUtils.isNotBlank(accept)) {
            request.setHeader(new BasicHeader("Accept", accept));
        }

        String acceptLanguage = md.getFirstValue("http.accept.language");
        if (StringUtils.isNotBlank(acceptLanguage)) {
            request.setHeader(new BasicHeader("Accept-Language", acceptLanguage));
        }

        // per-page content limit; an unparsable value is logged and the default handler kept
        String pageMaxContentStr = md.getFirstValue("http.content.limit");
        if (StringUtils.isNotBlank(pageMaxContentStr)) {
            try {
                int pageMaxContent = Integer.parseInt(pageMaxContentStr);
                responseHandler = getResponseHandlerWithContentLimit(pageMaxContent);
            } catch (NumberFormatException e) {
                LOG.warn("Invalid http.content.limit in metadata: {}", pageMaxContentStr);
            }
        }

        if (useCookies) {
            addCookiesToRequest(request, md);
        }
    }

    request.setConfig(reqConfig);

    // no need to release the connection explicitly as this is handled
    // automatically. The client itself must be closed though.
    try (CloseableHttpClient httpclient = builder.build()) {
        return httpclient.execute(request, responseHandler);
    }
}