in connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java [421:810]
/**
 * Builds an HTTP client for this connection and executes a single GET or POST fetch
 * on a separate {@code ExecuteMethodThread}.
 * <p>
 * The outcome is recorded in this object's fields: {@code statusCode} receives the HTTP
 * response code (or one of the {@code FETCH_*} codes on failure), {@code throwable}
 * receives the failure cause, and {@code lastFetchCookies} receives the cookies reported
 * by the fetch thread (or {@code loginCookies} if the fetch did not get that far).
 *
 * @param urlPath the path (and any query string) to fetch; used as the request URI
 * @param userAgent value sent in the {@code User-Agent} header
 * @param from value sent in the {@code From} header
 * @param redirectOK whether redirects are enabled in the request configuration
 * @param host optional virtual host name; when non-null it replaces the server name in
 *        the logged URL and in the target {@code HttpHost} handed to the fetch thread
 * @param formData optional form data; selects GET vs. POST and supplies the parameters.
 *        When null, a plain GET of {@code urlPath} is issued
 * @param loginCookies optional cookies to seed the cookie store with; may be null
 * @throws ManifoldCFException if the fetch is interrupted (error code
 *         {@code ManifoldCFException.INTERRUPTED}) or the form's submit method is unknown
 * @throws ServiceInterruption if the server answers 408, 503 or 504, or if the fetch
 *         fails with a socket timeout, connect timeout, missing response, or connection
 *         failure; the retry windows are encoded in the exception
 */
public void executeFetch(String urlPath, String userAgent, String from,
  boolean redirectOK, String host, FormData formData,
  LoginCookies loginCookies)
  throws ManifoldCFException, ServiceInterruption
{
  // Set up scheme
  SSLConnectionSocketFactory myFactory = new SSLConnectionSocketFactory(new InterruptibleSocketFactory(httpsSocketFactory,connectionTimeoutMilliseconds),
    NoopHostnameVerifier.INSTANCE);

  // The port is only made explicit when it is not the default one for the protocol.
  int hostPort;
  String displayedPort;
  if (port != -1)
  {
    if (!(protocol.equals("http") && port == 80) &&
      !(protocol.equals("https") && port == 443))
    {
      displayedPort = ":"+Integer.toString(port);
      hostPort = port;
    }
    else
    {
      displayedPort = "";
      hostPort = -1;
    }
  }
  else
  {
    displayedPort = "";
    hostPort = -1;
  }

  StringBuilder sb = new StringBuilder(protocol);
  sb.append("://").append(server).append(displayedPort).append(urlPath);
  String fetchUrl = sb.toString();

  HttpHost fetchHost = new HttpHost(server,hostPort,protocol);
  HttpHost hostHost;

  // When a virtual host is given, it replaces the server name both in the
  // URL used for messages and in the target host handed to the fetch thread.
  if (host != null)
  {
    sb.setLength(0);
    sb.append(protocol).append("://").append(host).append(displayedPort).append(urlPath);
    myUrl = sb.toString();
    hostHost = new HttpHost(host,hostPort,protocol);
  }
  else
  {
    myUrl = fetchUrl;
    hostHost = fetchHost;
  }

  // Lazily create the connection manager; it is kept across calls.
  if (connManager == null)
  {
    PoolingHttpClientConnectionManager poolingConnManager = new PoolingHttpClientConnectionManager(RegistryBuilder.<ConnectionSocketFactory>create()
      .register("http", PlainConnectionSocketFactory.getSocketFactory())
      .register("https", myFactory)
      .build());
    poolingConnManager.setDefaultMaxPerRoute(1);
    poolingConnManager.setValidateAfterInactivity(2000);
    poolingConnManager.setDefaultSocketConfig(SocketConfig.custom()
      .setTcpNoDelay(true)
      .setSoTimeout(socketTimeoutMilliseconds)
      .build());
    connManager = poolingConnManager;
  }

  long startTime = 0L;
  if (Logging.connectors.isDebugEnabled())
  {
    startTime = System.currentTimeMillis();
    Logging.connectors.debug("WEB: Waiting for an HttpClient object");
  }

  CredentialsProvider credentialsProvider = new BasicCredentialsProvider();

  // Set up authentication to use
  if (authentication != null)
  {
    if (Logging.connectors.isDebugEnabled())
      Logging.connectors.debug("WEB: For "+myUrl+", discovered matching authentication credentials");
    credentialsProvider.setCredentials(AuthScope.ANY,
      authentication.makeCredentialsObject(host));
  }

  RequestConfig.Builder requestBuilder = RequestConfig.custom()
    .setCircularRedirectsAllowed(true)
    .setSocketTimeout(socketTimeoutMilliseconds)
    .setExpectContinueEnabled(true)
    .setConnectTimeout(connectionTimeoutMilliseconds)
    .setConnectionRequestTimeout(socketTimeoutMilliseconds)
    .setCookieSpec(CookieSpecs.STANDARD)
    .setRedirectsEnabled(redirectOK);

  // If there's a proxy, set that too.
  if (proxyHost != null && proxyHost.length() > 0)
  {
    // Configure proxy authentication
    if (proxyAuthUsername != null && proxyAuthUsername.length() > 0)
    {
      credentialsProvider.setCredentials(
        new AuthScope(proxyHost, proxyPort),
        new NTCredentials(proxyAuthUsername, (proxyAuthPassword==null)?"":proxyAuthPassword, currentHost, (proxyAuthDomain==null)?"":proxyAuthDomain));
    }

    HttpHost proxy = new HttpHost(proxyHost, proxyPort);
    requestBuilder.setProxy(proxy);
  }

  // Automatic retries are disabled here; failures are reported through
  // ServiceInterruption or the recorded status code instead.
  httpClient = HttpClients.custom()
    .setConnectionManager(connManager)
    .disableAutomaticRetries()
    .setDefaultRequestConfig(requestBuilder.build())
    .setDefaultCredentialsProvider(credentialsProvider)
    .setRequestExecutor(new HttpRequestExecutor(socketTimeoutMilliseconds))
    .setRedirectStrategy(new LaxRedirectStrategy())
    .build();

  // Set the parameters we haven't keyed on (so these can change from request to request)
  if (host != null)
  {
    if (Logging.connectors.isDebugEnabled())
      Logging.connectors.debug("WEB: For "+myUrl+", setting virtual host to "+host);
  }

  if (Logging.connectors.isDebugEnabled())
    Logging.connectors.debug("WEB: Got an HttpClient object after "+Long.toString(System.currentTimeMillis()-startTime)+" ms.");

  startFetchTime = System.currentTimeMillis();

  int pageFetchMethod = FormData.SUBMITMETHOD_GET;
  if (formData != null)
    pageFetchMethod = formData.getSubmitMethod();
  switch (pageFetchMethod)
  {
  case FormData.SUBMITMETHOD_GET:
    // MUST be just the path, or apparently we wind up resetting the HostConfiguration
    // Add additional parameters to url path
    String fullUrlPath;
    if (formData != null)
    {
      StringBuilder psb = new StringBuilder(urlPath);
      Iterator iter = formData.getElementIterator();
      // The first parameter starts the query string unless the path already has one.
      char appendChar;
      if (urlPath.indexOf("?") == -1)
        appendChar = '?';
      else
        appendChar = '&';

      while (iter.hasNext())
      {
        FormDataElement el = (FormDataElement)iter.next();
        psb.append(appendChar);
        appendChar = '&';
        String param = el.getElementName();
        String value = el.getElementValue();
        psb.append(URLEncoder.encode(param));
        // A null value produces a bare parameter name with no '='.
        if (value != null)
        {
          psb.append('=').append(URLEncoder.encode(value));
        }
      }
      fullUrlPath = psb.toString();
    }
    else
    {
      fullUrlPath = urlPath;
    }

    // Hack; apparently httpclient treats // as a protocol specifier and so it rips off the first section of the path in that case.
    while (fullUrlPath.startsWith("//"))
      fullUrlPath = fullUrlPath.substring(1);
    if (Logging.connectors.isDebugEnabled())
      Logging.connectors.debug("WEB: Get method for '"+fullUrlPath+"'");
    fetchMethod = new HttpGet(fullUrlPath);
    break;
  case FormData.SUBMITMETHOD_POST:
    if (Logging.connectors.isDebugEnabled())
      Logging.connectors.debug("WEB: Post method for '"+urlPath+"'");
    // MUST be just the path, or apparently we wind up resetting the HostConfiguration
    HttpPost postMethod = new HttpPost(urlPath);
    List<NameValuePair> nvps = new ArrayList<NameValuePair>();

    // Add parameters to post variables
    if (formData != null)
    {
      Iterator iter = formData.getElementIterator();
      while (iter.hasNext())
      {
        FormDataElement e = (FormDataElement)iter.next();
        String param = e.getElementName();
        String value = e.getElementValue();
        if (Logging.connectors.isDebugEnabled())
          Logging.connectors.debug("WEB: Post parameter name '"+param+"' value '"+value+"' for '"+urlPath+"'");
        nvps.add(new BasicNameValuePair(param,value));
      }
    }
    postMethod.setEntity(new UrlEncodedFormEntity(nvps, StandardCharsets.UTF_8));
    fetchMethod = postMethod;
    break;
  default:
    throw new ManifoldCFException("Illegal method type: "+Integer.toString(pageFetchMethod));
  }

  // Set all appropriate headers and parameters
  fetchMethod.setHeader(new BasicHeader("User-Agent",userAgent));
  fetchMethod.setHeader(new BasicHeader("From",from));
  fetchMethod.setHeader(new BasicHeader("Accept","*/*"));
  fetchMethod.setHeader(new BasicHeader("Accept-Encoding","gzip,deflate"));

  // Use a custom cookie store
  CookieStore cookieStore = new OurBasicCookieStore();
  // If we have any cookies to set, set them.
  if (loginCookies != null)
  {
    if (Logging.connectors.isDebugEnabled())
      Logging.connectors.debug("WEB: Adding "+Integer.toString(loginCookies.getCookieCount())+" cookies for '"+urlPath+"'");
    int h = 0;
    while (h < loginCookies.getCookieCount())
    {
      if (Logging.connectors.isDebugEnabled())
        Logging.connectors.debug("WEB: Cookie '"+loginCookies.getCookie(h)+"' added");
      cookieStore.addCookie(loginCookies.getCookie(h++));
    }
  }

  // Copy out the current cookies, in case the fetch fails
  lastFetchCookies = loginCookies;

  // Create the thread
  methodThread = new ExecuteMethodThread(this, fetchThrottler, httpClient, hostHost, fetchMethod, cookieStore);
  try
  {
    methodThread.start();
    threadStarted = true;
    try
    {
      statusCode = methodThread.getResponseCode();
      lastFetchCookies = methodThread.getCookies();
      switch (statusCode)
      {
      case HttpStatus.SC_REQUEST_TIMEOUT:
      case HttpStatus.SC_GATEWAY_TIMEOUT:
      case HttpStatus.SC_SERVICE_UNAVAILABLE:
        // Temporary service interruption
        // May want to make the retry time a parameter someday
        long currentTime = System.currentTimeMillis();
        throw new ServiceInterruption("Http response temporary error on '"+myUrl+"': "+Integer.toString(statusCode),new ManifoldCFException("Service unavailable (code "+Integer.toString(statusCode)+")"),
          currentTime + TIME_2HRS, currentTime + TIME_1DAY, -1, false);
      case HttpStatus.SC_UNAUTHORIZED:
      case HttpStatus.SC_USE_PROXY:
      case HttpStatus.SC_OK:
      case HttpStatus.SC_GONE:
      case HttpStatus.SC_NOT_FOUND:
      case HttpStatus.SC_BAD_GATEWAY:
      case HttpStatus.SC_BAD_REQUEST:
      case HttpStatus.SC_FORBIDDEN:
      case HttpStatus.SC_INTERNAL_SERVER_ERROR:
      default:
        // Every other status is left in statusCode for the caller to interpret.
        return;
      }
    }
    catch (InterruptedException e)
    {
      // Stop the fetch thread and forget it before reporting the interruption.
      methodThread.interrupt();
      methodThread = null;
      threadStarted = false;
      throw e;
    }
  }
  catch (ServiceInterruption e)
  {
    // The temporary-error interruption raised from the status switch above must reach
    // the caller. Without this handler it would be caught by the generic Throwable
    // handler below and silently turned into FETCH_UNKNOWN_ERROR.
    throw e;
  }
  catch (InterruptedException e)
  {
    // Drop the current connection on the floor, so it cannot be reused.
    fetchMethod = null;
    throwable = new ManifoldCFException("Interrupted: "+e.getMessage(),e);
    statusCode = FETCH_INTERRUPTED;
    throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
  }
  catch (java.net.SocketTimeoutException e)
  {
    throwable = e;
    long currentTime = System.currentTimeMillis();
    throw new ServiceInterruption("Timed out waiting for IO for '"+myUrl+"': "+e.getMessage(), e, currentTime + TIME_5MIN,
      currentTime + TIME_2HRS,-1,false);
  }
  catch (ConnectTimeoutException e)
  {
    throwable = e;
    long currentTime = System.currentTimeMillis();
    throw new ServiceInterruption("Timed out waiting for connection for '"+myUrl+"': "+e.getMessage(), e, currentTime + TIME_5MIN,
      currentTime + TIME_2HRS,-1,false);
  }
  catch (InterruptedIOException e)
  {
    //Logging.connectors.warn("IO interruption seen",e);
    throwable = new ManifoldCFException("Interrupted: "+e.getMessage(),e);
    statusCode = FETCH_INTERRUPTED;
    // Keep the original exception as the cause so the interruption can be traced.
    throw new ManifoldCFException("Interrupted",e,ManifoldCFException.INTERRUPTED);
  }
  catch (RedirectException e)
  {
    throwable = e;
    statusCode = FETCH_CIRCULAR_REDIRECT;
    return;
  }
  catch (NoHttpResponseException e)
  {
    throwable = e;
    long currentTime = System.currentTimeMillis();
    throw new ServiceInterruption("Timed out waiting for response for '"+myUrl+"': "+e.getMessage(), e, currentTime + TIME_15MIN,
      currentTime + TIME_2HRS,-1,false);
  }
  catch (java.net.ConnectException e)
  {
    throwable = e;
    long currentTime = System.currentTimeMillis();
    throw new ServiceInterruption("Timed out waiting for a connection for '"+myUrl+"': "+e.getMessage(), e, currentTime + TIME_2HRS,
      currentTime + TIME_6HRS,-1,false);
  }
  catch (javax.net.ssl.SSLException e)
  {
    // Probably this is an incorrectly configured trust store
    throwable = new ManifoldCFException("SSL handshake error: "+e.getMessage()+"; check your connection's Certificate configuration",e);
    statusCode = FETCH_IO_ERROR;
    return;
  }
  catch (IOException e)
  {
    // Treat this as a bad url.  We don't know what happened, but it isn't something we are going to naively
    // retry on.
    throwable = e;
    statusCode = FETCH_IO_ERROR;
    return;
  }
  catch (Throwable e)
  {
    Logging.connectors.debug("WEB: Caught an unexpected exception: "+e.getMessage(),e);
    throwable = e;
    statusCode = FETCH_UNKNOWN_ERROR;
    return;
  }
}