in connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java [798:1282]
/**
 * Fetch the primary document, walking through a session login sequence when one applies.
 * <p>
 * The method loops over a "current URI" that starts out as {@code documentIdentifier}.  On each
 * pass it parses the URI, looks up the host's IP address, checks robots (primary document only,
 * and only when robots usage is enabled), performs the fetch, and then evaluates the login rules
 * of {@code sessionCredential} against the fetched page to decide whether to enter, continue, or
 * leave the login sequence.  While in the login sequence the current URI is replaced by the form
 * action, link, redirection, or content-override target chosen by those rules.
 * <p>
 * Results are reported through {@code fetchStatus}: this method writes its {@code resultSignal},
 * {@code contextMessage}, {@code contextException}, {@code checkSum}, {@code headerData} and
 * {@code sessionState} fields, and reads {@code sessionState}.
 *
 * @param fetchStatus in/out status holder describing the outcome of the fetch.
 * @param activities activity interface used for event sequences, history records and throttling.
 * @param documentIdentifier the URL of the primary document being fetched.
 * @param sessionCredential the login-sequence description for this document, or null if none applies.
 * @param globalSequenceEvent the event name handed to beginEventSequence/completeEventSequence
 *        to serialize the login sequence.
 * @throws ManifoldCFException on interruption or unrecoverable errors raised by the fetch machinery.
 * @throws ServiceInterruption when the fetch machinery signals a retryable service problem.
 */
protected void loginAndFetch(FetchStatus fetchStatus, IProcessActivity activities, String documentIdentifier, SequenceCredentials sessionCredential, String globalSequenceEvent)
  throws ManifoldCFException, ServiceInterruption
{
  long currentTime = System.currentTimeMillis();
  // Here's the maximum number of connections we are going to allow.
  int connectionLimit = 200;
  String currentURI = documentIdentifier;
  // Login pages are special in that I *don't* require them to do a robots check.  The reason is that it is conceivable that a
  // site may inadvertently exclude them via robots, and yet allow content pages to be scanned.  This would effectively exclude
  // session login for that site if we adhered to the strict policy.  Since login pages have to be explicitly identified as being
  // special, explicit permission is effectively granted by the user in any case.

  // The result code to use for activity logging, or null if no activity logging is desired.
  String activityResultCode = null;
  // Form data to submit with the next fetch (non-null only when the previous page matched a login form rule).
  FormData formData = null;

  while (true)
  {
    URL url;
    try
    {
      // Parse the current URI; host, protocol, port and file are all derived from this.
      url = new URL(currentURI);
    }
    catch (MalformedURLException e)
    {
      // currentURI is malformed.
      // If the document was the primary, we should remove it from the queue.  But if it's part of a login sequence, we'd better just retry later.
      // NOTE(review): this path breaks out directly, so the "retry the primary" adjustment at the bottom of the loop is not applied here.
      fetchStatus.contextMessage = "was not a valid URL: "+e.getMessage();
      fetchStatus.contextException = e;
      activityResultCode = "-12";
      fetchStatus.resultSignal = RESULT_NO_DOCUMENT;
      break;
    }

    // Do the mapping from the current host name to the IP address
    String hostName = url.getHost();
    StringBuilder ipAddressBuffer = new StringBuilder();
    int ipAddressStatus = lookupIPAddress(currentURI,activities,hostName,currentTime,ipAddressBuffer);
    if (ipAddressStatus == RESULTSTATUS_TRUE)
    {
      String ipAddress = ipAddressBuffer.toString();
      String protocol = url.getProtocol();
      int port = url.getPort();
      if (port == -1)
      {
        port = url.getDefaultPort();
      }

      // Try to fetch the document.  We'll need its bin names first.
      String[] binNames = getBinNames(currentURI);

      // Get the credentials for this document (if any)
      PageCredentials credential = getPageCredential(currentURI);
      IKeystoreManager trustStore;
      // Save effort - only bother to get a trust store if this is https
      if (protocol.equalsIgnoreCase("https"))
      {
        // null return is possible here; indicates "trust everything"
        trustStore = getTrustStore(currentURI);
      }
      else
      {
        trustStore = KeystoreManagerFactory.make("");
      }

      // Check robots, if enabled, and if we're fetching the primary document identifier.  See comment above.
      // The short-circuit order matters: checkFetchAllowed is only invoked for the primary document with robots enabled.
      int robotsStatus = RESULTSTATUS_TRUE;
      if (!documentIdentifier.equals(currentURI) || robotsUsage < ROBOTS_DATA || (robotsStatus = checkFetchAllowed(documentIdentifier,protocol,ipAddress,port,credential,trustStore,hostName,binNames,currentTime,
        url.getFile(),activities,connectionLimit,proxyHost,proxyPort,proxyAuthDomain,proxyAuthUsername,proxyAuthPassword)) == RESULTSTATUS_TRUE)
      {
        // Passed the robots check!

        // Read the cookies saved for this login sequence, if a sequence applies to the document.
        LoginCookies lc = null;
        if (sessionCredential != null)
        {
          lc = cookieManager.readCookies(sessionCredential.getSequenceKey());
        }

        // Prepare to perform the fetch, and decide what to do with the document.
        //
        IThrottledConnection connection = ThrottledFetcher.getConnection(currentContext,
          throttleGroupName,
          protocol,ipAddress,port,
          credential,trustStore,throttleDescription,binNames,connectionLimit,
          proxyHost,proxyPort,proxyAuthDomain,proxyAuthUsername,proxyAuthPassword,
          socketTimeoutMilliseconds,connectionTimeoutMilliseconds,
          activities);
        try
        {
          connection.beginFetch((fetchStatus.sessionState == SESSIONSTATE_LOGIN)?FETCH_LOGIN:FETCH_STANDARD);
          try
          {
            // Execute the fetch!
            connection.executeFetch(url.getFile(),userAgent,from,
              false,hostName,formData,lc);
            int response = connection.getResponseCode();

            if (response == 200 || response == 302 || response == 301)
            {
              // If this was part of the login sequence, update the cookies regardless of what else happens
              if (fetchStatus.sessionState == SESSIONSTATE_LOGIN)
              {
                // Update the cookies
                LoginCookies lastFetchCookies = connection.getLastFetchCookies();
                cookieManager.updateCookies(sessionCredential.getSequenceKey(),lastFetchCookies);
              }

              // Decide whether to exclude this document based on what we see here.
              // Basically, we want to get rid of everything that we (a) don't know what
              // to do with in the ingestion system, and (b) we can't get useful links from.
              String contentType = extractContentType(connection.getResponseHeader("Content-Type"));

              if (isContentInteresting(activities,currentURI,response,contentType))
              {
                // Treat it as real, and cache it.
                fetchStatus.checkSum = cache.addData(activities,currentURI,connection);
                fetchStatus.headerData = connection.getResponseHeaders();
                fetchStatus.resultSignal = RESULT_VERSION_NEEDED;
                activityResultCode = null;
              }
              else
              {
                fetchStatus.contextMessage = "it had the wrong content type ('"+contentType+"')";
                fetchStatus.resultSignal = RESULT_NO_DOCUMENT;
                activityResultCode = activities.EXCLUDED_MIMETYPE;
              }
            }
            else
            {
              // We got some kind of http error code.
              // We don't want to remove it from the queue entirely, because that would cause us to lose track of the item, and therefore lose
              // control of all scheduling around it.  Instead, we leave it on the queue and give it an empty version string; that will lead it to be
              // reprocessed without fail on the next scheduled check.

              // Decode response body to the extent we can
              String contentType = extractContentType(connection.getResponseHeader("Content-Type"));
              String encoding = extractEncoding(contentType);
              if (encoding == null)
              {
                encoding = StandardCharsets.UTF_8.name();
              }
              String decodedResponse = "undecodable";
              try
              {
                decodedResponse = "'"+connection.getLimitedResponseBody(1024,encoding)+"'";
              }
              catch (ManifoldCFException e)
              {
                // Eat this exception unless it is an interrupt
                if (e.getErrorCode() == ManifoldCFException.INTERRUPTED)
                {
                  throw e;
                }
                connection.noteInterrupted(e);
              }
              catch (ServiceInterruption e)
              {
                // Eat this exception too
                connection.noteInterrupted(e);
              }
              fetchStatus.contextMessage = "it failed to fetch (status="+Integer.toString(response)+", message="+decodedResponse+")";
              fetchStatus.resultSignal = RESULT_NO_VERSION;
              activityResultCode = null;
            }
          }
          catch (ManifoldCFException e)
          {
            connection.noteInterrupted(e);
            throw e;
          }
          catch (ServiceInterruption e)
          {
            connection.noteInterrupted(e);
            throw e;
          }
          finally
          {
            connection.doneFetch(activities);
          }
        }
        finally
        {
          connection.close();
        }

        // State transition logic.  If the result indicates a successful fetch so far, we need to decide where to go next.
        // This happens AFTER we've released all the connections, because it's conceivable that processing here might be
        // significant, and we don't want to tie things up unnecessarily.
        String preferredLink = null;
        String preferredRedirection = null;
        formData = null;
        String contentLink = null;

        if (fetchStatus.resultSignal == RESULT_VERSION_NEEDED)
        {
          // If we get here, we know:
          // (a) There's a cached version of the page on disk we can read as many times as necessary;
          // (b) The saved cookies have not been updated yet, so we'll need to do that where appropriate.

          // The way we determine if we're in the login sequence for a site is by TWO criteria:
          // (1) The URI must match the specified regular expression, and
          // (2) The data from that URI must contain the specified form or link information.
          // We use the same criteria to look for the exit from a sequence.  So, in essence, we're *always* going to need to know whether we're
          // officially in the sequence, or not, so we evaluate it always.
          boolean isLoginPage = false;
          if (sessionCredential != null)
          {
            Iterator<?> iterMatches = sessionCredential.findLoginParameters(currentURI);
            boolean seenAnything = false;
            boolean seenFormError = false;
            boolean seenLinkError = false;
            boolean seenRedirectionError = false;
            boolean seenContentError = false;
            while (iterMatches.hasNext())
            {
              seenAnything = true;
              LoginParameters lp = (LoginParameters)iterMatches.next();
              // Note that more than one of the rules may match.
              // In that case, a clear order of precedence applies: form, then link, then redirection, then content.
              // If more than one rule of the same kind matches, that kind is treated as an error and nothing of
              // that kind is matched.

              // Parse the page; it had better match up!  Otherwise we get null back.
              FormData newFormData = findHTMLForm(currentURI,lp);
              if (newFormData != null)
              {
                if (formData != null)
                {
                  // Oops, more than one matching form rule.  Discard the form match.
                  seenFormError = true;
                  formData = null;
                }
                else if (!seenFormError)
                {
                  // A form overrides links, redirection, or content
                  formData = newFormData;
                  preferredLink = null;
                  preferredRedirection = null;
                }
              }
              else
              {
                // Look for the preferred link instead.
                String newPreferredLink = findHTMLLinkURI(currentURI,lp);
                if (newPreferredLink != null)
                {
                  if (preferredLink != null)
                  {
                    // Oops, more than one matching link rule.
                    seenLinkError = true;
                    preferredLink = null;
                  }
                  else if (!seenLinkError && !seenFormError && formData == null)
                  {
                    // Link overrides redirection and content
                    preferredLink = newPreferredLink;
                    preferredRedirection = null;
                  }
                }
                else
                {
                  // Look for the preferred redirection.
                  String newPreferredRedirection = findPreferredRedirectionURI(currentURI,lp);
                  if (newPreferredRedirection != null)
                  {
                    if (preferredRedirection != null)
                    {
                      // More than one matching redirection rule.
                      seenRedirectionError = true;
                      preferredRedirection = null;
                    }
                    else if (!seenRedirectionError && !seenLinkError && !seenFormError && formData == null && preferredLink == null)
                    {
                      preferredRedirection = newPreferredRedirection;
                    }
                  }
                  else
                  {
                    // Look for the content in the page.  The link returned may be an empty string, if matching content
                    // is discovered but there is no override.  It will be null if the content is not found.
                    String newContentLink = findSpecifiedContent(currentURI,lp);
                    if (newContentLink != null)
                    {
                      if (contentLink != null)
                      {
                        // More than one matching content rule.
                        seenContentError = true;
                        contentLink = null;
                      }
                      else if (!seenContentError && !seenRedirectionError && !seenLinkError && !seenFormError && formData == null && preferredLink == null && preferredRedirection == null)
                      {
                        contentLink = newContentLink;
                      }
                    }
                  }
                }
              }
            }

            // Now, evaluate all the data and pick the right rule
            if (formData != null)
            {
              // We found the right form!  And, we filled it in.  So now we enter the "login sequence".
              if (Logging.connectors.isDebugEnabled())
                Logging.connectors.debug("WEB: Document '"+currentURI+"' matches form, so determined to be login page for sequence '"+sessionCredential.getSequenceKey()+"'");
              isLoginPage = true;
            }
            else if (preferredLink != null)
            {
              if (Logging.connectors.isDebugEnabled())
                Logging.connectors.debug("WEB: Document '"+currentURI+"' matches preferred link, so determined to be login page for sequence '"+sessionCredential.getSequenceKey()+"'");
              isLoginPage = true;
            }
            else if (preferredRedirection != null)
            {
              if (Logging.connectors.isDebugEnabled())
                Logging.connectors.debug("WEB: Document '"+currentURI+"' matches preferred redirection, so determined to be login page for sequence '"+sessionCredential.getSequenceKey()+"'");
              isLoginPage = true;
            }
            else if (contentLink != null)
            {
              if (Logging.connectors.isDebugEnabled())
                Logging.connectors.debug("WEB: Document '"+currentURI+"' matches content, so determined to be login page for sequence '"+sessionCredential.getSequenceKey()+"'");
              isLoginPage = true;
            }
            else
            {
              if (seenAnything && Logging.connectors.isDebugEnabled())
                Logging.connectors.debug("WEB: Document '"+currentURI+"' did not match expected form, link, redirection, or content for sequence '"+sessionCredential.getSequenceKey()+"'");
            }
          }

          // Should we do a state transition into the "logging in" state?
          if (fetchStatus.sessionState == SESSIONSTATE_NORMAL && isLoginPage)
          {
            // Entering the login sequence.  Make sure we actually can do this...
            if (activities.beginEventSequence(globalSequenceEvent))
            {
              if (Logging.connectors.isDebugEnabled())
                Logging.connectors.debug("WEB: For document '"+documentIdentifier+"', beginning login sequence '"+sessionCredential.getSequenceKey()+"'");

              activities.recordActivity(null,WebcrawlerConnector.ACTIVITY_LOGON_START,
                null,sessionCredential.getSequenceKey(),"OK",null,null);

              // Transition to the right state, etc.
              fetchStatus.sessionState = SESSIONSTATE_LOGIN;
            }
            else
            {
              if (Logging.connectors.isDebugEnabled())
                Logging.connectors.debug("WEB: For document '"+documentIdentifier+"', login sequence '"+sessionCredential.getSequenceKey()+"' was already in progress.");

              // Didn't make it in.  Retry the main URI when the proper conditions are met.
              // We don't want the cached data anymore.
              cache.deleteData(currentURI);
              fetchStatus.contextMessage = "login sequence already in progress";
              fetchStatus.resultSignal = RESULT_RETRY_DOCUMENT;
              activityResultCode = null;
            }
          }
          else if (fetchStatus.sessionState == SESSIONSTATE_LOGIN && !isLoginPage)
          {
            //== Exit login mode ==
            activities.completeEventSequence(globalSequenceEvent);
            activities.recordActivity(null,WebcrawlerConnector.ACTIVITY_LOGON_END,
              null,sessionCredential.getSequenceKey(),"OK",null,null);
            fetchStatus.sessionState = SESSIONSTATE_NORMAL;
            // Make sure we go back and try the original document again, if we happened to have been directed somewhere else
            if (!currentURI.equals(documentIdentifier))
            {
              cache.deleteData(currentURI);
              currentURI = documentIdentifier;
              continue;
            }
            // Otherwise, the last fetch stands on its own.  Fall through, and allow processing and link extraction
          }
        }

        // Now, based on the session state and the document contents, decide how to proceed.
        // This dispatch deliberately sits OUTSIDE the "version needed" block above: its second branch handles
        // a login-sequence fetch that did NOT yield usable content, which can only be observed out here.
        if (fetchStatus.resultSignal == RESULT_VERSION_NEEDED && fetchStatus.sessionState == SESSIONSTATE_LOGIN)
        {
          // We are dealing with a login page!

          // We need to (a) figure out what the next URI should be, and (b) record form information that it might need.
          // This is a bit dicey because there's really
          // no good way to *guarantee* that we pick the right one, if there's more than one available.
          // What we do is apply the rule matches found above, in this order of preference:
          //
          // (a) A matching form; if we found one, we submit it.
          // (b) A matching preferred link.
          // (c) A matching preferred redirection.
          // (d) A matching content rule's override link.
          //
          // Note well that it's probably going to be pretty easy to get this code stuck in an infinite login sequence.
          // While that won't be a problem performance-wise (because everything is appropriately throttled), it
          // is obviously not ideal, and furthermore, it will not be possible to crawl a site for which this occurs.
          //
          // Longer term (and with higher complexity) we can solve this problem by allowing the user to *specify*
          // which link they want us to pick for a page.  Hopefully this would not be necessary.

          // Locate the next target URI.
          String targetURI;
          if (formData != null)
            targetURI = formData.getActionURI();
          else if (preferredLink != null)
            targetURI = preferredLink;
          else if (preferredRedirection != null)
            targetURI = preferredRedirection;
          else /* if (contentLink != null) */
            targetURI = contentLink;

          // Definitely we don't want the cached data anymore
          cache.deleteData(currentURI);

          // If the target URI is null, it means we could not find a suitable link.  If target URI is "",
          // it means that we found a designated logon page but the description did not include a link we
          // could chase.  Either way, treat this exactly the same
          // way as if the link found exited login mode.
          if (targetURI == null || targetURI.length() == 0)
          {
            //== Exiting login mode ==
            activities.completeEventSequence(globalSequenceEvent);
            activities.recordActivity(null,WebcrawlerConnector.ACTIVITY_LOGON_END,
              null,sessionCredential.getSequenceKey(),"NEXTLINKNOTFOUND","Could not find a usable link to the next page: "+fetchStatus.contextMessage,null);
            fetchStatus.sessionState = SESSIONSTATE_NORMAL;
            // Make sure we go back and try the original document again, no matter where we got directed to
            currentURI = documentIdentifier;
          }
          else
          {
            currentURI = targetURI;
          }
          continue;
        }
        else if (fetchStatus.resultSignal != RESULT_VERSION_NEEDED && fetchStatus.sessionState == SESSIONSTATE_LOGIN)
        {
          // The next URL we fetched in the logon sequence turned out to be unsuitable.
          // That means that the logon sequence is fundamentally wrong.  The session thus ends,
          // and of course it will retry, but that's neither here nor there.

          //== Exiting login mode ==
          activities.completeEventSequence(globalSequenceEvent);
          activities.recordActivity(null,WebcrawlerConnector.ACTIVITY_LOGON_END,
            null,sessionCredential.getSequenceKey(),"LINKTARGETUNSUITABLE","Page was unsuitable for a login sequence because: "+fetchStatus.contextMessage,null);
          fetchStatus.sessionState = SESSIONSTATE_NORMAL;
          // Fall through, leaving everything else alone.
        }
      }
      else if (robotsStatus == RESULTSTATUS_FALSE)
      {
        activityResultCode = "-11";
        fetchStatus.contextMessage = "robots.txt says so";
        fetchStatus.resultSignal = RESULT_NO_DOCUMENT;
      }
      else
      {
        // Robots prerequisite in progress
        activityResultCode = null;
        fetchStatus.resultSignal = RESULT_RETRY_DOCUMENT;
        fetchStatus.contextMessage = "robots prerequisite already in progress";
      }
    }
    else if (ipAddressStatus == RESULTSTATUS_FALSE)
    {
      // NOTE(review): if this happens for a login-sequence URL, the session state is left as-is here;
      // confirm that the caller completes the event sequence in that case.
      activityResultCode = "-10";
      fetchStatus.contextMessage = "ip address not found";
      fetchStatus.resultSignal = RESULT_NO_DOCUMENT;
    }
    else
    {
      // DNS prerequisite in progress
      activityResultCode = null;
      fetchStatus.contextMessage = "dns prerequisite already in progress";
      fetchStatus.resultSignal = RESULT_RETRY_DOCUMENT;
    }

    // If we fail on a document that's not the primary, the result should be to retry the primary later.
    if (!currentURI.equals(documentIdentifier))
    {
      activityResultCode = null;
      if (fetchStatus.contextMessage != null)
      {
        fetchStatus.contextMessage = "for login sequence url '"+currentURI+"': "+fetchStatus.contextMessage;
      }
      if (fetchStatus.resultSignal != RESULT_VERSION_NEEDED)
      {
        fetchStatus.resultSignal = RESULT_RETRY_DOCUMENT;
      }
    }

    break;
  }

  // Record a fetch activity for the primary document when one of the paths above asked for it.
  if (activityResultCode != null)
  {
    activities.recordActivity(null,ACTIVITY_FETCH,null,documentIdentifier,activityResultCode,fetchStatus.contextMessage,null);
  }
}