protected void loginAndFetch()

in connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java [798:1282]
351 lines of code
81 McCabe index (conditional complexity)

  /** Fetch the primary document, walking through a session login sequence if one is encountered.
  * The method loops: it fetches the current URI, inspects the fetched page against the login rules
  * supplied by the session credential, and then either (a) stops, leaving the outcome in fetchStatus,
  * (b) follows the next form/link/redirection/content target of the login sequence, or (c) leaves the
  * login sequence and goes back to re-fetch the primary document.
  * Outcomes are reported exclusively through fetchStatus (resultSignal, contextMessage, contextException,
  * checkSum, headerData, sessionState) and through activity records; nothing is returned.
  *@param fetchStatus is the in/out status object; its sessionState is read on entry and updated as the
  *  login sequence is entered or left.
  *@param activities is used for activity recording and for beginning/completing the login event sequence.
  *@param documentIdentifier is the URL of the primary document being fetched.
  *@param sessionCredential describes the login sequence that applies to the document, or null if there is none.
  *@param globalSequenceEvent is the event name passed to beginEventSequence/completeEventSequence.
  */
  protected void loginAndFetch(FetchStatus fetchStatus, IProcessActivity activities, String documentIdentifier, SequenceCredentials sessionCredential, String globalSequenceEvent)
    throws ManifoldCFException, ServiceInterruption
  {
    long currentTime = System.currentTimeMillis();
    // Here's the maximum number of connections we are going to allow.
    int connectionLimit = 200;

    // The URI we are currently fetching.  Starts as the primary document, but changes while a login sequence is followed.
    String currentURI = documentIdentifier;

    // Login pages are special in that I *don't* require them to do a robots check.  The reason why is because it is conceivable that a
    // site may inadvertently exclude them via robots, and yet allow content pages to be scanned.  This would effectively exclude session login
    // for that site if we adhered to the strict policy.  Since login pages have to be exclusively identified as being special, explicit
    // permission is effectively granted by the user in any case.

    // The result code to be activity logging, or null if no activity logging desired.
    String activityResultCode = null;
    // Form data to submit with the next fetch (set when the previous page matched a login form rule)
    FormData formData = null;
    
    // NOTE(review): failures that happen before or instead of the fetch itself (malformed URL, DNS, robots) while
    // sessionState is SESSIONSTATE_LOGIN leave the session state untouched; presumably the caller completes the
    // event sequence in that case - confirm against the caller.
    while (true)
    {
      URL url;
      try
      {
        // Do the mapping from the current host name to the IP address
        url = new URL(currentURI);
      }
      catch (MalformedURLException e)
      {
        // currentURI is malformed.
        // If the document was the primary, we should remove it from the queue.  But if it's part of a login sequence, we'd better just retry later.
        // The "retry later" conversion for non-primary URIs is applied after the loop, so that this exit gets it too.
        fetchStatus.contextMessage = "was not a valid URL: "+e.getMessage();
        fetchStatus.contextException = e;
        activityResultCode = "-12";
        fetchStatus.resultSignal = RESULT_NO_DOCUMENT;
        break;
      }

      String hostName = url.getHost();
      StringBuilder ipAddressBuffer = new StringBuilder();
      int ipAddressStatus = lookupIPAddress(currentURI,activities,hostName,currentTime,ipAddressBuffer);
      if (ipAddressStatus == RESULTSTATUS_TRUE)
      {
        String ipAddress = ipAddressBuffer.toString();
        String protocol = url.getProtocol();
        int port = url.getPort();
        if (port == -1)
          port = url.getDefaultPort();

        // Try to fetch the document.  We'll need its bin names first.
        String[] binNames = getBinNames(currentURI);

        // Get the credentials for this document (if any)
        PageCredentials credential = getPageCredential(currentURI);
        IKeystoreManager trustStore;
        // Save effort - only bother to get a trust store if this is https
        if (protocol.equalsIgnoreCase("https"))
          // null return is possible here; indicates "trust everything"
          trustStore = getTrustStore(currentURI);
        else
          trustStore = KeystoreManagerFactory.make("");
        // Check robots, if enabled, and if we're fetching the primary document identifier.  See comment above.
        // The short-circuit order matters: checkFetchAllowed() is only called for the primary document when robots usage is enabled.
        int robotsStatus = RESULTSTATUS_TRUE;
        if (!documentIdentifier.equals(currentURI) || robotsUsage < ROBOTS_DATA || (robotsStatus = checkFetchAllowed(documentIdentifier,protocol,ipAddress,port,credential,trustStore,hostName,binNames,currentTime,
          url.getFile(),activities,connectionLimit,proxyHost,proxyPort,proxyAuthDomain,proxyAuthUsername,proxyAuthPassword)) == RESULTSTATUS_TRUE)
        {
          // Passed the robots check!

          // Find whatever login cookies apply.  This will be null if there is no session credential for this document.
          LoginCookies lc = null;
          if (sessionCredential != null)
          {
            lc = cookieManager.readCookies(sessionCredential.getSequenceKey());
          }

          // Prepare to perform the fetch, and decide what to do with the document.
          //
          IThrottledConnection connection = ThrottledFetcher.getConnection(currentContext,
            throttleGroupName,
            protocol,ipAddress,port,
            credential,trustStore,throttleDescription,binNames,connectionLimit,
            proxyHost,proxyPort,proxyAuthDomain,proxyAuthUsername,proxyAuthPassword,
            socketTimeoutMilliseconds,connectionTimeoutMilliseconds,
            activities);
          try
          {
            connection.beginFetch((fetchStatus.sessionState == SESSIONSTATE_LOGIN)?FETCH_LOGIN:FETCH_STANDARD);
            try
            {
              // Execute the fetch!
              connection.executeFetch(url.getFile(),userAgent,from,
                false,hostName,formData,lc);
              int response = connection.getResponseCode();

              if (response == 200 || response == 302 || response == 301)
              {
                // If this was part of the login sequence, update the cookies regardless of what else happens
                if (fetchStatus.sessionState == SESSIONSTATE_LOGIN)
                {
                  // Update the cookies
                  LoginCookies lastFetchCookies = connection.getLastFetchCookies();
                  cookieManager.updateCookies(sessionCredential.getSequenceKey(),lastFetchCookies);
                }

                // Decide whether to exclude this document based on what we see here.
                // Basically, we want to get rid of everything that we (a) don't know what
                // to do with in the ingestion system, and (b) we can't get useful links from.

                String contentType = extractContentType(connection.getResponseHeader("Content-Type"));

                if (isContentInteresting(activities,currentURI,response,contentType))
                {
                  // Treat it as real, and cache it.
                  fetchStatus.checkSum = cache.addData(activities,currentURI,connection);
                  fetchStatus.headerData = connection.getResponseHeaders();
                  fetchStatus.resultSignal = RESULT_VERSION_NEEDED;
                  activityResultCode = null;
                }
                else
                {
                  fetchStatus.contextMessage = "it had the wrong content type ('"+contentType+"')";
                  fetchStatus.resultSignal = RESULT_NO_DOCUMENT;
                  activityResultCode = activities.EXCLUDED_MIMETYPE;
                }
              }
              else
              {
                // We got some kind of http error code.
                // We don't want to remove it from the queue entirely, because that would cause us to lose track of the item, and therefore lose
                // control of all scheduling around it.  Instead, we leave it on the queue and give it an empty version string; that will lead it to be
                // reprocessed without fail on the next scheduled check.
                // Decode response body to the extent we can
                String contentType = extractContentType(connection.getResponseHeader("Content-Type"));
                String encoding = extractEncoding(contentType);
                if (encoding == null)
                  encoding = StandardCharsets.UTF_8.name();
                String decodedResponse = "undecodable";
                try
                {
                  decodedResponse = "'"+connection.getLimitedResponseBody(1024,encoding)+"'";
                }
                catch (ManifoldCFException e)
                {
                  // Eat this exception unless it is an interrupt
                  if (e.getErrorCode() == ManifoldCFException.INTERRUPTED)
                    throw e;
                  connection.noteInterrupted(e);
                }
                catch (ServiceInterruption e)
                {
                  // Eat this exception too
                  connection.noteInterrupted(e);
                }
                fetchStatus.contextMessage = "it failed to fetch (status="+Integer.toString(response)+", message="+decodedResponse+")";
                fetchStatus.resultSignal = RESULT_NO_VERSION;
                activityResultCode = null;
              }
            }
            catch (ManifoldCFException e)
            {
              connection.noteInterrupted(e);
              throw e;
            }
            catch (ServiceInterruption e)
            {
              connection.noteInterrupted(e);
              throw e;
            }
            finally
            {
              connection.doneFetch(activities);
            }
          }
          finally
          {
            connection.close();
          }

          // State transition logic.  If the result indicates a successful fetch so far, we need to decide where to go next.
          // This happens AFTER we've released all the connections, because it's conceivable that processing here might be
          // significant, and we don't want to tie things up unnecessarily.
          String preferredLink = null;
          String preferredRedirection = null;
          formData = null;
          String contentLink = null;
          if (fetchStatus.resultSignal == RESULT_VERSION_NEEDED)
          {
            // If we get here, we know:
            // (a) There's a cached version of the page on disk we can read as many times as necessary;
            // (b) The saved cookies have not been updated yet, so we'll need to do that where appropriate.

            // The way we determine if we're in the login sequence for a site is by TWO criteria:
            // (1) The URI must match the specified regular expression, and
            // (2) The data from that URI must contain the specified form or link information.
            // We use the same criteria to look for the exit from a sequence.  So, in essence, we're *always* going to need to know whether we're
            // officially in the sequence, or not, so we evaluate it always.
            boolean isLoginPage = false;
            if (sessionCredential != null)
            {
              Iterator iterMatches = sessionCredential.findLoginParameters(currentURI);
              boolean seenAnything = false;
              boolean seenFormError = false;
              boolean seenLinkError = false;
              boolean seenRedirectionError = false;
              boolean seenContentError = false;
              while (iterMatches.hasNext())
              {
                seenAnything = true;
                LoginParameters lp = (LoginParameters)iterMatches.next();
                // Note that more than one of the rules may match.
                // In that case, a clear order of precedence applies: form, then link, then redirection, then content.
                // If more than one of the same kind of rule is seen, then all bets are off, and nothing of that kind is
                // matched.

                // Parse the page; it had better match up!  Otherwise we get null back.
                FormData newFormData = findHTMLForm(currentURI,lp);
                if (newFormData != null)
                {
                  if (formData != null)
                  {
                    // Oops, more than one matching form rule.  Complain.
                    seenFormError = true;
                    formData = null;
                  }
                  else if (!seenFormError)
                  {
                    // A form overrides links, redirection, or content
                    formData = newFormData;
                    preferredLink = null;
                    preferredRedirection = null;
                  }
                }
                else
                {
                  // Look for the preferred link instead.
                  String newPreferredLink = findHTMLLinkURI(currentURI,lp);
                  if (newPreferredLink != null)
                  {
                    if (preferredLink != null)
                    {
                      // Oops, more than one matching link rule.
                      seenLinkError = true;
                      preferredLink = null;
                    }
                    else if (!seenLinkError && !seenFormError && formData == null)
                    {
                      // Link overrides redirection and content
                      preferredLink = newPreferredLink;
                      preferredRedirection = null;
                    }
                  }
                  else
                  {
                    // Look for the preferred redirection.
                    String newPreferredRedirection = findPreferredRedirectionURI(currentURI,lp);
                    if (newPreferredRedirection != null)
                    {
                      if (preferredRedirection != null)
                      {
                        // More than one matching redirection rule.
                        seenRedirectionError = true;
                        preferredRedirection = null;
                      }
                      else if (!seenRedirectionError && !seenLinkError && !seenFormError && formData == null && preferredLink == null)
                      {
                        preferredRedirection = newPreferredRedirection;
                      }
                    }
                    else
                    {
                      // Look for the content in the page.  The link returned may be an empty string, if matching content
                      // is discovered but there is no override.  It will be null if the content is not found.
                      String newContentLink = findSpecifiedContent(currentURI,lp);
                      if (newContentLink != null)
                      {
                        if (contentLink != null)
                        {
                          // More than one matching content rule.
                          seenContentError = true;
                          contentLink = null;
                        }
                        else if (!seenContentError && !seenRedirectionError && !seenLinkError && !seenFormError && formData == null && preferredLink == null && preferredRedirection == null)
                        {
                          contentLink = newContentLink;
                        }
                      }
                    }
                  }
                }
              }

              // Now, evaluate all the data and pick the right rule
              if (formData != null)
              {
                // We found the right form!  And, we filled it in.  So now we enter the "login sequence".
                if (Logging.connectors.isDebugEnabled())
                  Logging.connectors.debug("WEB: Document '"+currentURI+"' matches form, so determined to be login page for sequence '"+sessionCredential.getSequenceKey()+"'");
                isLoginPage = true;
              }
              else if (preferredLink != null)
              {
                if (Logging.connectors.isDebugEnabled())
                  Logging.connectors.debug("WEB: Document '"+currentURI+"' matches preferred link, so determined to be login page for sequence '"+sessionCredential.getSequenceKey()+"'");
                isLoginPage = true;
              }
              else if (preferredRedirection != null)
              {
                if (Logging.connectors.isDebugEnabled())
                  Logging.connectors.debug("WEB: Document '"+currentURI+"' matches preferred redirection, so determined to be login page for sequence '"+sessionCredential.getSequenceKey()+"'");
                isLoginPage = true;
              }
              else if (contentLink != null)
              {
                if (Logging.connectors.isDebugEnabled())
                  Logging.connectors.debug("WEB: Document '"+currentURI+"' matches content, so determined to be login page for sequence '"+sessionCredential.getSequenceKey()+"'");
                isLoginPage = true;
              }
              else
              {
                if (seenAnything && Logging.connectors.isDebugEnabled())
                  Logging.connectors.debug("WEB: Document '"+currentURI+"' did not match expected form, link, redirection, or content for sequence '"+sessionCredential.getSequenceKey()+"'");
              }
            }

            // Should we do a state transition into the "logging in" state?
            if (fetchStatus.sessionState == SESSIONSTATE_NORMAL && isLoginPage)
            {
              // Entering the login sequence.  Make sure we actually can do this...
              if (activities.beginEventSequence(globalSequenceEvent))
              {
                if (Logging.connectors.isDebugEnabled())
                  Logging.connectors.debug("WEB: For document '"+documentIdentifier+"', beginning login sequence '"+sessionCredential.getSequenceKey()+"'");

                activities.recordActivity(null,WebcrawlerConnector.ACTIVITY_LOGON_START,
                  null,sessionCredential.getSequenceKey(),"OK",null,null);

                // Transition to the right state, etc.
                fetchStatus.sessionState = SESSIONSTATE_LOGIN;
              }
              else
              {
                if (Logging.connectors.isDebugEnabled())
                  Logging.connectors.debug("WEB: For document '"+documentIdentifier+"', login sequence '"+sessionCredential.getSequenceKey()+"' was already in progress.");

                // Didn't make it in.  Retry the main URI when the proper conditions are met.
                // We don't want the cached data anymore.
                cache.deleteData(currentURI);
                fetchStatus.contextMessage = "login sequence already in progress";
                fetchStatus.resultSignal = RESULT_RETRY_DOCUMENT;
                activityResultCode = null;
              }
            }
            else if (fetchStatus.sessionState == SESSIONSTATE_LOGIN && !isLoginPage)
            {
              //== Exit login mode ==
              activities.completeEventSequence(globalSequenceEvent);
              activities.recordActivity(null,WebcrawlerConnector.ACTIVITY_LOGON_END,
                null,sessionCredential.getSequenceKey(),"OK",null,null);
              fetchStatus.sessionState = SESSIONSTATE_NORMAL;
              // Make sure we go back and try the original document again, if we happened to have been directed somewhere else
              if (!currentURI.equals(documentIdentifier))
              {
                cache.deleteData(currentURI);
                currentURI = documentIdentifier;
                continue;
              }
              // Otherwise, the last fetch stands on its own.  Fall through, and allow processing and link extraction
            }

            // Now, based on the session state and the document contents, decide how to proceed
            if (fetchStatus.resultSignal == RESULT_VERSION_NEEDED && fetchStatus.sessionState == SESSIONSTATE_LOGIN)
            {
              // We are dealing with a login page!

              // We need to (a) figure out what the next URI should be, and (b) record form information that it might need.
              // This is a bit dicey because there's really
              // no good way to *guarantee* that we pick the right one, if there's more than one available.
              // What we do is the following:
              //
              // (a) We look for matching forms.  If we found one, we submit it.
              // (b) Otherwise, we use the matching preferred link, if any.
              // (c) Otherwise, we use the matching preferred redirection, if any.
              // (d) Otherwise, we use the link that came with matching page content, if any.
              //
              // Note well that it's probably going to be pretty easy to get this code stuck in an infinite login sequence.
              // While that won't be a problem performance-wise (because everything is appropriately throttled), it
              // is obviously not ideal, and furthermore, it will not be possible to crawl a site for which this occurs.
              //
              // Longer time (and with higher complexity) we can solve this problem by allowing the user to *specify*
              // which link they want us to pick for a page.  Hopefully this would not be necessary.

              // Locate the next target URI.
              String targetURI;
              if (formData != null)
                targetURI = formData.getActionURI();
              else if (preferredLink != null)
                targetURI = preferredLink;
              else if (preferredRedirection != null)
                targetURI = preferredRedirection;
              else /* if (contentLink != null) */
                targetURI = contentLink;

              // Definitely we don't want the cached data anymore
              cache.deleteData(currentURI);

              // If the target URI is null, it means we could not find a suitable link.  If target URI is "",
              // it means that we found a designated logon page but the description did not include a link we
              // could chase.  Either way, treat this exactly the same
              // way as if the link found exited login mode.
              if (targetURI == null || targetURI.length() == 0)
              {
                //== Exiting login mode ==
                activities.completeEventSequence(globalSequenceEvent);
                activities.recordActivity(null,WebcrawlerConnector.ACTIVITY_LOGON_END,
                  null,sessionCredential.getSequenceKey(),"NEXTLINKNOTFOUND","Could not find a usable link to the next page: "+fetchStatus.contextMessage,null);
                fetchStatus.sessionState = SESSIONSTATE_NORMAL;
                // Make sure we go back and try the original document again, no matter where we got directed to
                currentURI = documentIdentifier;
              }
              else
              {
                currentURI = targetURI;
              }
              continue;
            }
          }
          else if (fetchStatus.sessionState == SESSIONSTATE_LOGIN)
          {
            // The fetch did not produce a usable page (resultSignal is something other than RESULT_VERSION_NEEDED)
            // while we were inside a login sequence.
            // The next URL we fetched in the logon sequence turned out to be unsuitable.
            // That means that the logon sequence is fundamentally wrong.  The session thus ends,
            // and of course it will retry, but that's neither here nor there.
            // This check must live outside the RESULT_VERSION_NEEDED block above; inside it, the condition can never be true.
            //== Exiting login mode ==
            activities.completeEventSequence(globalSequenceEvent);
            activities.recordActivity(null,WebcrawlerConnector.ACTIVITY_LOGON_END,
              null,sessionCredential.getSequenceKey(),"LINKTARGETUNSUITABLE","Page was unsuitable for a login sequence because: "+fetchStatus.contextMessage,null);
            fetchStatus.sessionState = SESSIONSTATE_NORMAL;
            // Fall through, leaving everything else alone.
          }

        }
        else if (robotsStatus == RESULTSTATUS_FALSE)
        {
          activityResultCode = "-11";
          fetchStatus.contextMessage = "robots.txt says so";
          fetchStatus.resultSignal = RESULT_NO_DOCUMENT;
        }
        else
        {
          // Robots prerequisite in progress
          activityResultCode = null;
          fetchStatus.resultSignal = RESULT_RETRY_DOCUMENT;
          fetchStatus.contextMessage = "robots prerequisite already in progress";
        }
      }
      else if (ipAddressStatus == RESULTSTATUS_FALSE)
      {
        activityResultCode = "-10";
        fetchStatus.contextMessage = "ip address not found";
        fetchStatus.resultSignal = RESULT_NO_DOCUMENT;
      }
      else
      {
        // DNS prerequisite in progress
        activityResultCode = null;
        fetchStatus.contextMessage = "dns prerequisite already in progress";
        fetchStatus.resultSignal = RESULT_RETRY_DOCUMENT;
      }

      break;
    }

    // If we fail on a document that's not the primary, the result should be to retry the primary later.
    // This is done after the loop so that every way out of it (including a malformed login-sequence URL) is covered.
    if (!currentURI.equals(documentIdentifier))
    {
      activityResultCode = null;
      if (fetchStatus.contextMessage != null)
        fetchStatus.contextMessage = "for login sequence url '"+currentURI+"': "+fetchStatus.contextMessage;
      if (fetchStatus.resultSignal != RESULT_VERSION_NEEDED)
        fetchStatus.resultSignal = RESULT_RETRY_DOCUMENT;
    }

    // Now, look at the result signal, and set up the version appropriately.
    if (activityResultCode != null)
      activities.recordActivity(null,ACTIVITY_FETCH,null,documentIdentifier,activityResultCode,fetchStatus.contextMessage,null);
    
  }