public void executeFetch()

in connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java [421:810]


    /**
    * Execute a fetch of the given path against this connection's server, and record the outcome.
    *
    * The method builds the target URLs from the connection's protocol, server, and port, lazily creates
    * the shared connection manager, builds a fresh HttpClient (credentials, proxy, timeouts, redirect
    * handling), constructs either a GET or a POST request depending on the form data's submit method,
    * and then runs the request on an ExecuteMethodThread.
    *
    * On completion the result is left in instance state: statusCode holds either the HTTP response code
    * or one of the FETCH_* codes, throwable holds the exception for failures handled here, and
    * lastFetchCookies holds the cookies reported by the fetch thread (or the supplied login cookies if
    * the fetch did not get that far).
    *
    *@param urlPath is the path (and optional query) portion of the URL to fetch.
    *@param userAgent is the value sent in the User-Agent header.
    *@param from is the value sent in the From header.
    *@param redirectOK is true if the client should follow redirects.
    *@param host is the virtual host name to use, or null to use the connection's server name.
    *@param formData is the form data to submit, or null for a plain GET.
    *@param loginCookies is the set of cookies to send with the request, or null if none.
    *@throws ManifoldCFException if the submit method is not recognized or the fetch is interrupted.
    *@throws ServiceInterruption if the response code is 408, 503, or 504, or if the fetch times out,
    * gets no response, or cannot connect; the request is expected to be retried later.
    */
    public void executeFetch(String urlPath, String userAgent, String from,
      boolean redirectOK, String host, FormData formData,
      LoginCookies loginCookies)
      throws ManifoldCFException, ServiceInterruption
    {
      // Default ports (and an unspecified port) are neither displayed in the URL nor passed to HttpHost.
      boolean isDefaultPort = (protocol.equals("http") && port == 80) ||
        (protocol.equals("https") && port == 443);
      int hostPort;
      String displayedPort;
      if (port != -1 && !isDefaultPort)
      {
        displayedPort = ":"+Integer.toString(port);
        hostPort = port;
      }
      else
      {
        displayedPort = "";
        hostPort = -1;
      }

      StringBuilder sb = new StringBuilder(protocol);
      sb.append("://").append(server).append(displayedPort).append(urlPath);
      String fetchUrl = sb.toString();

      HttpHost fetchHost = new HttpHost(server,hostPort,protocol);
      HttpHost hostHost;

      // When a virtual host is given, it replaces the server name both in the recorded URL and in the target host.
      if (host != null)
      {
        sb.setLength(0);
        sb.append(protocol).append("://").append(host).append(displayedPort).append(urlPath);
        myUrl = sb.toString();
        hostHost = new HttpHost(host,hostPort,protocol);
      }
      else
      {
        myUrl = fetchUrl;
        hostHost = fetchHost;
      }

      // The connection manager is created once and reused by later fetches on this connection.
      if (connManager == null)
      {
        // Set up scheme.  The https factory is only needed when the manager is built, so build it here.
        SSLConnectionSocketFactory myFactory = new SSLConnectionSocketFactory(new InterruptibleSocketFactory(httpsSocketFactory,connectionTimeoutMilliseconds),
          NoopHostnameVerifier.INSTANCE);

        PoolingHttpClientConnectionManager poolingConnManager = new PoolingHttpClientConnectionManager(RegistryBuilder.<ConnectionSocketFactory>create()
          .register("http", PlainConnectionSocketFactory.getSocketFactory())
          .register("https", myFactory)
          .build());
        poolingConnManager.setDefaultMaxPerRoute(1);
        poolingConnManager.setValidateAfterInactivity(2000);
        poolingConnManager.setDefaultSocketConfig(SocketConfig.custom()
          .setTcpNoDelay(true)
          .setSoTimeout(socketTimeoutMilliseconds)
          .build());
        connManager = poolingConnManager;
      }

      // Always capture the start time, so the elapsed-time debug message below is meaningful even if
      // the debug level is switched on part way through this call.
      final long startTime = System.currentTimeMillis();
      if (Logging.connectors.isDebugEnabled())
        Logging.connectors.debug("WEB: Waiting for an HttpClient object");

      CredentialsProvider credentialsProvider = new BasicCredentialsProvider();

      // Set up authentication to use
      if (authentication != null)
      {
        if (Logging.connectors.isDebugEnabled())
          Logging.connectors.debug("WEB: For "+myUrl+", discovered matching authentication credentials");
        credentialsProvider.setCredentials(AuthScope.ANY,
          authentication.makeCredentialsObject(host));
      }

      RequestConfig.Builder requestBuilder = RequestConfig.custom()
        .setCircularRedirectsAllowed(true)
        .setSocketTimeout(socketTimeoutMilliseconds)
        .setExpectContinueEnabled(true)
        .setConnectTimeout(connectionTimeoutMilliseconds)
        .setConnectionRequestTimeout(socketTimeoutMilliseconds)
        .setCookieSpec(CookieSpecs.STANDARD)
        .setRedirectsEnabled(redirectOK);

      // If there's a proxy, set that too.
      if (proxyHost != null && proxyHost.length() > 0)
      {
        // Configure proxy authentication
        if (proxyAuthUsername != null && proxyAuthUsername.length() > 0)
        {
          credentialsProvider.setCredentials(
            new AuthScope(proxyHost, proxyPort),
            new NTCredentials(proxyAuthUsername, (proxyAuthPassword==null)?"":proxyAuthPassword, currentHost, (proxyAuthDomain==null)?"":proxyAuthDomain));
        }

        HttpHost proxy = new HttpHost(proxyHost, proxyPort);

        requestBuilder.setProxy(proxy);
      }

      // A new client is built for every fetch; only the connection manager is shared.  Automatic
      // retries are disabled.
      httpClient = HttpClients.custom()
        .setConnectionManager(connManager)
        .disableAutomaticRetries()
        .setDefaultRequestConfig(requestBuilder.build())
        .setDefaultCredentialsProvider(credentialsProvider)
        .setRequestExecutor(new HttpRequestExecutor(socketTimeoutMilliseconds))
        .setRedirectStrategy(new LaxRedirectStrategy())
        .build();

      if (Logging.connectors.isDebugEnabled())
      {
        if (host != null)
          Logging.connectors.debug("WEB: For "+myUrl+", setting virtual host to "+host);
        Logging.connectors.debug("WEB: Got an HttpClient object after "+Long.toString(System.currentTimeMillis()-startTime)+" ms.");
      }

      startFetchTime = System.currentTimeMillis();

      // Without form data the fetch is a plain GET; otherwise the form data decides the method.
      int pageFetchMethod = FormData.SUBMITMETHOD_GET;
      if (formData != null)
        pageFetchMethod = formData.getSubmitMethod();
      switch (pageFetchMethod)
      {
      case FormData.SUBMITMETHOD_GET:
      {
        // MUST be just the path, or apparently we wind up resetting the HostConfiguration
        String fullUrlPath = buildGetUrlPath(urlPath,formData);
        if (Logging.connectors.isDebugEnabled())
          Logging.connectors.debug("WEB: Get method for '"+fullUrlPath+"'");
        fetchMethod = new HttpGet(fullUrlPath);
        break;
      }
      case FormData.SUBMITMETHOD_POST:
      {
        if (Logging.connectors.isDebugEnabled())
          Logging.connectors.debug("WEB: Post method for '"+urlPath+"'");
        // MUST be just the path, or apparently we wind up resetting the HostConfiguration
        HttpPost postMethod = new HttpPost(urlPath);
        List<NameValuePair> nvps = new ArrayList<NameValuePair>();

        // Add parameters to post variables
        if (formData != null)
        {
          Iterator<?> iter = formData.getElementIterator();
          while (iter.hasNext())
          {
            FormDataElement e = (FormDataElement)iter.next();
            String param = e.getElementName();
            String value = e.getElementValue();
            // NOTE(review): this debug line writes raw form values to the log; login forms may carry
            // passwords here.  Confirm this is acceptable for debug-level output.
            if (Logging.connectors.isDebugEnabled())
              Logging.connectors.debug("WEB: Post parameter name '"+param+"' value '"+value+"' for '"+urlPath+"'");
            nvps.add(new BasicNameValuePair(param,value));
          }
        }
        postMethod.setEntity(new UrlEncodedFormEntity(nvps, StandardCharsets.UTF_8));
        fetchMethod = postMethod;
        break;
      }
      default:
        throw new ManifoldCFException("Illegal method type: "+Integer.toString(pageFetchMethod));
      }

      // Set all appropriate headers and parameters
      fetchMethod.setHeader(new BasicHeader("User-Agent",userAgent));
      fetchMethod.setHeader(new BasicHeader("From",from));
      fetchMethod.setHeader(new BasicHeader("Accept","*/*"));
      fetchMethod.setHeader(new BasicHeader("Accept-Encoding","gzip,deflate"));

      // Use a custom cookie store
      CookieStore cookieStore = new OurBasicCookieStore();
      // If we have any cookies to set, set them.
      if (loginCookies != null)
      {
        int cookieCount = loginCookies.getCookieCount();
        if (Logging.connectors.isDebugEnabled())
          Logging.connectors.debug("WEB: Adding "+Integer.toString(cookieCount)+" cookies for '"+urlPath+"'");
        for (int h = 0; h < cookieCount; h++)
        {
          // NOTE(review): cookies are logged in full at debug level; they may include session tokens.
          if (Logging.connectors.isDebugEnabled())
            Logging.connectors.debug("WEB:  Cookie '"+loginCookies.getCookie(h)+"' added");
          cookieStore.addCookie(loginCookies.getCookie(h));
        }
      }

      // Copy out the current cookies, in case the fetch fails
      lastFetchCookies = loginCookies;

      // Create the thread
      methodThread = new ExecuteMethodThread(this, fetchThrottler, httpClient, hostHost, fetchMethod, cookieStore);
      try
      {
        methodThread.start();
        threadStarted = true;
        try
        {
          statusCode = methodThread.getResponseCode();
          lastFetchCookies = methodThread.getCookies();
          switch (statusCode)
          {
          case HttpStatus.SC_REQUEST_TIMEOUT:
          case HttpStatus.SC_GATEWAY_TIMEOUT:
          case HttpStatus.SC_SERVICE_UNAVAILABLE:
            // Temporary service interruption
            // May want to make the retry time a parameter someday
            long currentTime = System.currentTimeMillis();
            throw new ServiceInterruption("Http response temporary error on '"+myUrl+"': "+Integer.toString(statusCode),new ManifoldCFException("Service unavailable (code "+Integer.toString(statusCode)+")"),
              currentTime + TIME_2HRS, currentTime + TIME_1DAY, -1, false);
          default:
            // Every other response code (200, 401, 403, 404, 500, ...) is simply left in statusCode
            // for the caller to inspect.
            return;
          }
        }
        catch (InterruptedException e)
        {
          // Stop the fetch thread and forget it before reporting the interruption below.
          methodThread.interrupt();
          methodThread = null;
          threadStarted = false;
          throw e;
        }

      }
      catch (InterruptedException e)
      {
        // Drop the current connection on the floor, so it cannot be reused.
        fetchMethod = null;
        throwable = new ManifoldCFException("Interrupted: "+e.getMessage(),e);
        statusCode = FETCH_INTERRUPTED;
        throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
      }
      catch (java.net.SocketTimeoutException e)
      {
        throwable = e;
        long currentTime = System.currentTimeMillis();
        throw new ServiceInterruption("Timed out waiting for IO for '"+myUrl+"': "+e.getMessage(), e, currentTime + TIME_5MIN,
          currentTime + TIME_2HRS,-1,false);
      }
      catch (ConnectTimeoutException e)
      {
        throwable = e;
        long currentTime = System.currentTimeMillis();
        throw new ServiceInterruption("Timed out waiting for connection for '"+myUrl+"': "+e.getMessage(), e, currentTime + TIME_5MIN,
          currentTime + TIME_2HRS,-1,false);
      }
      catch (InterruptedIOException e)
      {
        // The two timeout cases above are caught first; anything else of this type is treated as an
        // interruption.  Keep the original exception as the cause so the stack trace is not lost.
        throwable = new ManifoldCFException("Interrupted: "+e.getMessage(),e);
        statusCode = FETCH_INTERRUPTED;
        throw new ManifoldCFException("Interrupted",e,ManifoldCFException.INTERRUPTED);
      }
      catch (RedirectException e)
      {
        throwable = e;
        statusCode = FETCH_CIRCULAR_REDIRECT;
        return;
      }
      catch (NoHttpResponseException e)
      {
        throwable = e;
        long currentTime = System.currentTimeMillis();
        throw new ServiceInterruption("Timed out waiting for response for '"+myUrl+"': "+e.getMessage(), e, currentTime + TIME_15MIN,
          currentTime + TIME_2HRS,-1,false);
      }
      catch (java.net.ConnectException e)
      {
        throwable = e;
        long currentTime = System.currentTimeMillis();
        throw new ServiceInterruption("Timed out waiting for a connection for '"+myUrl+"': "+e.getMessage(), e, currentTime + TIME_2HRS,
          currentTime + TIME_6HRS,-1,false);
      }
      catch (javax.net.ssl.SSLException e)
      {
        // Probably this is an incorrectly configured trust store
        throwable = new ManifoldCFException("SSL handshake error: "+e.getMessage()+"; check your connection's Certificate configuration",e);
        statusCode = FETCH_IO_ERROR;
        return;
      }
      catch (IOException e)
      {
        // Treat this as a bad url.  We don't know what happened, but it isn't something we are going to naively
        // retry on.
        throwable = e;
        statusCode = FETCH_IO_ERROR;
        return;
      }
      catch (Throwable e)
      {
        Logging.connectors.debug("WEB: Caught an unexpected exception: "+e.getMessage(),e);
        throwable = e;
        statusCode = FETCH_UNKNOWN_ERROR;
        return;
      }

    }

    /**
    * Build the path to use for a GET request, appending any form data as query parameters.
    *
    *@param urlPath is the base path, which may already contain a query string.
    *@param formData is the form data to append, or null if there is none.
    *@return the path with parameters appended and any run of leading slashes reduced to one.
    *@throws ManifoldCFException if a parameter cannot be encoded.
    */
    private String buildGetUrlPath(String urlPath, FormData formData)
      throws ManifoldCFException
    {
      String fullUrlPath;
      if (formData == null)
        fullUrlPath = urlPath;
      else
      {
        // Add additional parameters to url path
        StringBuilder psb = new StringBuilder(urlPath);
        // The first parameter starts the query string unless the path already has one.
        char appendChar = (urlPath.indexOf("?") == -1) ? '?' : '&';
        Iterator<?> iter = formData.getElementIterator();
        while (iter.hasNext())
        {
          FormDataElement el = (FormDataElement)iter.next();
          psb.append(appendChar);
          appendChar = '&';
          psb.append(encodeUrlComponent(el.getElementName()));
          // A null value produces a bare parameter name with no '='.
          String value = el.getElementValue();
          if (value != null)
            psb.append('=').append(encodeUrlComponent(value));
        }
        fullUrlPath = psb.toString();
      }
      // Hack; apparently httpclient treats // as a protocol specifier and so it rips off the first section of the path in that case.
      while (fullUrlPath.startsWith("//"))
        fullUrlPath = fullUrlPath.substring(1);
      return fullUrlPath;
    }

    /**
    * URL-encode a query parameter name or value as UTF-8, which is the same charset the POST path
    * uses for its form entity.
    *
    *@param text is the text to encode.
    *@return the encoded text.
    *@throws ManifoldCFException if the UTF-8 encoding is reported as unsupported.
    */
    private String encodeUrlComponent(String text)
      throws ManifoldCFException
    {
      try
      {
        return URLEncoder.encode(text,StandardCharsets.UTF_8.name());
      }
      catch (java.io.UnsupportedEncodingException e)
      {
        throw new ManifoldCFException("UTF-8 encoding unavailable: "+e.getMessage(),e);
      }
    }