public HttpResponse()

in src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java [76:343]


  /**
   * Fetches the given URL over a plain socket: connects (directly or via the
   * configured proxy), performs the SSL handshake for https URLs, sends a
   * <code>GET</code> request and reads status line, headers and content of
   * the response.
   *
   * @param http
   *          protocol implementation providing the configuration (timeout,
   *          proxy, user agent, accept headers, cookies, ...)
   * @param url
   *          URL to fetch, the protocol must be <code>http</code> or
   *          <code>https</code>
   * @param datum
   *          crawl datum of the URL; its metadata is checked for a cookie and
   *          its modified time is used for the If-Modified-Since header
   * @throws ProtocolException
   *           an {@link HttpException} is thrown if the URL scheme is neither
   *           http nor https, if the SSL connection cannot be established, or
   *           if reading the content of a response with status 200 fails
   * @throws IOException
   *           if connecting, sending the request or reading the response
   *           fails
   */
  public HttpResponse(HttpBase http, URL url, CrawlDatum datum)
      throws ProtocolException, IOException {

    this.http = http;
    this.url = url;

    Scheme scheme = null;

    if ("http".equals(url.getProtocol())) {
      scheme = Scheme.HTTP;
    } else if ("https".equals(url.getProtocol())) {
      scheme = Scheme.HTTPS;
    } else {
      throw new HttpException("Unknown scheme (not http/https) for url:" + url);
    }

    if (Http.LOG.isTraceEnabled()) {
      Http.LOG.trace("fetching " + url);
    }

    String path = url.getFile();
    if (!path.startsWith("/")) {
      path = "/" + path;
    }

    // some servers will redirect a request with a host line like
    // "Host: <hostname>:80" to "http://<hostname>/<orig_path>"- they
    // don't want the :80...

    String host = url.getHost();
    int port;
    String portString;
    if (url.getPort() == -1) {
      // no explicit port in the URL: use the default port of the scheme and
      // do not send any port in the request
      if (scheme == Scheme.HTTP) {
        port = 80;
      } else {
        port = 443;
      }
      portString = "";
    } else {
      port = url.getPort();
      portString = ":" + port;
    }
    Socket socket = null;

    try {
      socket = new Socket(); // create the socket
      socket.setSoTimeout(http.getTimeout());

      // connect (to the proxy if one is used for this URL)
      boolean useProxy = http.useProxy(url);
      String sockHost = useProxy ? http.getProxyHost() : host;
      int sockPort = useProxy ? http.getProxyPort() : port;
      InetSocketAddress sockAddr = new InetSocketAddress(sockHost, sockPort);
      socket.connect(sockAddr, http.getTimeout());

      if (scheme == Scheme.HTTPS) {
        SSLSocket sslsocket = null;

        try {
          sslsocket = getSSLSocket(socket, sockHost, sockPort);
          sslsocket.startHandshake();
        } catch (Exception e) {
          Http.LOG.debug("SSL connection to {} failed with: {}", url,
              e.getMessage());
          if ("handshake alert:  unrecognized_name".equals(e.getMessage())) {
            // close the socket of the failed attempt before it is replaced,
            // otherwise it would never be closed
            try {
              socket.close();
            } catch (IOException ignored) {
              // best effort: this connection is abandoned anyway
            }
            try {
              // Reconnect, see NUTCH-2447
              socket = new Socket();
              socket.setSoTimeout(http.getTimeout());
              socket.connect(sockAddr, http.getTimeout());
              // second attempt passes an empty host name to getSSLSocket
              sslsocket = getSSLSocket(socket, "", sockPort);
              sslsocket.startHandshake();
            } catch (Exception ex) {
              // report the failure of the reconnect (not of the first
              // attempt) and keep it as cause
              String msg = "SSL reconnect to " + url + " failed with: "
                  + ex.getMessage();
              throw new HttpException(msg, ex);
            }
          } else {
            String msg = "SSL connect to " + url + " failed with: "
                + e.getMessage();
            throw new HttpException(msg, e);
          }
        }
        socket = sslsocket;
      }

      if (http.isStoreIPAddress()) {
        headers.add("_ip_", sockAddr.getAddress().getHostAddress());
      }

      // make request
      OutputStream req = socket.getOutputStream();

      StringBuilder reqStr = new StringBuilder("GET ");
      if (useProxy) {
        // requests sent to a proxy carry the absolute URL
        reqStr.append(url.getProtocol()).append("://").append(host)
            .append(portString).append(path);
      } else {
        reqStr.append(path);
      }

      if (http.getUseHttp11()) {
        reqStr.append(" HTTP/1.1\r\n");
      } else {
        reqStr.append(" HTTP/1.0\r\n");
      }

      reqStr.append("Host: ");
      reqStr.append(host);
      reqStr.append(portString);
      reqStr.append("\r\n");

      reqStr.append("Accept-Encoding: x-gzip, gzip, deflate\r\n");

      String userAgent = http.getUserAgent();
      if ((userAgent == null) || (userAgent.length() == 0)) {
        if (Http.LOG.isErrorEnabled()) {
          Http.LOG.error("User-agent is not set!");
        }
      } else {
        reqStr.append("User-Agent: ");
        reqStr.append(userAgent);
        reqStr.append("\r\n");
      }

      String acceptLanguage = http.getAcceptLanguage();
      if (!acceptLanguage.isEmpty()) {
        reqStr.append("Accept-Language: ");
        reqStr.append(acceptLanguage);
        reqStr.append("\r\n");
      }

      String acceptCharset = http.getAcceptCharset();
      if (!acceptCharset.isEmpty()) {
        reqStr.append("Accept-Charset: ");
        reqStr.append(acceptCharset);
        reqStr.append("\r\n");
      }

      String accept = http.getAccept();
      if (!accept.isEmpty()) {
        reqStr.append("Accept: ");
        reqStr.append(accept);
        reqStr.append("\r\n");
      }

      if (http.isCookieEnabled()) {
        String cookie = null;

        // a cookie stored in the metadata of the datum takes precedence
        // over the cookie provided by the protocol implementation
        if (datum.getMetaData().containsKey(HttpBase.COOKIE)) {
          cookie = ((Text) datum.getMetaData().get(HttpBase.COOKIE)).toString();
        }

        if (cookie == null) {
          cookie = http.getCookie(url);
        }

        if (cookie != null) {
          reqStr.append("Cookie: ");
          reqStr.append(cookie);
          reqStr.append("\r\n");
        }
      }

      if (http.isIfModifiedSinceEnabled() && datum.getModifiedTime() > 0) {
        reqStr.append(HttpHeaders.IF_MODIFIED_SINCE + ": "
            + HttpDateFormat.toString(datum.getModifiedTime()));
        reqStr.append("\r\n");
      }

      // "signal that this connection will be closed after completion of the
      // response", see https://tools.ietf.org/html/rfc7230#section-6.1
      reqStr.append("Connection: close\r\n");
      reqStr.append("\r\n");

      // store the request in the metadata?
      if (http.isStoreHttpRequest()) {
        headers.add(Response.REQUEST, reqStr.toString());
      }

      // use a fixed charset so that the bytes sent do not depend on the
      // default charset of the platform
      byte[] reqBytes = reqStr.toString().getBytes(StandardCharsets.UTF_8);

      req.write(reqBytes);
      req.flush();

      PushbackInputStream in = // process response
          new PushbackInputStream(
              new BufferedInputStream(socket.getInputStream(),
                  Http.BUFFER_SIZE), Http.BUFFER_SIZE);

      StringBuffer line = new StringBuffer();
      StringBuffer lineSeparator = new StringBuffer();

      // store the http headers verbatim
      if (http.isStoreHttpHeaders()) {
        httpHeaders = new StringBuffer();
      }

      headers.add(FETCH_TIME, Long.toString(System.currentTimeMillis()));

      boolean haveSeenNonContinueStatus = false;
      while (!haveSeenNonContinueStatus) {
        // parse status code line
        try {
          this.code = parseStatusLine(in, line, lineSeparator);
        } catch (HttpException e) {
          Http.LOG.warn("Missing or invalid HTTP status line", e);
          Http.LOG.warn("No HTTP header, assuming HTTP/0.9 for {}", getUrl());
          this.code = 200;
          // push the consumed line back (separator first, unread data is
          // returned in reverse order) so that it is read as content
          in.unread(lineSeparator.toString().getBytes(StandardCharsets.ISO_8859_1));
          in.unread(line.toString().getBytes(StandardCharsets.ISO_8859_1));
          break;
        }
        if (httpHeaders != null) {
          httpHeaders.append(line).append("\r\n");
        }
        // parse headers
        parseHeaders(in, line, httpHeaders);
        haveSeenNonContinueStatus = code != 100; // 100 is "Continue"
      }

      try {
        String transferEncoding = getHeader(Response.TRANSFER_ENCODING);
        if (transferEncoding != null
            && "chunked".equalsIgnoreCase(transferEncoding.trim())) {
          readChunkedContent(in, line);
        } else {
          readPlainContent(in);
        }

        String contentEncoding = getHeader(Response.CONTENT_ENCODING);
        if ("gzip".equals(contentEncoding)
            || "x-gzip".equals(contentEncoding)) {
          content = http.processGzipEncoded(content, url);
        } else if ("deflate".equals(contentEncoding)) {
          content = http.processDeflateEncoded(content, url);
        } else {
          if (Http.LOG.isTraceEnabled()) {
            Http.LOG.trace("fetched " + content.length + " bytes from " + url);
          }
        }
        if (httpHeaders != null) {
          httpHeaders.append("\r\n");
          headers.add(Response.RESPONSE_HEADERS, httpHeaders.toString());
        }
      } catch (IOException | HttpException e) {
        // Headers parsing went fine, but an error occurred while trying to read
        // the body of the request (the body may be malformed)
        if (code != 200) {
          Http.LOG.warn(
              "Ignored exception while reading payload of response with status code "
                  + code + ":",
              e);
          content = null;
          if (httpHeaders != null) {
            httpHeaders.append("\r\n");
            headers.add(Response.RESPONSE_HEADERS, httpHeaders.toString());
          }
        } else {
          // If the page is a "200 OK" response, we do not want to go further
          // with processing the invalid payload.
          throw e;
        }
      }
    } finally {
      if (socket != null) {
        socket.close();
      }
    }

  }