public ProtocolOutput getProtocolOutput()

in src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java [354:443]


  /**
   * Requests the given URL and converts the HTTP response into a
   * {@link ProtocolOutput}.
   *
   * <p>
   * The HTTP status code is always stored in the datum's metadata under
   * {@code Nutch.PROTOCOL_STATUS_CODE_KEY}; the elapsed request time is stored
   * under {@code RESPONSE_TIME} when {@code responseTime} is set. The response
   * code is then mapped as follows:
   * <ul>
   * <li>200: plain output carrying the content</li>
   * <li>3xx: the {@code Location} header (also tried in lowercase, defaulting
   * to the empty string) is resolved against the requested URL; 302/303/307
   * yield {@code TEMP_MOVED}, 304 yields {@code NOTMODIFIED}, every other 3xx
   * code yields {@code MOVED}</li>
   * <li>400 and 410: {@code GONE}</li>
   * <li>401: {@code ACCESS_DENIED}</li>
   * <li>404: {@code NOTFOUND}</li>
   * <li>anything else: {@code EXCEPTION}</li>
   * </ul>
   * Any {@link Throwable} raised while fetching or mapping is logged and
   * reported as a {@link ProtocolOutput} with {@code null} content and a
   * status built from that throwable.
   *
   * @param url
   *          the URL to fetch
   * @param datum
   *          crawl datum passed to the request; its metadata is updated
   * @return the fetched content together with the derived protocol status
   */
  public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {

    final String requestedUrl = url.toString();
    try {
      URL target = new URL(requestedUrl);

      // issue the request and, if configured, record how long it took
      final long requestStart = System.currentTimeMillis();
      final Response httpResponse = getResponse(target, datum, false);

      if (this.responseTime) {
        final long elapsedMillis = System.currentTimeMillis() - requestStart;
        datum.getMetaData().put(RESPONSE_TIME,
            new IntWritable((int) elapsedMillis));
      }

      final int statusCode = httpResponse.getCode();
      datum.getMetaData().put(Nutch.PROTOCOL_STATUS_CODE_KEY,
          new Text(Integer.toString(statusCode)));

      // a missing body is represented by the shared empty array
      byte[] body = httpResponse.getContent();
      if (body == null) {
        body = EMPTY_CONTENT;
      }
      final Content page = new Content(target.toString(), target.toString(),
          body, httpResponse.getHeader("Content-Type"),
          httpResponse.getHeaders(), this.mimeTypes);

      // success: hand the content back as is
      if (statusCode == 200) {
        return new ProtocolOutput(page);
      }

      // redirects: resolve the target and leave the handling to the caller
      if (statusCode >= 300 && statusCode < 400) {
        String redirectTarget = httpResponse.getHeader("Location");
        if (redirectTarget == null) {
          // some broken servers, such as MS IIS, use lowercase header name
          redirectTarget = httpResponse.getHeader("location");
        }
        if (redirectTarget == null) {
          redirectTarget = "";
        }
        target = new URL(target, redirectTarget);

        final int redirectStatus;
        switch (statusCode) {
        case 302: // found (temporarily moved)
        case 303: // see other (redirect after POST)
        case 307: // temporary redirect
          redirectStatus = ProtocolStatus.TEMP_MOVED;
          break;
        case 304: // not modified
          redirectStatus = ProtocolStatus.NOTMODIFIED;
          break;
        default:
          // 300 (multiple choices), 301 (moved permanently), 305 (use
          // proxy) and every remaining 3xx code
          redirectStatus = ProtocolStatus.MOVED;
          break;
        }
        return new ProtocolOutput(page,
            new ProtocolStatus(redirectStatus, target));
      }

      // everything that is neither success nor redirect
      switch (statusCode) {
      case 400: // bad request, mark as GONE
        if (this.logger.isTraceEnabled()) {
          this.logger.trace("400 Bad request: {}", target);
        }
        return new ProtocolOutput(page,
            new ProtocolStatus(ProtocolStatus.GONE, target));
      case 401: // requires authorization, but no valid auth provided
        if (this.logger.isTraceEnabled()) {
          this.logger.trace("401 Authentication Required");
        }
        return new ProtocolOutput(page,
            new ProtocolStatus(ProtocolStatus.ACCESS_DENIED,
                "Authentication required: " + requestedUrl));
      case 404:
        return new ProtocolOutput(page,
            new ProtocolStatus(ProtocolStatus.NOTFOUND, target));
      case 410: // permanently GONE
        return new ProtocolOutput(page, new ProtocolStatus(ProtocolStatus.GONE,
            "Http: " + statusCode + " url=" + target));
      default:
        return new ProtocolOutput(page,
            new ProtocolStatus(ProtocolStatus.EXCEPTION,
                "Http code=" + statusCode + ", url=" + target));
      }
    } catch (Throwable e) {
      // the short form (class name only) is used when debug logging is off
      // and logUtil reports the throwable as one to be logged briefly
      final boolean shortForm = !this.logger.isDebugEnabled()
          && this.logUtil.logShort(e);
      if (shortForm) {
        this.logger.error("Failed to get protocol output: {}",
            e.getClass().getName());
      } else {
        this.logger.error("Failed to get protocol output", e);
      }
      return new ProtocolOutput(null, new ProtocolStatus(e));
    }
  }