public String normalize()

in src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java [171:260]


  /**
   * Normalizes a URL string.
   *
   * <p>
   * Leading and trailing whitespace is removed first. An empty (or
   * whitespace-only) string is returned as the empty string without further
   * processing. Otherwise the string is parsed as a {@link URL} and, if any
   * normalization step changes one of its components, the URL is recomposed
   * from protocol, host, port and file; if nothing changes, the trimmed input
   * is returned as is.
   * </p>
   *
   * <p>
   * For the protocols <code>http</code>, <code>https</code> and
   * <code>ftp</code>:
   * </p>
   * <ul>
   * <li>the host is passed through <code>normalizeHostName</code>,</li>
   * <li>a port equal to the protocol's default port is dropped,</li>
   * <li>an empty file becomes <code>/</code> and a file not starting with
   * <code>/</code> gets one prepended,</li>
   * <li>a reference (fragment) is removed, because the recomposed URL is built
   * from protocol, host, port and file only.</li>
   * </ul>
   *
   * <p>
   * For all protocols the file part is unescaped and re-escaped via
   * <code>unescapePath</code> / <code>escapePath</code>. For
   * <code>http</code>, <code>https</code>, <code>ftp</code> and
   * <code>file</code> the path is additionally passed through
   * <code>getFileWithNormalizedPath</code>, unless the file was empty or did
   * not start with <code>/</code>.
   * </p>
   *
   * @param urlString
   *          the URL to normalize; surrounding whitespace is ignored
   * @param scope
   *          not used by this method
   * @return the normalized URL string, or the empty string if the input is
   *         empty after trimming
   * @throws MalformedURLException
   *           if the trimmed, non-empty string cannot be parsed as a URL or
   *           the recomposed URL is invalid
   */
  public String normalize(String urlString, String scope)
      throws MalformedURLException {

    // Trim before the empty check: otherwise a whitespace-only string slips
    // past the check, is trimmed to "" and fails in the URL constructor
    // although empty input is explicitly permitted.
    urlString = urlString.trim(); // remove extra spaces

    if ("".equals(urlString)) { // permit empty
      return urlString;
    }

    URL url = new URL(urlString);

    String protocol = url.getProtocol();
    String host = url.getHost();
    int port = url.getPort();
    String file = url.getFile();

    // set as soon as any component differs from the input, so that the URL
    // is only recomposed when necessary
    boolean changed = false;
    boolean normalizePath = false;

    if (!urlString.startsWith(protocol)) { // protocol was lowercased
      changed = true;
    }

    if ("http".equals(protocol) || "https".equals(protocol)
        || "ftp".equals(protocol)) {

      if (host != null && url.getAuthority() != null) {
        String newHost = normalizeHostName(host);
        if (!host.equals(newHost)) {
          host = newHost;
          changed = true;
        } else if (!url.getAuthority().equals(newHost)) {
          // authority (http://<...>/) contains other elements (port, user,
          // etc.) which will likely cause a change if left away
          changed = true;
        }
      } else {
        // no host or authority: recompose the URL from components
        changed = true;
      }

      if (port == url.getDefaultPort()) { // uses default port
        port = -1; // so don't specify it
        changed = true;
      }

      normalizePath = true;
      if (file == null || "".equals(file)) {
        file = "/";
        changed = true;
        normalizePath = false; // no further path normalization required
      } else if (!file.startsWith("/")) {
        file = "/" + file;
        changed = true;
        normalizePath = false; // no further path normalization required
      }

      if (url.getRef() != null) { // remove the ref
        // the ref is dropped implicitly: the recomposed URL below is built
        // without it
        changed = true;
      }

    } else if ("file".equals(protocol)) {
      normalizePath = true;
    }

    // properly encode characters in path/file using percent-encoding
    String file2 = unescapePath(file);
    file2 = escapePath(file2);
    if (!file.equals(file2)) {
      changed = true;
      file = file2;
    }

    if (normalizePath) {
      // check for unnecessary use of "/../", "/./", and "//"
      if (changed) {
        // path normalization works on a URL object, so make it reflect the
        // components as modified so far
        url = new URL(protocol, host, port, file);
      }
      file2 = getFileWithNormalizedPath(url);
      if (!file.equals(file2)) {
        changed = true;
        file = file2;
      }
    }

    if (changed) {
      url = new URL(protocol, host, port, file);
      urlString = url.toString();
    }

    return urlString;
  }