public static String filterNormalize()

in src/java/org/apache/nutch/parse/ParseOutputFormat.java [401:467]


  /**
   * Decides whether the link {@code fromUrl -> toUrl} should be kept and, if
   * so, returns the normalized and filtered target URL.
   *
   * <p>
   * The link is dropped ({@code null} is returned) when:
   * <ul>
   * <li>{@code toUrl} is {@code null} or equal to {@code fromUrl};</li>
   * <li>internal/external link checking is enabled and {@code toUrl} is not a
   * well-formed URL;</li>
   * <li>{@code ignoreExternalLinks} is set and the target's host (or domain,
   * in {@code "bydomain"} mode) differs from {@code origin}. In host mode an
   * external link is still kept when {@code exemptionFilters} reports it as
   * exempted;</li>
   * <li>{@code ignoreInternalLinks} is set and the target's host (or domain)
   * equals {@code origin};</li>
   * <li>normalizing or filtering yields {@code null} or throws.</li>
   * </ul>
   *
   * @param fromUrl
   *          URL of the page containing the link
   * @param toUrl
   *          link target to check
   * @param origin
   *          host or domain the target is compared against; presumably
   *          already lower-cased by the caller - TODO confirm
   * @param ignoreInternalLinks
   *          drop links whose host/domain equals {@code origin}
   * @param ignoreExternalLinks
   *          drop links whose host/domain differs from {@code origin}
   * @param ignoreExternalLinksMode
   *          {@code "bydomain"} (case-insensitive) compares domain names; any
   *          other value compares hosts. Also used for the internal check.
   * @param filters
   *          URL filters to apply, or {@code null} to skip filtering
   * @param exemptionFilters
   *          exemption rules for external links in host mode, or
   *          {@code null} for no exemptions
   * @param normalizers
   *          URL normalizers to apply, or {@code null} to skip normalizing
   * @param urlNormalizerScope
   *          scope passed to the normalizers
   * @return the normalized and filtered target URL, or {@code null} if the
   *         link must be skipped
   */
  public static String filterNormalize(String fromUrl, String toUrl,
      String origin, boolean ignoreInternalLinks, boolean ignoreExternalLinks,
      String ignoreExternalLinksMode, URLFilters filters,
      URLExemptionFilters exemptionFilters, URLNormalizers normalizers,
      String urlNormalizerScope) {
    // nothing to keep if there is no target at all
    if (toUrl == null) {
      return null;
    }
    // ignore links to self (or anchors within the page)
    if (fromUrl.equals(toUrl)) {
      return null;
    }

    if (ignoreExternalLinks || ignoreInternalLinks) {
      URL targetURL;
      try {
        targetURL = new URL(toUrl);
      } catch (MalformedURLException e) {
        return null; // skip it
      }

      boolean byDomain = "bydomain".equalsIgnoreCase(ignoreExternalLinksMode);
      String target = hostOrDomainOf(targetURL, byDomain);
      if (target == null) {
        // cannot tell internal from external: skip instead of failing
        return null;
      }
      boolean internal = target.equals(origin);

      if (ignoreExternalLinks && !internal) {
        // exemptions are only consulted in host mode; in domain mode every
        // external link is dropped
        if (byDomain || exemptionFilters == null
            || !exemptionFilters.isExempted(fromUrl, toUrl)) {
          return null; // skip it, this external url is not exempted
        }
      }
      if (ignoreInternalLinks && internal) {
        return null; // skip it
      }
    }

    try {
      if (normalizers != null) {
        toUrl = normalizers.normalize(toUrl, urlNormalizerScope);
      }
      if (filters != null) {
        toUrl = filters.filter(toUrl);
      }
    } catch (Exception e) {
      // best effort: a link that cannot be normalized or filtered is dropped
      return null;
    }

    // may be null if a normalizer or filter rejected the url
    return toUrl;
  }

  /**
   * Returns the lower-cased domain name ({@code byDomain == true}) or host of
   * {@code targetURL}, or {@code null} if the lookup yields {@code null}.
   */
  private static String hostOrDomainOf(URL targetURL, boolean byDomain) {
    String value = byDomain ? URLUtil.getDomainName(targetURL)
        : targetURL.getHost();
    // NOTE(review): default-locale lower-casing is kept on purpose so the
    // result stays comparable with however the caller derived 'origin';
    // confirm before switching both sides to Locale.ROOT.
    return value == null ? null : value.toLowerCase();
  }