private List readConfiguration()

in src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java [227:288]


  /**
   * Parses a regex-normalize XML document into a list of rewrite rules.
   *
   * <p>
   * Every element child of the document root is treated as a rule container
   * (expected tag: {@code <regex>}). Within it, the text of the
   * {@code <pattern>} child is compiled as the rule's regular expression and
   * the text of the {@code <substitution>} child becomes its replacement. An
   * empty {@code <substitution>} element yields the empty string as
   * replacement. Only the first child node of a field is read, and child
   * elements with any other tag name are ignored.
   * </p>
   *
   * <p>
   * A rule is skipped (and the remaining rules are still read) when its
   * pattern or substitution is missing, when its pattern is not a valid
   * regular expression, or when a field does not start with a text node.
   * Unexpected tag names for the root or for a rule container are logged but
   * do not stop processing.
   * </p>
   *
   * @param reader
   *          source of the XML configuration
   * @return the rules in document order, or {@code EMPTY_RULES} if the
   *         document could not be parsed or contained no usable rule
   */
  private List<Rule> readConfiguration(Reader reader) {
    List<Rule> rules = new ArrayList<Rule>();
    try {

      // borrowed heavily from code in Configuration.java
      Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder()
          .parse(new InputSource(reader));
      Element root = doc.getDocumentElement();
      if ((!"regex-normalize".equals(root.getTagName()))
          && (LOG.isErrorEnabled())) {
        LOG.error("bad conf file: top-level element not <regex-normalize>");
      }
      NodeList regexes = root.getChildNodes();
      for (int i = 0; i < regexes.getLength(); i++) {
        Node regexNode = regexes.item(i);
        if (!(regexNode instanceof Element)) {
          continue; // whitespace, comments, ...
        }
        Element regex = (Element) regexNode;
        // Lenient on purpose: a wrongly named container is reported but its
        // fields are still read.
        if ((!"regex".equals(regex.getTagName())) && (LOG.isWarnEnabled())) {
          LOG.warn("bad conf file: element not <regex>");
        }
        NodeList fields = regex.getChildNodes();
        String patternValue = null;
        String subValue = null;
        for (int j = 0; j < fields.getLength(); j++) {
          Node fieldNode = fields.item(j);
          if (!(fieldNode instanceof Element)) {
            continue;
          }
          Element field = (Element) fieldNode;
          String tag = field.getTagName();
          boolean isPattern = "pattern".equals(tag);
          boolean isSubstitution = "substitution".equals(tag);
          if (!isPattern && !isSubstitution) {
            // Unknown fields must not influence the rule; in particular an
            // empty unknown element must not wipe the substitution.
            continue;
          }
          Node first = field.getFirstChild();
          if (first == null) {
            // Only an empty <substitution/> has a meaning ("replace with
            // nothing"); an empty <pattern/> leaves the pattern unset.
            if (isSubstitution) {
              subValue = "";
            }
            continue;
          }
          if (!(first instanceof Text)) {
            // e.g. a leading comment or nested element: skip this field
            // instead of failing the whole file with a ClassCastException.
            if (LOG.isWarnEnabled()) {
              LOG.warn("bad conf file: <" + tag
                  + "> does not start with text, field ignored");
            }
            continue;
          }
          String data = ((Text) first).getData();
          if (isPattern) {
            patternValue = data;
          } else {
            subValue = data;
          }
        }
        if (patternValue != null && subValue != null) {
          Rule rule = new Rule();
          try {
            rule.pattern = Pattern.compile(patternValue);
          } catch (PatternSyntaxException e) {
            if (LOG.isErrorEnabled()) {
              LOG.error("skipped rule: " + patternValue + " -> " + subValue
                  + " : invalid regular expression pattern: " + e);
            }
            continue;
          }
          rule.substitution = subValue;
          rules.add(rule);
        }
      }
    } catch (Exception e) {
      if (LOG.isErrorEnabled()) {
        // Pass the throwable as well so the stack trace is not lost.
        LOG.error("error parsing conf file: " + e, e);
      }
      return EMPTY_RULES;
    }
    if (rules.isEmpty()) {
      return EMPTY_RULES;
    }
    return rules;
  }