private static final void getMetaTagsHelper()

in src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java [57:196]


  /**
   * Recursively walks the DOM subtree rooted at {@code node} and records what
   * its {@code <meta>} and {@code <base>} elements declare into
   * {@code metaTags}.
   *
   * <p>
   * The walk does not descend into a {@code <body>} element. Element names
   * are compared case-insensitively.
   *
   * @param metaTags
   *          accumulator that receives the collected tags and flags
   * @param node
   *          root of the subtree to inspect
   * @param currURL
   *          URL used to resolve relative URLs; when {@code null}, URL strings
   *          are parsed on their own
   */
  private static final void getMetaTagsHelper(HTMLMetaTags metaTags, Node node,
      URL currURL) {

    if (node.getNodeType() == Node.ELEMENT_NODE) {
      String elementName = node.getNodeName();

      if ("body".equalsIgnoreCase(elementName)) {
        // META tags should not be under body, so the subtree is skipped
        return;
      }

      if ("meta".equalsIgnoreCase(elementName)) {
        collectMetaElement(metaTags, node, currURL);
      } else if ("base".equalsIgnoreCase(elementName)) {
        collectBaseElement(metaTags, node, currURL);
      }
    }

    NodeList children = node.getChildNodes();
    if (children != null) {
      int len = children.getLength();
      for (int i = 0; i < len; i++) {
        getMetaTagsHelper(metaTags, children.item(i), currURL);
      }
    }
  }

  /**
   * Records a single {@code <meta>} element: its {@code name}/{@code content}
   * pair goes to the general tags, its {@code http-equiv}/{@code content} pair
   * to the http-equiv tags, and well-known names additionally set flags on
   * {@code metaTags}. An element without a {@code content} attribute is
   * ignored.
   */
  private static void collectMetaElement(HTMLMetaTags metaTags, Node meta,
      URL currURL) {
    NamedNodeMap attrs = meta.getAttributes();
    Node nameNode = null;
    Node equivNode = null;
    Node contentNode = null;
    // Retrieves name, http-equiv and content attributes (case-insensitive)
    for (int i = 0; i < attrs.getLength(); i++) {
      Node attr = attrs.item(i);
      String attrName = attr.getNodeName().toLowerCase(Locale.ROOT);
      if (attrName.equals("name")) {
        nameNode = attr;
      } else if (attrName.equals("http-equiv")) {
        equivNode = attr;
      } else if (attrName.equals("content")) {
        contentNode = attr;
      }
    }

    if (contentNode == null) {
      // neither the name nor the http-equiv form carries a value
      return;
    }
    String rawContent = contentNode.getNodeValue();

    if (nameNode != null) {
      String name = nameNode.getNodeValue().toLowerCase(Locale.ROOT);
      metaTags.getGeneralTags().add(name, rawContent);
      if (Nutch.ROBOTS_METATAG.equals(name)) {
        applyRobotsDirectives(metaTags, rawContent.toLowerCase(Locale.ROOT));
      }
      // meta names added/transformed by Tika
      else if (name.equals("pragma")) {
        if (rawContent.toLowerCase(Locale.ROOT).contains("no-cache")) {
          metaTags.setNoCache();
        }
      } else if (name.equals("refresh")) {
        // NOTE(review): the content is lower-cased here, whereas the
        // http-equiv branch below hands it to setRefresh unchanged. If
        // setRefresh extracts a URL from it, that URL loses its case on this
        // path -- confirm whether this is intended.
        setRefresh(metaTags, rawContent.toLowerCase(Locale.ROOT), currURL);
      } else if (name.equals("content-location")) {
        URL url = resolveUrl(currURL, rawContent);
        if (url != null) {
          metaTags.setBaseHref(url);
        }
        // otherwise: malformed URL, base-href not set
      }
    }

    if (equivNode != null) {
      String name = equivNode.getNodeValue().toLowerCase(Locale.ROOT);
      metaTags.getHttpEquivTags().setProperty(name, rawContent);
      if ("pragma".equals(name)) {
        if (rawContent.toLowerCase(Locale.ROOT).contains("no-cache")) {
          metaTags.setNoCache();
        }
      } else if ("refresh".equals(name)) {
        setRefresh(metaTags, rawContent, currURL);
      }
    }
  }

  /**
   * Sets the no-index / no-follow / no-cache flags for the given robots
   * directives, which must already be lower-cased. Directives are detected by
   * substring search, not by tokenizing the value.
   */
  private static void applyRobotsDirectives(HTMLMetaTags metaTags,
      String directives) {
    if (directives.contains("none")) {
      metaTags.setNoIndex();
      metaTags.setNoFollow();
    }
    // "all" needs no handling: it sets none of the flags

    if (directives.contains("noindex")) {
      metaTags.setNoIndex();
    }
    if (directives.contains("nofollow")) {
      metaTags.setNoFollow();
    }
    if (directives.contains("noarchive")) {
      metaTags.setNoCache();
    }
  }

  /**
   * Records the {@code href} of a {@code <base>} element as the base href,
   * provided the attribute exists and yields a well-formed URL.
   */
  private static void collectBaseElement(HTMLMetaTags metaTags, Node base,
      URL currURL) {
    Node hrefNode = base.getAttributes().getNamedItem("href");
    if (hrefNode == null) {
      return;
    }
    URL url = resolveUrl(currURL, hrefNode.getNodeValue());
    if (url != null) {
      metaTags.setBaseHref(url);
    }
  }

  /**
   * Builds a URL from {@code spec}, resolving it against {@code context} when
   * one is given.
   *
   * @return the URL, or {@code null} if {@code spec} is malformed
   */
  private static URL resolveUrl(URL context, String spec) {
    try {
      return (context == null) ? new URL(spec) : new URL(context, spec);
    } catch (MalformedURLException ignored) {
      // best effort: callers simply leave the base href unset
      return null;
    }
  }