public ParseResult getParse()

in src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java [147:265]


  public ParseResult getParse(Content content) {
    HTMLMetaTags metaTags = new HTMLMetaTags();

    URL base;
    try {
      base = new URL(content.getBaseUrl());
    } catch (MalformedURLException e) {
      return new ParseStatus(e)
          .getEmptyParseResult(content.getUrl(), getConf());
    }

    String text = "";
    String title = "";
    Outlink[] outlinks = new Outlink[0];
    Metadata metadata = new Metadata();

    // parse the content
    DocumentFragment root;
    try {
      byte[] contentInOctets = content.getContent();
      InputSource input = new InputSource(new ByteArrayInputStream(
          contentInOctets));

      EncodingDetector detector = new EncodingDetector(conf);
      detector.autoDetectClues(content, true);
      detector.addClue(sniffCharacterEncoding(contentInOctets), "sniffed");
      String encoding = detector.guessEncoding(content, defaultCharEncoding);

      metadata.set(Metadata.ORIGINAL_CHAR_ENCODING, encoding);
      metadata.set(Metadata.CHAR_ENCODING_FOR_CONVERSION, encoding);

      input.setEncoding(encoding);
      if (LOG.isTraceEnabled()) {
        LOG.trace("Parsing...");
      }
      root = parse(input);
    } catch (IOException e) {
      return new ParseStatus(e)
          .getEmptyParseResult(content.getUrl(), getConf());
    } catch (DOMException e) {
      return new ParseStatus(e)
          .getEmptyParseResult(content.getUrl(), getConf());
    } catch (SAXException e) {
      return new ParseStatus(e)
          .getEmptyParseResult(content.getUrl(), getConf());
    } catch (Exception e) {
      LOG.error("Error: ", e);
      return new ParseStatus(e)
          .getEmptyParseResult(content.getUrl(), getConf());
    }

    // get meta directives
    HTMLMetaProcessor.getMetaTags(metaTags, root, base);

    // populate Nutch metadata with HTML meta directives
    metadata.addAll(metaTags.getGeneralTags());

    if (LOG.isTraceEnabled()) {
      LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
    }
    // check meta directives
    if (!metaTags.getNoIndex()) { // okay to index
      StringBuffer sb = new StringBuffer();
      if (LOG.isTraceEnabled()) {
        LOG.trace("Getting text...");
      }
      utils.getText(sb, root); // extract text
      text = sb.toString();
      sb.setLength(0);
      if (LOG.isTraceEnabled()) {
        LOG.trace("Getting title...");
      }
      utils.getTitle(sb, root); // extract title
      title = sb.toString().trim();
    }

    if (!metaTags.getNoFollow()) { // okay to follow links
      ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks
      URL baseTag = base;
      String baseTagHref = utils.getBase(root);
      if (baseTagHref != null) {
        try {
          baseTag = new URL(base, baseTagHref);
        } catch (MalformedURLException e) {
          baseTag = base;
        }
      }
      if (LOG.isTraceEnabled()) {
        LOG.trace("Getting links...");
      }
      utils.getOutlinks(baseTag, l, root);
      outlinks = l.toArray(new Outlink[l.size()]);
      if (LOG.isTraceEnabled()) {
        LOG.trace("found " + outlinks.length + " outlinks in "
            + content.getUrl());
      }
    }

    ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
    if (metaTags.getRefresh()) {
      status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
      status.setArgs(new String[] { metaTags.getRefreshHref().toString(),
          Integer.toString(metaTags.getRefreshTime()) });
    }
    ParseData parseData = new ParseData(status, title, outlinks,
        content.getMetadata(), metadata);
    ParseResult parseResult = ParseResult.createParseResult(content.getUrl(),
        new ParseImpl(text, parseData));

    // run filters on parse
    ParseResult filteredParse = this.htmlParseFilters.filter(content,
        parseResult, metaTags, root);
    if (metaTags.getNoCache()) { // not okay to cache
      for (Map.Entry<org.apache.hadoop.io.Text, Parse> entry : filteredParse)
        entry.getValue().getData().getParseMeta()
            .set(Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy);
    }
    return filteredParse;
  }