validate_url_content

in lib/crawler/url_validator/url_content_check_concern.rb [12:54]


    # Validates the content of the crawl result for the URL being checked.
    #
    # Lazily performs the crawl request if no result is available yet, then
    # records validation outcomes for the page:
    #   - warns (and returns early) when the crawl result is a redirect
    #   - fails (and returns early) when the result is not an HTML document
    #   - warns when the extracted document body is empty; otherwise records
    #     extracted title/keywords/description metadata and the body size
    #   - warns when no crawlable links were found, since the crawl would have
    #     no content to index beyond this single page
    def validate_url_content
      # Perform the crawl request on demand if it has not been done yet
      validate_url_request unless url_crawl_result

      # Redirects carry no content to validate; surface them as a warning
      return validation_warn_from_crawl_redirect if url_crawl_result.redirect?

      # Only HTML results support content extraction below
      return validation_fail_from_crawl_error unless url_crawl_result.is_a?(Crawler::Data::CrawlResult::HTML)

      # Validate the extracted document body
      body = url_crawl_result.document_body
      if body.empty?
        warning = "The web page at #{url} did not return enough content to index."
        validation_warn(:url_content, warning)
      else
        validation_ok(
          :url_content,
          "Successfully extracted some content from #{url}.",
          title: url_crawl_result.document_title(limit: crawler_api_config.max_title_size),
          keywords: url_crawl_result.meta_keywords(limit: crawler_api_config.max_keywords_size),
          description: url_crawl_result.meta_description(limit: crawler_api_config.max_description_size),
          body_size_bytes: body.bytesize
        )
      end

      # Validate outgoing links (a small sample is enough to prove crawlability)
      links = url_crawl_result.links(limit: 10)
      if links.any?
        validation_ok(
          :url_content,
          "Successfully extracted some links from #{url}.",
          links_sample: links
        )
      else
        # NOTE(review): the original heredoc was truncated mid-sentence
        # ("The web page at \nthat have 'rel=\"nofollow\"' set)."); message
        # reconstructed to interpolate the URL like the sibling messages above.
        validation_warn(:url_content, <<~MESSAGE)
          The web page at #{url} did not contain any links we could crawl (excluding links that have 'rel="nofollow"' set).
          This means we will have no content to index other than this one page.
        MESSAGE
      end
    end