in lib/crawler/url_validator/url_content_check_concern.rb [12:54]
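# Validates the content at the URL being checked: fetches the page if it has not been
# crawled yet, then records validation results for the extracted content and outgoing links.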
def validate_url_content
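  # Perform the validation request if the URL has not been crawled yet, then bail out early:
  # a redirect produces a warning, and anything that is not an HTML crawl result is treated
  # as a crawl error and fails the check.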
  validate_url_request unless url_crawl_result

  return validation_warn_from_crawl_redirect if url_crawl_result.redirect?
  return validation_fail_from_crawl_error unless url_crawl_result.is_a?(Crawler::Data::CrawlResult::HTML)
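
  # Report whether the page body yields any indexable content, including a few extracted
  # fields (title, meta keywords, meta description, body size) on success.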
  body = url_crawl_result.document_body
  if body.empty?
    warning = "The web page at #{url} did not return enough content to index."
    validation_warn(:url_content, warning)
  else
    validation_ok(
      :url_content,
      "Successfully extracted some content from #{url}.",
      title: url_crawl_result.document_title(limit: crawler_api_config.max_title_size),
      keywords: url_crawl_result.meta_keywords(limit: crawler_api_config.max_keywords_size),
      description: url_crawl_result.meta_description(limit: crawler_api_config.max_description_size),
      body_size_bytes: body.bytesize
    )
  end
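
  # Report whether the page exposes any outgoing links the crawler could follow,
  # including a sample of up to 10 links on success.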
  links = url_crawl_result.links(limit: 10)
  if links.any?
    validation_ok(
      :url_content,
      "Successfully extracted some links from #{url}.",
      links_sample: links
    )
  else
    validation_warn(:url_content, <<~MESSAGE)
      The web page at #{url} has no links the crawler could follow (except, possibly, links
      that have 'rel="nofollow"' set).
      This means we will have no content to index other than this one page.
    MESSAGE
  end
end