in lib/crawler/url_validator/robots_txt_check_concern.rb [12:73]
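# Fetches /robots.txt for the validated URL's domain and maps the result to a
# validation outcome: ok, warn, or fail.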
def validate_robots_txt
  crawl_result = http_executor.run(
    Crawler::Data::CrawlTask.new(
      url: url.join('/robots.txt'),
      type: :robots_txt,
      depth: 1
    ),
    follow_redirects: true
  )
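  # The fetch could not be completed because of a redirect error; warn and
  # treat the file as missing.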
  if crawl_result.is_a?(Crawler::Data::CrawlResult::RedirectError)
    return validation_warn(:robots_txt, <<~MESSAGE)
      Our attempt at fetching a robots.txt file failed with a redirect error.
      The crawler will proceed as if the file does not exist for this domain.
    MESSAGE
  end
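  # A 404 simply means the domain has no robots.txt, which is not a problem.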
  return validation_ok(:robots_txt, "No robots.txt found for #{url}.") if crawl_result.status_code == 404
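  # A 599 status indicates the request itself could not be completed.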
  if crawl_result.status_code == 599
    return validation_fail(:robots_txt, <<~MESSAGE)
      Failed to fetch robots.txt: the request could not be completed.
    MESSAGE
  end
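  # Any other 5xx response is treated as a transient server-side error.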
  if crawl_result.status_code >= 500
    return validation_fail(:robots_txt, <<~MESSAGE)
      Transient error fetching robots.txt: HTTP #{crawl_result.status_code}.
      We could not proceed with crawling this site.
    MESSAGE
  end
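  # Register the fetched robots.txt with the service that parses its rules
  # for this domain.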
  robots_txt = Crawler::RobotsTxtService.new(user_agent: crawler_api_config.user_agent)
  robots_txt.register_crawl_result(url.domain, crawl_result)
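  # Evaluate the parsed rules: full access, partial restrictions, or the
  # validated URL being disallowed outright.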
  if robots_txt.parser_for_domain(url.domain).allow_all?
    validation_ok(:robots_txt, 'Found a robots.txt and it allows us full access to the domain.')
  else
    robots_outcome = robots_txt.url_disallowed_outcome(url)
    if robots_outcome.allowed?
      validation_warn(:robots_txt, <<~MESSAGE)
        Found a robots.txt file at #{url.join('/robots.txt')} with rules
        that may affect content indexing.
      MESSAGE
    else
      validation_fail(:robots_txt, <<~MESSAGE)
        Found a robots.txt file at #{url.join('/robots.txt')} and it disallows access to #{url}.
      MESSAGE
    end
  end
end