in lib/crawler/coordinator.rb [174:201]
def load_robots_txt(domain)
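  # Build a crawl task for the domain's robots.txt URL and fetch it,
  # following redirects and sending any configured Authorization header.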
  crawl_task = Crawler::Data::CrawlTask.new(
    url: domain.robots_txt_url,
    type: :robots_txt,
    depth: 1
  )
  crawl_task.authorization_header = config.http_header_service.authorization_header_for_url(crawl_task.url)
  crawl_result = execute_task(crawl_task, follow_redirects: true)
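
  # Treat a robots.txt redirect error as a 404 response (as if the file did not exist).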
  if crawl_result.is_a?(Crawler::Data::CrawlResult::RedirectError)
    system_logger.warn(
      "Treating a robots.txt redirect error for #{domain} as a 404 response: #{crawl_result.error}"
    )
    crawl_result = Crawler::Data::CrawlResult::Error.new(
      url: crawl_result.url,
      error: crawl_result.error,
      status_code: 404
    )
  elsif crawl_result.error?
    system_logger.warn("Error while fetching robots.txt for #{domain}: #{crawl_result.error}")
  else
    system_logger.debug("Fetched robots.txt for #{domain} from '#{crawl_result.url}'")
  end

  crawl_result
end