# lib/crawler/http_executor.rb [40:102]
# Executes the HTTP request(s) for a crawl task and returns a crawl result.
#
# Flow:
# - Rejects URLs embedding HTTP auth credentials unless allowed by config.
# - Optionally issues a HEAD request first (when enabled in config) so that
#   unsupported content types can be skipped without downloading bodies;
#   falls back to a GET when HEAD is disabled or errored.
# - On a redirect response: either loops with a new crawl task pointing at
#   the redirect target (when +follow_redirects+ is true and the chain is
#   within +config.max_redirects+), or returns an error/redirect result.
#
# @param crawl_task [Crawler::Data::CrawlTask] the task describing the URL to fetch
# @param follow_redirects [Boolean] whether to transparently follow HTTP redirects
# @return [Crawler::Data::CrawlResult] success, redirect, or error result
def run(crawl_task, follow_redirects: false)
  handling_http_errors(crawl_task) do
    loop do
      # Refuse URLs with inline basic-auth credentials unless explicitly allowed.
      if crawl_task.http_url_with_auth? && !config.http_auth_allowed
        return Crawler::Data::CrawlResult::HttpAuthDisallowedError.new(url: crawl_task.url)
      end

      # Cheap HEAD probe first (when enabled); full GET if HEAD is off or failed.
      head_response = config.head_requests_enabled ? head_request(crawl_task) : nil
      get_response = head_response.nil? || head_response.error? ? get_request(crawl_task) : nil
      response = get_response || head_response

      if response.redirect?
        response.release_connection

        # This redirect counts on top of the chain accumulated so far.
        redirect_count = crawl_task.redirect_chain.size + 1
        if redirect_count > config.max_redirects
          # NOTE(review): the original interpolations were lost in this message;
          # reconstructed from the surrounding fragments — confirm wording.
          error = <<~LOG.squish
            Not following the HTTP redirect from '#{crawl_task.url}'
            to '#{response.redirect_location}' because the redirect chain
            is too long (#{redirect_count} redirects)
          LOG
          logger.warn(error)
          return Crawler::Data::CrawlResult::RedirectError.new(
            url: crawl_task.url,
            error:
          )
        end

        if follow_redirects
          logger.info("Following the redirect from '#{crawl_task.url}' to '#{response.redirect_location}'...")
          # Restart the loop with a new task for the redirect target,
          # preserving type/depth and extending the redirect chain.
          crawl_task = Crawler::Data::CrawlTask.new(
            url: response.redirect_location,
            redirect_chain: crawl_task.redirect_chain + [crawl_task.url],
            type: crawl_task.type,
            depth: crawl_task.depth
          )
          next
        end
      end

      # Skip bodies we cannot extract content from (robots.txt is always accepted).
      if !crawl_task.robots_txt? && !response.redirect? && !extractable_content.include?(response.mime_type)
        response.release_connection
        return unsupported_content_type(crawl_task, response)
      end

      # If only a HEAD has been issued so far, perform the GET to fetch the body.
      get_response = get_request(crawl_task) if get_response.nil?
      return generate_crawl_result(
        crawl_task:,
        response: get_response
      )
    end
  end
end