run

in lib/crawler/http_executor.rb [40:102]


    # Executes the HTTP request(s) for a crawl task and returns the resulting crawl result.
    #
    # Each loop iteration handles one URL: an optional HEAD probe, a GET when needed,
    # redirect handling, and a content-type check. When +follow_redirects+ is true, a
    # redirect replaces +crawl_task+ with a new task for the redirect target and the
    # loop starts over; otherwise the method returns on the first iteration.
    #
    # @param crawl_task [Crawler::Data::CrawlTask] the task to execute; its +url+,
    #   +redirect_chain+, +type+ and +depth+ are read here
    # @param follow_redirects [Boolean] whether to keep requesting redirect targets
    #   (bounded by +config.max_redirects+) instead of returning the redirect response
    # @return [Object] one of: an +HttpAuthDisallowedError+ result, a +RedirectError+ result,
    #   the value of +unsupported_content_type+, or the value of +generate_crawl_result+
    #   (NOTE(review): +handling_http_errors+ presumably maps raised HTTP errors to a
    #   result as well - confirm against its definition)
    def run(crawl_task, follow_redirects: false)
      handling_http_errors(crawl_task) do
        loop do
          # Checked on every iteration because a followed redirect may point at a URL
          # that carries credentials even if the original one did not.
          if crawl_task.http_url_with_auth? && !config.http_auth_allowed
            return Crawler::Data::CrawlResult::HttpAuthDisallowedError.new(url: crawl_task.url)
          end

          # Probe with HEAD first when enabled; go straight to GET when HEAD is disabled
          # or the HEAD response reports an error.
          head_response = config.head_requests_enabled ? head_request(crawl_task) : nil
          get_response = head_response.nil? || head_response.error? ? get_request(crawl_task) : nil
          response = get_response || head_response

          if response.redirect?
            response.release_connection

            # The chain length counts this redirect too, and the limit applies even
            # when redirects are not being followed.
            redirect_count = crawl_task.redirect_chain.size + 1
            if redirect_count > config.max_redirects
              error = <<~LOG.squish
                Not following the HTTP redirect from '#{crawl_task.url}'
                to '#{response.redirect_location}' because the redirect chain
                is too long (#{redirect_count} > #{config.max_redirects}).
              LOG
              logger.warn(error)

              return Crawler::Data::CrawlResult::RedirectError.new(
                url: crawl_task.url,
                error:
              )
            end

            # Restart the loop with a task for the redirect target, recording the
            # current URL in the chain so the limit above keeps being enforced.
            if follow_redirects
              logger.info("Following the redirect from '#{crawl_task.url}' to '#{response.redirect_location}'...")
              crawl_task = Crawler::Data::CrawlTask.new(
                url: response.redirect_location,
                redirect_chain: crawl_task.redirect_chain + [crawl_task.url],
                type: crawl_task.type,
                depth: crawl_task.depth
              )
              next
            end
          end

          # Reject content types that cannot be extracted. robots.txt tasks and
          # (unfollowed) redirect responses are exempt from this check.
          if !crawl_task.robots_txt? && !response.redirect? && !extractable_content.include?(response.mime_type)
            response.release_connection

            return unsupported_content_type(crawl_task, response)
          end

          # Only a HEAD response exists at this point if the probe succeeded, so issue
          # the GET now; the crawl result is always built from a GET response.
          get_response = get_request(crawl_task) if get_response.nil?
          return generate_crawl_result(
            crawl_task:,
            response: get_response
          )
        end
      end
    end