validate_robots_txt

in lib/crawler/url_validator/robots_txt_check_concern.rb [12:73]

Validates a domain's robots.txt as part of URL validation: it fetches /robots.txt (following redirects), downgrades redirect errors to a warning, treats a 404 as full access, fails on fetch errors and other 5xx responses, and otherwise parses the file to report whether the crawler's user agent may crawl the URL.

    def validate_robots_txt 
      
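      # Fetch /robots.txt for the domain as a dedicated crawl task, following redirects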
      crawl_result = http_executor.run(
        Crawler::Data::CrawlTask.new(
          url: url.join('/robots.txt'),
          type: :robots_txt,
          depth: 1
        ),
        follow_redirects: true
      )

      
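      # If the fetch failed with a redirect error, warn and proceed as if no robots.txt exists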
      if crawl_result.is_a?(Crawler::Data::CrawlResult::RedirectError)
        return validation_warn(:robots_txt, <<~MESSAGE)
          Our attempt at fetching a robots.txt file has failed with a redirect error.
          The crawler will proceed as if the file does not exist for this domain.
        MESSAGE
      end

      
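      # A 404 simply means there is no robots.txt file for this domain, which is fine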
      return validation_ok(:robots_txt, "No robots.txt found for #{url}.") if crawl_result.status_code == 404

      
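      # A 599 status means the request for robots.txt could not be completed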
      if crawl_result.status_code == 599
        return validation_fail(:robots_txt, <<~MESSAGE)
          Failed to fetch robots.txt.
        MESSAGE
      end

      
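      # Any other 5xx response is treated as a transient server error and fails validation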
      if crawl_result.status_code >= 500
        return validation_fail(:robots_txt, <<~MESSAGE)
          Transient error fetching robots.txt: HTTP #{crawl_result.status_code}.
          We could not proceed with crawling this site.
        MESSAGE
      end

      
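      # Register the fetched file with the robots.txt service and inspect the rules for our user agent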
      robots_txt = Crawler::RobotsTxtService.new(user_agent: crawler_api_config.user_agent)
      robots_txt.register_crawl_result(url.domain, crawl_result)

      if robots_txt.parser_for_domain(url.domain).allow_all?
        validation_ok(:robots_txt, 'Found a robots.txt and it allows us full access to the domain.')
      else
        robots_outcome = robots_txt.url_disallowed_outcome(url)

        
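        # The file has rules, but the URL being validated is still allowed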
        if robots_outcome.allowed?
          validation_warn(:robots_txt, <<~MESSAGE)
            Found a robots.txt file at #{url.join('/robots.txt')} with rules that may affect content indexing.
          MESSAGE
        else
          validation_fail(:robots_txt, <<~MESSAGE)
            Found a robots.txt file at #{url.join('/robots.txt')} and it disallows crawling of #{url}.
          MESSAGE
        end
      end
    end
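
A minimal, self-contained sketch of the status-code decision flow above, for quick reference. The method name robots_txt_outcome and the returned symbols are illustrative only and are not part of the crawler's API; the redirect-error branch is omitted because it is detected from the crawl result class rather than the status code.

    # Sketch of the status-code handling in validate_robots_txt (illustrative names).
    def robots_txt_outcome(status_code)
      case status_code
      when 404      then :ok    # no robots.txt; the domain is fully crawlable
      when 599      then :fail  # the request itself failed
      when 500..599 then :fail  # transient server error
      else :parse                # fetch succeeded; hand the body to the robots.txt parser
      end
    end

    robots_txt_outcome(404) # => :ok
    robots_txt_outcome(503) # => :fail
    robots_txt_outcome(200) # => :parse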