check_discovered_url

in lib/crawler/coordinator.rb [661:738]
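Validates a newly discovered URL before it is enqueued for crawling. The checks run in order (protocol, URL length, path segments, query parameters, crawl rules, crawl depth, unique-URL limit, deduplication); the first failing check emits a url_discover_denied event with a specific deny reason and returns :deny. If every check passes, the URL is added to seen_urls, a url_discover event is emitted, and :allow is returned.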


    def check_discovered_url(url:, type:, source_url:, crawl_depth:) # rubocop:disable Metrics/PerceivedComplexity
      discover_event = {
        url:,
        source_url:,
        crawl_depth:
      }

      # Make sure it is an HTTP(S) link
      # FIXME: Feels like this should be a rules engine rule (which protocols to allow)
      unless url.supported_scheme?
        events.url_discover_denied(**discover_event.merge(deny_reason: :incorrect_protocol))
        return :deny
      end

      # Check URL length
      # FIXME: Feels like this should be a rules engine rule
      if url.request_uri.length > config.max_url_length
        events.url_discover_denied(**discover_event.merge(deny_reason: :link_too_long))
        return :deny
      end

      # Check URL segments limit
      # FIXME: Feels like this should be a rules engine rule
      if url.path_segments_count > config.max_url_segments
        events.url_discover_denied(**discover_event.merge(deny_reason: :link_with_too_many_segments))
        return :deny
      end

      # Check URL query parameters limit
      # FIXME: Feels like this should be a rules engine rule
      if url.params_count > config.max_url_params
        events.url_discover_denied(**discover_event.merge(deny_reason: :link_with_too_many_params))
        return :deny
      end

      # Check crawl rules to make sure we are allowed to crawl this URL
      # Please note: We check the rules before the crawl-level limits below, so that a given
      #              URL retains the same deny reason no matter where it is discovered
      #              (reduces confusion).
      # Sitemaps:    Sitemaps are treated specially and are not checked against the rule
      #              engine. Otherwise they would be restricted to the same domain and would
      #              also have to adhere to the configured crawl rules.
      discover_url_outcome = rule_engine.discover_url_outcome(url) unless type == :sitemap
      if discover_url_outcome&.denied?
        events.url_discover_denied(
          **discover_event.merge(
            deny_reason: discover_url_outcome.deny_reason,
            message: discover_url_outcome.message
          )
        )
        return :deny
      end

      # Check if we have gone too deep and should stop here
      if crawl_depth > config.max_crawl_depth
        events.url_discover_denied(**discover_event.merge(deny_reason: :link_too_deep))
        return :deny
      end

      # Check if we have reached the limit on the number of unique URLs we have seen
      if seen_urls.count >= config.max_unique_url_count
        events.url_discover_denied(**discover_event.merge(deny_reason: :too_many_unique_links))
        return :deny
      end

      # Skip URLs we have already seen before (and enqueued for processing).
      # Warning: This should be the last check, since it adds the URL to seen_urls and we
      #          don't want to mark a URL as seen if a later check could still deny it
      #          (e.g. a URL first discovered too deep would otherwise never be enqueued again).
      unless seen_urls.add?(url)
        events.url_discover_denied(**discover_event.merge(deny_reason: :already_seen))
        return :deny
      end

      events.url_discover(**discover_event.merge(type: :allowed))

      :allow
    end
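
A minimal caller sketch (not the actual call site) showing how the :allow/:deny return value might be used; the process_discovered_url and add_url_to_backlog names and signatures are assumptions for illustration:

    # Hypothetical caller: enqueue the URL only when every check passes.
    def process_discovered_url(url:, type:, source_url:, crawl_depth:)
      outcome = check_discovered_url(url:, type:, source_url:, crawl_depth:)
      add_url_to_backlog(url:, type:, crawl_depth:) if outcome == :allow
    end

And a hedged RSpec-style sketch of the crawl-depth check, assuming max_crawl_depth is 2 in the test config, that the method is private (hence send), and that discovered_url is a test helper that builds the crawler's URL object; the :content type value is a placeholder for any non-sitemap type:

    it 'denies links discovered deeper than max_crawl_depth' do
      outcome = coordinator.send(
        :check_discovered_url,
        url: discovered_url('https://example.com/a/b/c'),   # discovered_url is a hypothetical helper
        type: :content,                                      # placeholder for a non-sitemap type
        source_url: discovered_url('https://example.com/a/b'),
        crawl_depth: 3                                       # 3 > max_crawl_depth (2), so we expect :deny
      )
      expect(outcome).to eq(:deny)
    end

The ordering of the checks is deliberate: cheap structural checks come first, the rule engine runs before the crawl-level limits so a URL keeps a stable deny reason, and the seen_urls deduplication runs last so a URL is only marked as seen once nothing else can deny it.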