in lib/crawler/coordinator.rb [661:738]
def check_discovered_url(url:, type:, source_url:, crawl_depth:) # rubocop:disable Metrics/PerceivedComplexity
  # Shared payload for the url_discover / url_discover_denied events emitted below
  discover_event = {
    url:,
    source_url:,
    crawl_depth:
  }

  # Make sure it is an HTTP(S) link; anything else (e.g. mailto: or ftp: links) is denied
  # FIXME: Feels like this should be a rules engine rule (which protocols to allow)
  unless url.supported_scheme?
    events.url_discover_denied(**discover_event.merge(deny_reason: :incorrect_protocol))
    return :deny
  end

  # Check URL length
  # FIXME: Feels like this should be a rules engine rule
  if url.request_uri.length > config.max_url_length
    events.url_discover_denied(**discover_event.merge(deny_reason: :link_too_long))
    return :deny
  end

  # Check URL segments limit
  # FIXME: Feels like this should be a rules engine rule
  if url.path_segments_count > config.max_url_segments
    events.url_discover_denied(**discover_event.merge(deny_reason: :link_with_too_many_segments))
    return :deny
  end

  # Check URL query parameters limit
  # FIXME: Feels like this should be a rules engine rule
  if url.params_count > config.max_url_params
    events.url_discover_denied(**discover_event.merge(deny_reason: :link_with_too_many_params))
    return :deny
  end
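
  # (Illustrative only.) For a link such as "https://example.com/a/b/c?x=1&y=2", the values
  # checked above would be the request URI length (here "/a/b/c?x=1&y=2", 14 characters),
  # the number of path segments (3) and the number of query parameters (2).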

  # Check crawl rules to make sure we are allowed to crawl this URL.
  # Please note: We check the rules before crawl-level limits, so that a given URL keeps the
  #              same deny reason no matter where we find it (reduces confusion).
  # Sitemaps:    Sitemaps are treated specially and are not checked against the rule engine.
  #              Otherwise they would be restricted to the same domain and would also have to
  #              adhere to the configured crawl rules.
  discover_url_outcome = rule_engine.discover_url_outcome(url) unless type == :sitemap

  # For sitemaps the outcome stays nil, so the safe navigation below skips the denial branch
  if discover_url_outcome&.denied?
    events.url_discover_denied(
      **discover_event.merge(
        deny_reason: discover_url_outcome.deny_reason,
        message: discover_url_outcome.message
      )
    )
    return :deny
  end

  # Check if we went deep enough and should stop here
  if crawl_depth > config.max_crawl_depth
    events.url_discover_denied(**discover_event.merge(deny_reason: :link_too_deep))
    return :deny
  end
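
  # Note: the comparison above is exclusive, so a link whose crawl_depth equals
  # config.max_crawl_depth is still allowed; only links discovered deeper than that are denied.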

  # Check if we have reached the limit on the number of unique URLs we have seen
  if seen_urls.count >= config.max_unique_url_count
    events.url_discover_denied(**discover_event.merge(deny_reason: :too_many_unique_links))
    return :deny
  end

  # Skip URLs we have already seen before (and enqueued for processing).
  # Warning: This should be the last check, since it adds the URL to seen_urls and we don't
  #          want to mark a URL as seen if a later check could still deny it.
  # Note: #add? returns a falsy value when the URL has been seen before, so duplicates land here.
  unless seen_urls.add?(url)
    events.url_discover_denied(**discover_event.merge(deny_reason: :already_seen))
    return :deny
  end
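
  # Every check passed: emit the discovery event and let the caller act on the :allow result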
  events.url_discover(**discover_event.merge(type: :allowed))
  :allow
end
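
A minimal sketch of how a call site might consume the return value (the wrapper and the
enqueue_crawl_task helper below are hypothetical; the coordinator's real call sites may differ):

def maybe_enqueue_discovered_url(url:, type:, source_url:, crawl_depth:)
  # Run the checks above; every deny path has already emitted its own url_discover_denied event
  outcome = check_discovered_url(url:, type:, source_url:, crawl_depth:)
  return unless outcome == :allow

  # Hypothetical helper: push the allowed URL onto the crawl queue at the given depth
  enqueue_crawl_task(url:, type:, source_url:, crawl_depth:)
end

Because each deny branch both emits an event and returns early, a caller only needs to branch
on the :allow / :deny symbol; it never has to know why a particular URL was rejected.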