in lib/crawler/coordinator.rb [574:622]
def add_urls_to_backlog(urls:, type:, source_type:, crawl_depth:, source_url: nil, redirect_chain: []) # rubocop:disable Metrics/ParameterLists
  return unless urls.any?

  allowed_urls = Set.new
  added_urls_count = 0

  # Check each URL and keep only the ones we should actually crawl
  urls.each do |url| # rubocop:disable Metrics/BlockLength
    if shutdown_started?
      system_logger.warn(<<~LOG.squish)
        Received shutdown request while adding #{urls.count} URL(s) to the crawl queue.
        Some URLs have been skipped and may be missed if/when the crawl resumes.
      LOG
      break
    end

    # Skip if we have already added this URL to the backlog
    url = url.normalized_url
    next if allowed_urls.include?(url)

    # Skip unless this URL is allowed
    discover_outcome = check_discovered_url(
      url:,
      type:,
      source_url:,
      crawl_depth:
    )
    next unless discover_outcome == :allow

    allowed_urls << url
    added_urls_count += 1

    add_url_to_backlog(
      url:,
      type:,
      source_type:,
      crawl_depth:,
      source_url:,
      redirect_chain:
    )
  end

  # Seeding complete, log about it
  return unless added_urls_count.positive?

  system_logger.debug("Added #{added_urls_count} URLs from a #{source_type} source to the queue...")
  events.crawl_seed(added_urls_count, type: :content) if source_type == SEED_LIST
end
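
# A minimal usage sketch (not taken from the original file): how a caller inside
# the coordinator might hand freshly extracted links to add_urls_to_backlog.
# The names enqueue_extracted_links, current_task, extracted_links, and the
# :content / :organic values are illustrative assumptions; only the keyword
# arguments themselves come from the method signature above.
def enqueue_extracted_links(current_task, extracted_links)
  add_urls_to_backlog(
    urls: extracted_links,               # normalized and deduplicated inside add_urls_to_backlog
    type: :content,                      # hypothetical crawl type for regular page content
    source_type: :organic,               # discovered while crawling, as opposed to SEED_LIST
    crawl_depth: current_task.depth + 1, # links found on a page sit one level deeper
    source_url: current_task.url         # where the links were discovered (passed through to check_discovered_url)
  )
end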