add_urls_to_backlog

in lib/crawler/coordinator.rb [574:622]


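    # Normalizes and de-duplicates the given URLs, runs each one through the
    # discovery checks, and adds the allowed ones to the crawl queue.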
    def add_urls_to_backlog(urls:, type:, source_type:, crawl_depth:, source_url: nil, redirect_chain: []) # rubocop:disable Metrics/ParameterLists
      return unless urls.any?

      allowed_urls = Set.new
      added_urls_count = 0

      # Check each URL and keep only the ones we should actually crawl
      urls.each do |url| # rubocop:disable Metrics/BlockLength
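        # Stop enqueueing if a shutdown was requested; the remaining URLs are skipped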
        if shutdown_started?
          system_logger.warn(<<~LOG.squish)
            Received shutdown request while adding #{urls.count} URL(s) to the crawl queue.
            Some URLs have been skipped and may be missed if/when the crawl resumes.
          LOG
          break
        end

        # Skip if we have already added this URL to the backlog
        url = url.normalized_url
        next if allowed_urls.include?(url)

        # Skip unless this URL is allowed
        discover_outcome = check_discovered_url(
          url:,
          type:,
          source_url:,
          crawl_depth:
        )
        next unless discover_outcome == :allow

        allowed_urls << url
        added_urls_count += 1

        add_url_to_backlog(
          url:,
          type:,
          source_type:,
          crawl_depth:,
          source_url:,
          redirect_chain:
        )
      end

      # Done adding URLs; log a summary if anything was enqueued
      return unless added_urls_count.positive?

      system_logger.debug("Added #{added_urls_count} URLs from a #{source_type} source to the queue...")
      events.crawl_seed(added_urls_count, type: :content) if source_type == SEED_LIST
    end
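
A minimal call-site sketch, assuming the method is invoked from within the coordinator itself after links have been extracted from a crawled page. The variable names and the :content / :organic values are illustrative assumptions, not taken from the listing above; the urls: collection is expected to hold URL objects that respond to normalized_url.

    # Hypothetical internal call after link extraction (illustrative only).
    # `extracted_links`, `page_url` and `current_depth` are assumed names.
    add_urls_to_backlog(
      urls: extracted_links,          # URL objects responding to #normalized_url
      type: :content,                 # crawl result type (assumed value)
      source_type: :organic,          # how the URLs were discovered (assumed value)
      crawl_depth: current_depth + 1, # discovered links sit one level deeper
      source_url: page_url            # page the links were extracted from
    )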