extract_links

in lib/crawler/coordinator.rb [511:537]


    # Gathers the links of a crawled page that are allowed to be followed.
    #
    # Asks the crawl result for its links (capped at
    # +config.max_extracted_links_count+) and logs a warning if that cap was hit.
    # Each link is then handled in one of three ways:
    # * invalid links are reported through the system logger and dropped;
    # * links with rel-nofollow, or any link when the page itself reports
    #   meta-nofollow, produce a +url_discover_denied+ event with the
    #   +:nofollow+ reason and are dropped;
    # * all remaining links contribute their +to_url+ value to the result.
    #
    # @param crawl_result the crawl result to pull links from; must respond to
    #   +extract_links+, +url+ and +meta_nofollow?+
    # @param crawl_depth passed through unchanged to the denial event
    # @return [Set] the +to_url+ values of every link that may be followed
    def extract_links(crawl_result, crawl_depth:)
      candidates, truncated =
        crawl_result
        .extract_links(limit: config.max_extracted_links_count)
        .values_at(:links, :limit_reached)
      system_logger.warn("Too many links on the page '#{crawl_result.url}'") if truncated

      followable = Set.new
      candidates.each do |candidate|
        if !candidate.valid?
          system_logger.warn("Failed to parse a link '#{candidate.link}' on '#{crawl_result.url}': #{candidate.error}")
        elsif candidate.rel_nofollow? || crawl_result.meta_nofollow?
          # The page-level check stays behind the per-link check so it is only
          # consulted for links that are not already rel-nofollow.
          events.url_discover_denied(
            url: candidate.to_url,
            source_url: crawl_result.url,
            crawl_depth:,
            deny_reason: :nofollow
          )
        else
          followable << candidate.to_url
        end
      end
      followable
    end