#
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
# or more contributor license agreements. Licensed under the Elastic License 2.0;
# you may not use this file except in compliance with the Elastic License 2.0.
#

# frozen_string_literal: true

require 'ostruct'

require 'faux'
require_relative 'results_collection'
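
# FauxCrawl is a spec helper that boots one or more Faux test sites, runs the
# crawler against them, and returns the collected results for assertions.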
class FauxCrawl # rubocop:disable Metrics/ClassLength
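  # Connection settings for the default Faux site used by test crawls.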
module Settings
def self.faux_url
"http://#{faux_ip}:#{faux_port}"
end
def self.faux_ip
'127.0.0.1'
end
def self.faux_port
9393
end
end
#-------------------------------------------------------------------------------------------------
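  # Convenience constructor: builds a FauxCrawl from the given sites/options and
  # immediately runs the crawl, returning the ResultsCollection.
  #
  # Example (hypothetical values; `site` is a Faux site definition and the option
  # keys match those read in #initialize):
  #
  #   results = FauxCrawl.run(site, seed_urls: ['/'], expect_success: true)
  #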
def self.run(*args)
new(*args).run
end
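  # Builds a one-off Faux site from the given block (via Faux.site) and crawls it.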
def self.crawl_site(&block)
raise ArgumentError, 'Need a block defining a site' unless block
run(Faux.site(&block))
end
#-------------------------------------------------------------------------------------------------
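  # Per-site defaults merged into each site's options in #configure_sites.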
DEFAULT_OPTIONS = {
port: Settings.faux_port,
seed_urls: ['/']
}.freeze
START_TIMEOUT = 20.seconds
attr_reader :options, :sites, :site_containers, :timeouts, :content_extraction, :default_encoding, :crawl_id,
:url_queue, :auth, :user_agent, :url, :seed_urls, :sitemap_urls, :domain_allowlist, :results,
:expect_success
delegate :crawl, to: :results
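  # Accepts one or more Faux site definitions, optionally followed by an options
  # hash. Recognized keys include :crawl_id, :url_queue, :user_agent, :auth, :url,
  # :seed_urls, :sitemap_urls, :content_extraction, :default_encoding, :timeouts
  # (with :connect_timeout, :socket_timeout and :request_timeout) and
  # :expect_success. Starts the Faux sites as a side effect.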
def initialize(*sites) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
@options = sites.extract_options!
@sites = configure_sites(*sites)
@crawl_id = options.fetch(:crawl_id, BSON::ObjectId.new.to_s)
@url_queue = options.fetch(:url_queue, enterprise_search? ? :esqueues_me : :memory_only)
@user_agent = options.fetch(:user_agent, 'Faux Crawler')
@auth = options.fetch(:auth, nil)
@url = options.fetch(:url, Settings.faux_url)
@seed_urls = coerce_to_absolute_urls(options[:seed_urls] || ["#{@url}/"])
@sitemap_urls = coerce_to_absolute_urls(options[:sitemap_urls] || [])
@domain_allowlist = seed_urls.map { |url| Crawler::Data::URL.parse(url).site }
@content_extraction = options.fetch(:content_extraction, { enabled: false, mime_types: [] })
@default_encoding = options[:default_encoding]
@timeouts = options.fetch(:timeouts, {}).slice(
:connect_timeout, :socket_timeout, :request_timeout
).compact
@results = ResultsCollection.new
@expect_success = options.fetch(:expect_success, true)
start_sites
end
#-------------------------------------------------------------------------------------------------
  # Returns a truthy value if we're running within the Enterprise Search solution test suite
def enterprise_search?
defined?(::Crawler::LocoMoco)
end
#-------------------------------------------------------------------------------------------------
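  # Normalizes each (site, options) pair into an OpenStruct, layering in the
  # crawl-level options and DEFAULT_OPTIONS, and coercing the port to a string.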
def configure_sites(*sites)
sites.collect do |(site, opts)|
opts ||= {}
opts.reverse_merge!(@options)
opts.reverse_merge!(DEFAULT_OPTIONS)
opts[:port] = opts[:port].to_s
OpenStruct.new(opts).tap { |s| s.site = site }
end
end
#-------------------------------------------------------------------------------------------------
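  # Boots a Faux::Site for each unique port in a background thread, then polls
  # each site's /status endpoint until every port answers with a 2xx response or
  # START_TIMEOUT elapses.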
def start_sites # rubocop:disable Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity, Metrics/AbcSize
sites_to_start = sites.uniq(&:port)
@site_containers = sites_to_start.collect do |site|
site_options = { port: site.port, debug: true, start: false }
the_site = Faux::Site.new(site.site, site_options)
Thread.new { the_site.start }
the_site
end
start_time = Time.now
ports_remaining = sites.map(&:port)
# Wait for all sites to start or until a timeout is reached
loop do
break if ports_remaining.empty?
time_elapsed = Time.now - start_time
break if time_elapsed > START_TIMEOUT
begin
port_to_check = ports_remaining.first
response = HTTPClient.new.get("http://127.0.0.1:#{port_to_check}/status")
ports_remaining.shift if (200..299).cover?(response.status)
rescue StandardError
# Silence errors from health checks
end
sleep 0.05
end
    return unless ports_remaining.any?

    raise "Unable to start all Faux sites; these ports never became available: #{ports_remaining.inspect}"
end
#-------------------------------------------------------------------------------------------------
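  # Shuts down all Faux site containers started by #start_sites.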
def stop_sites
site_containers.each(&:stop)
end
#-------------------------------------------------------------------------------------------------
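  # Configures and runs the crawl, raising if it did not succeed while
  # expect_success is set. The Faux sites are always stopped afterwards.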
def run
# Prepare crawl configuration
configure_crawl
# Perform the crawl
crawl.start!
# Check the outcome
raise "Test Crawl failed! Outcome: #{results.outcome_message}" if expect_success && results.outcome != :success
results
ensure
stop_sites
end
#-------------------------------------------------------------------------------------------------
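  # Builds the crawler configuration hash (domains, auth, timeouts, content
  # extraction, etc.) and instantiates the API::Config and API::Crawl objects on
  # the results collection. Enterprise Search runs also get allow-all crawl rules
  # and default deduplication settings for each allowed domain.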
def configure_crawl # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
# Prepare crawl config
config = {
crawl_id: crawl_id,
auth: auth,
user_agent: user_agent,
domains: [
{
url: url,
seed_urls: seed_urls,
sitemap_urls: sitemap_urls
}
],
binary_content_extraction_enabled: content_extraction.fetch(:enabled),
binary_content_extraction_mime_types: content_extraction.fetch(:mime_types),
output_sink: :mock,
results_collection: results,
http_auth_allowed: true,
loopback_allowed: true,
private_networks_allowed: true,
url_queue: url_queue
}
config.merge!(timeouts)
config[:default_encoding] = default_encoding if default_encoding
# Add crawl rules for Enterprise Search tests
if enterprise_search?
# Allow all traffic
config[:crawl_rules] = domain_allowlist.map do |domain|
{
policy: 'allow',
url_pattern: "\\A#{Regexp.escape(domain)}"
}
end
# Use default dedup settings
config[:deduplication_settings] = domain_allowlist.map do |domain|
{
fields: SharedTogo::Crawler.default_deduplication_fields,
url_pattern: "\\A#{Regexp.escape(domain)}"
}
end
end
# When running within the solution test suite, use the solution API
crawler_module = enterprise_search? ? ::Crawler::LocoMoco : ::Crawler
# Setup the crawler
results.crawl_config = crawler_module::API::Config.new(config)
results.crawl = crawler_module::API::Crawl.new(results.crawl_config)
end
#-------------------------------------------------------------------------------------------------
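  # Converts path-only links (e.g. '/sitemap.xml') into absolute URLs on the
  # default Faux site; already-absolute URLs are parsed and normalized as-is.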
  def coerce_to_absolute_urls(links)
    links.map do |link|
      if %r{^/}.match?(link)
        # Relative path: resolve it against the default Faux site URL
        base_url = ::Crawler::Data::URL.parse(Settings.faux_url)
        base_url.join(link).to_s
      else
        # Already absolute: parse it to normalize the value
        ::Crawler::Data::URL.parse(link).to_s
      end
    end
  end
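  # Prints a message to the console, colored via the colorize gem.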
def log(message, color = :default)
puts message.colorize(color: color)
end
end