#
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
# or more contributor license agreements. Licensed under the Elastic License 2.0;
# you may not use this file except in compliance with the Elastic License 2.0.
#
# frozen_string_literal: true

require 'base64'
require 'webrick/httpproxy'

RSpec.describe(Crawler::HttpClient) do
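  # By default the client is locked down: loopback and private-network hosts are
  # rejected (see the '#get' examples below); individual contexts relax these
  # settings as needed via `super().merge(...)`.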
  let(:client_config) do
    {
      loopback_allowed: false,
      private_networks_allowed: false,
      logger: Crawler::Logging::CrawlLogger.new
    }
  end

  let(:client) { Crawler::HttpClient.new(client_config) }

  #-------------------------------------------------------------------------------------------------
  let(:site_server_settings) do
    {
      port: 12_345,
      debug: true
    }
  end

  let(:site) { Faux.site { page '/' } }
  let(:site_server) { Faux::Site.new(site, site_server_settings) }

  #-------------------------------------------------------------------------------------------------
  # Stubs DNS resolution in HTTP client to always return localhost IPs
  def stub_http_resolver!
    allow_any_instance_of(Crawler::HttpUtils::FilteringDnsResolver).to receive(:resolve) do |resolver, _host|
      resolver.default_resolver.resolve('localhost')
    end
  end
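
  # Combined with with_example_site below, this lets tests request arbitrary
  # hostnames (e.g. https://example.org:12345 in the SSL examples) while
  # actually talking to the local Faux server.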

  # Runs a given block of code with a site running on localhost:12345
  def with_example_site
    site_server # start the site
    stub_http_resolver!
    sleep(1) # give the server a moment to start accepting connections
    yield
  ensure
    site_server.stop
  end

  #-------------------------------------------------------------------------------------------------
  context '#get' do
    def get(url)
      client.get(Crawler::Data::URL.parse(url))
    end

    def expect_result(url, result_code)
      result = get(url)
      expect(result).to be_a(Crawler::HttpUtils::Response)
      expect(result.code).to eq(result_code)
    end

    def expect_success(url)
      expect_result(url, 200)
    end

    it 'should work' do
      expect_success('https://www.elastic.co')
    end

    it 'should not follow redirects automatically' do
      result = get('http://www.elastic.co')
      expect(result.code).to eq(301)
      expect(result.headers['location']).to eq('https://www.elastic.co/')
    end

    it 'rejects loopback addresses' do
      expect do
        get('http://localhost:9200').body
      end.to raise_error(Crawler::HttpUtils::InvalidHost)
    end

    it 'rejects private addresses' do
      expect do
        get('http://monitoring.swiftype.net').body
      end.to raise_error(Crawler::HttpUtils::InvalidHost)
    end

    #-----------------------------------------------------------------------------------------------
    context 'with a configured timeout' do
      let(:client_config) do
        super().merge(
          loopback_allowed: true,
          connection_request_timeout: 10
        )
      end
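
      # With a 10-second budget, the 15-second page below must time out while
      # the 5-second page must respond in time.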

      context 'for a slow site' do
        let(:site) { Faux.site { page('/') { sleep 15 } } }

        it 'should timeout' do
          with_example_site do
            expect do
              get('http://localhost:12345')
            end.to raise_error(Crawler::HttpUtils::SocketTimeout, /Read timed out/)
          end
        end
      end

      context 'for a site that responds before configured timeout' do
        let(:site) { Faux.site { page('/') { sleep 5 } } }

        it 'should not timeout' do
          with_example_site do
            expect_success('http://localhost:12345')
          end
        end
      end
    end

    #-----------------------------------------------------------------------------------------------
    context 'with a proxy server configuration' do
      let(:proxy_port) { 12_346 }
      let(:client_config) do
        super().merge(
          loopback_allowed: true,
          http_proxy_host: 'localhost',
          http_proxy_port: proxy_port
        )
      end

      let(:proxy_requests) { [] }
      let(:proxy_handler) do
        proc do |request, _response|
          proxy_requests << request
        end
      end
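
      # The handler above records every request that passes through the proxy,
      # letting the examples below assert that traffic really went through it.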

      let(:proxy_auth_proc) { nil }
      let(:proxy) do
        WEBrick::HTTPProxyServer.new(
          Port: proxy_port,
          AccessLog: [
            [$stderr, WEBrick::AccessLog::COMMON_LOG_FORMAT],
            [$stderr, WEBrick::AccessLog::REFERER_LOG_FORMAT]
          ],
          ProxyContentHandler: proxy_handler,
          ProxyAuthProc: proxy_auth_proc
        )
      end

      # Runs a given block of code with the proxy running on localhost:12346
      def with_proxy
        Thread.new { proxy.start }
        sleep(1) # give the proxy a moment to start accepting connections
        yield
      ensure
        proxy.shutdown
      end

      it 'should use the proxy' do
        with_proxy do
          with_example_site do
            expect_result('http://localhost:12345', 200)
            expect_result('http://localhost:12345/hello', 404)
            expect(proxy_requests.count).to eq(2)
            expect(proxy_requests.map(&:path)).to eq(['/', '/hello'])
          end
        end
      end

      context 'with proxy auth enabled' do
        let(:proxy_user) { 'hello_user' }
        let(:proxy_pass) { 'paZZZwd' }
        let(:client_config) do
          super().merge(
            http_proxy_username: proxy_user,
            http_proxy_password: proxy_pass
          )
        end

        # Make sure the proxy checks auth headers
        let(:proxy_auth_proc) do
          proc do |req, res|
            auth = req['proxy-authorization']
            unless auth
              # The first time the HTTP client sends a request, it won't include the auth header.
              # We respond with a 407 here and ask for credentials, forcing the client to retry.
              res['Proxy-Authenticate'] = 'Basic realm="WEBrick Proxy"'
              raise WEBrick::HTTPStatus::ProxyAuthenticationRequired, 'No auth header!'
            end

            _auth_type, auth_string = auth.split(' ', 2)
            user, password = Base64.strict_decode64(auth_string).split(':', 2)
            raise WEBrick::HTTPStatus::ProxyAuthenticationRequired unless user == proxy_user && password == proxy_pass
          end
        end
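
        # The resulting exchange looks roughly like this (illustrative, not asserted):
        #   > GET http://localhost:12345/                          (no credentials yet)
        #   < 407 Proxy Authentication Required
        #   < Proxy-Authenticate: Basic realm="WEBrick Proxy"
        #   > GET http://localhost:12345/
        #   > Proxy-Authorization: Basic aGVsbG9fdXNlcjpwYVpaWndk  (Base64 of "hello_user:paZZZwd")
        #   < 200 OK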

        it 'should work' do
          with_proxy do
            with_example_site do
              expect_result('http://localhost:12345', 200)
              expect_result('http://localhost:12345/hello', 404)
              expect(proxy_requests.count).to eq(2)
              expect(proxy_requests.map(&:path)).to eq(['/', '/hello'])
            end
          end
        end

        context 'with invalid proxy credentials' do
          let(:client_config) do
            super().merge(http_proxy_password: 'banana')
          end

          it 'should fail properly' do
            with_proxy do
              with_example_site do
                expect_result('http://localhost:12345', 407)
                expect(proxy_requests).to be_empty
              end
            end
          end
        end
      end
    end

    #-----------------------------------------------------------------------------------------------
    context 'content encoding' do
      let(:client_config) do
        super().merge(loopback_allowed: true)
      end

      let(:mock_requests) { [] }
      let(:mock_handler) do
        proc do |request, _response|
          mock_requests << request
        end
      end
      let(:mock_server) do
        WEBrick::HTTPServer.new(
          Port: 12_347,
          RequestCallback: mock_handler
        )
      end

      # Runs a given block of code with the mock server running on localhost:12347
      def with_mock_server
        Thread.new { mock_server.start }
        sleep(1) # give the server a moment to start accepting connections
        yield
      ensure
        mock_server.shutdown
      end

      it 'should set the Accept-Encoding header by default' do
        with_mock_server do
          get('http://localhost:12347/')
          expect(mock_requests.first.accept_encoding.sort).to eq(
            Crawler::HttpClient::CONTENT_DECODERS.keys.sort
          )
        end
      end
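
      # On the wire this amounts to a request header listing every registered
      # decoder, e.g. (assuming gzip and deflate decoders are registered):
      #   Accept-Encoding: deflate, gzip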

      context 'with compression disabled' do
        let(:client_config) do
          super().merge(compression_enabled: false)
        end

        it 'should not set the Accept-Encoding header' do
          with_mock_server do
            get('http://localhost:12347/')
            expect(mock_requests.first.accept_encoding).to be_empty
          end
        end
      end
    end

    #-----------------------------------------------------------------------------------------------
    context 'with SSL settings' do
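      # The verification modes exercised below follow the usual Elastic convention:
      #   full        - verify the certificate chain AND that the hostname matches
      #   certificate - verify the chain only, skipping hostname checks
      #   none        - skip verification entirely (even expired certificates are accepted)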
      let(:ca_certs) { [] }
      let(:ssl_mode) { 'full' }
      let(:url) { 'https://example.org' }
      let(:domains) { [{ url: }] }
      let(:crawler_config) do
        Crawler::API::Config.new(
          domains:,
          ssl_ca_certificates: ca_certs,
          ssl_verification_mode: ssl_mode
        )
      end
      let(:client_config) do
        super().merge(
          loopback_allowed: true,
          ssl_ca_certificates: crawler_config.ssl_ca_certificates,
          ssl_verification_mode: crawler_config.ssl_verification_mode
        )
      end

      let(:ssl_fixture) { 'self-signed' }
      let(:site_server_settings) do
        super().merge(
          ssl: true,
          ssl_certificate: fixture_file('ssl', ssl_fixture, 'example.crt'),
          ssl_key: fixture_file('ssl', ssl_fixture, 'example.key')
        )
      end
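
      # The fixture certificates are issued for example.org, so together with the
      # DNS stub a request to https://example.org:12345 reaches the local server
      # with a matching certificate subject, while https://localhost:12345
      # exercises hostname-verification failures.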

      it 'should work with public certificates' do
        expect_success('https://www.elastic.co')
      end

      it 'should fail SSL handshake with self-signed certs' do
        with_example_site do
          expect { get('https://example.org:12345') }.to raise_error(
            Crawler::HttpUtils::SslException,
            /unable to find valid certification path/
          )
        end
      end

      context 'when custom CA certs are configured' do
        let(:ca_certs) { [fixture_file('ssl', 'ca.crt')] }

        it 'should still work with public certificates' do
          expect_success('https://www.elastic.co')
        end

        it 'should work with sites signed with the configured CA' do
          with_example_site do
            expect_success('https://example.org:12345')
          end
        end

        it 'should validate server names' do
          with_example_site do
            expect { get('https://localhost:12345') }.to raise_error(
              Crawler::HttpUtils::SslException,
              /doesn't match common name of the certificate subject/
            )
          end
        end

        context 'when seeing an expired SSL certificate' do
          let(:ssl_fixture) { 'expired' }

          it 'should fail' do
            with_example_site do
              expect { get('https://example.org:12345') }.to raise_error(
                Crawler::HttpUtils::SslCertificateExpiredError,
                /SSL certificate expired/
              )
            end
          end
        end

        context 'with ssl_verification_mode=certificate' do
          let(:ssl_mode) { 'certificate' }

          it 'should not validate server names' do
            with_example_site do
              expect_success('https://localhost:12345')
            end
          end

          context 'when seeing an expired SSL certificate' do
            let(:ssl_fixture) { 'expired' }

            it 'should fail' do
              with_example_site do
                expect { get('https://example.org:12345') }.to raise_error(
                  Crawler::HttpUtils::SslCertificateExpiredError,
                  /SSL certificate expired/
                )
              end
            end
          end
        end

        context 'with ssl_verification_mode=none' do
          let(:ssl_mode) { 'none' }

          it 'should not validate server names' do
            with_example_site do
              expect_success('https://localhost:12345')
            end
          end

          context 'when seeing an expired SSL certificate' do
            let(:ssl_fixture) { 'expired' }

            it 'should ignore the failure' do
              with_example_site do
                expect_success('https://localhost:12345')
              end
            end
          end
        end
      end
    end
  end
end