spec/integration/timeouts/request_timeout_spec.rb

#
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
# or more contributor license agreements. Licensed under the Elastic License 2.0;
# you may not use this file except in compliance with the Elastic License 2.0.
#

# frozen_string_literal: true

require 'socket'

RSpec.describe 'Request to a site that is sending the data back really slowly' do
  # rubocop:disable Lint/ConstantDefinitionInBlock

  # A minimal HTTP server that serves its root page normally, but trickles out
  # the /timeout response one line per second to trigger the request timeout.
  class VerySlowServer
    PORT = 10_000
    RESPONSE_DURATION = 20 # seconds

    ROOT_PAGE = <<~HTML
      <html>
        <body>
          <a href="/timeout">Timeout page is here</a>
        </body>
      </html>
    HTML

    def root_url
      "http://127.0.0.1:#{PORT}"
    end

    # Accepts connections one at a time and serves a single request per connection
    def start
      TCPServer.open('127.0.0.1', PORT) do |serv|
        loop do
          sock = serv.accept
          req = sock.gets("\r\n\r\n")
          handle_request(req, sock)
        ensure
          sock&.close
        end
      end
    end

    def handle_request(req, sock)
      sock.print "HTTP/1.0 200 OK\r\n"
      sock.print "Content-Type: text/html\r\n"

      if req.start_with?('GET /timeout')
        #
        # If the server provides a content length, Apache HTTP client does not
        # allow us to close the connection prematurely and keeps reading the content
        # until it reaches the end or until 2048 bytes have been consumed.
        #
        # So, to make the test faster, we do not send a content length for now.
        # When the issue is fixed, we should uncomment the line below.
        #
        # sock.print "Content-Length: 80\r\n"
        #
        sock.print "\r\n"

        puts 'Slowly sending response lines...'
        RESPONSE_DURATION.times do |i|
          sleep 1
          sock.print "no\r\n"
          puts "[#{i}] One... line... at... a... time..."
        end
      else
        root_page_payload = "#{ROOT_PAGE}\r\n"
        sock.print "Content-Length: #{root_page_payload.length}\r\n\r\n"
        sock.print root_page_payload
      end
    end
  end
  # rubocop:enable Lint/ConstantDefinitionInBlock

  it 'times out' do
    # Start a very slow server on a separate port
    slow_server = VerySlowServer.new
    slow_thread = Thread.new { slow_server.start }

    # Configure and run a crawl against the slow server
    results = FauxCrawl.run(
      Faux.site, # This will never actually be called, since we seed the crawl with the slow site
      timeouts: { request_timeout: 2 },
      url: slow_server.root_url
    )

    # Should only have a single result (home page)
    expect(results).to have_only_these_results [
      mock_response(url: "#{slow_server.root_url}/", status_code: 200)
    ]

    # Should properly count visited pages
    stats = results.crawl.stats
    expect(stats.status_code_counts).to eq(
      '200' => 2, # robots.txt + home page
      '599' => 1  # /timeout
    )

    # Should properly enforce the timeout
    expect(stats.time_spent_crawling_msec).to be < VerySlowServer::RESPONSE_DURATION * 1000
  ensure
    slow_thread.kill
  end
end