spec/integration/content_extraction

#
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
# or more contributor license agreements. Licensed under the Elastic License 2.0;
# you may not use this file except in compliance with the Elastic License 2.0.
#

# frozen_string_literal: true

# Integration spec: crawls a fake site whose pages are served with different
# Content-Type headers while content extraction is enabled for a subset of
# MIME types, then checks exactly which URLs show up in the crawl results.
RSpec.describe 'Content extractable file support' do
  # Fixture site: an index page linking to four pages, each of which declares
  # a different Content-Type response header.
  let(:site) do
    Faux.site do
      page '/' do
        body do
          link_to '/html'
          link_to '/pdf'
          link_to '/powerpoint'
          link_to '/word'
        end
      end

      page '/html' do
        headers 'Content-Type' => 'text/html; charset=UTF-8'
      end

      page '/pdf' do
        headers 'Content-Type' => 'application/pdf'
      end

      page '/powerpoint' do
        headers 'Content-Type' => 'application/vnd.ms-powerpoint'
      end

      page '/word' do
        headers 'Content-Type' => 'application/msword'
      end
    end
  end

  # MIME types passed to the crawl as extractable. 'application/msword' is
  # deliberately absent, so '/word' is the one linked page not listed here
  # (besides the HTML page).
  let(:extractable_mime_types) do
    %w[
      application/pdf
      application/vnd.ms-powerpoint
    ]
  end

  # Options handed to FauxCrawl.run under the :content_extraction key.
  let(:content_extraction_options) do
    { enabled: true, mime_types: extractable_mime_types }
  end

  # NOTE(review): the example name mentions "single and multiple Content-Type
  # headers", but every fixture page sets exactly one Content-Type header and
  # the assertions are about which MIME types get crawled. The name looks
  # copied from another spec; confirm and rename if so.
  it 'supports single and multiple Content-Type headers' do
    crawl_results = FauxCrawl.run(site, content_extraction: content_extraction_options)

    # '/word' is intentionally missing from the expected list; the matcher
    # name suggests any extra result fails the example (presumably exclusive;
    # verify against the matcher definition).
    expected_responses = [
      'http://127.0.0.1:9393/',
      'http://127.0.0.1:9393/html',
      'http://127.0.0.1:9393/pdf',
      'http://127.0.0.1:9393/powerpoint'
    ].map { |page_url| mock_response(url: page_url, status_code: 200) }

    expect(crawl_results).to have_only_these_results(expected_responses)
  end
end

spec/integration/content_extraction_spec.rb (44 lines of code) (raw):