#!/usr/bin/env ruby # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. =begin Checks a mirror URL for compliance with ASF mirroring guidelines. Derived partly from https://svn.apache.org/repos/asf/infrastructure/site-tools/trunk/mirrors/check_mirror.pl TODO this is a work in progress... Ideally the causes of some of the problems should be reported ... Note: the GUI interface is currently at www/members/mirror_check.cgi =end require 'wunderbar' require 'net/http' =begin Checks performed: (F=fatal, E=error, W=warn) F: zzz/time.txt is readable F: its contents is a number followed by text W: test whether time is more than 1 day old W: test its content-type (missing or text/plain) F: BASE is readable and non-empty W: has html + body headers and body/html trailers W: body matches m!<(img|IMG) (src|SRC)="/icons/! E: check index against TLP list m!> ?$dir/?Apache Software Foundation Distribution Meta-Directory/ E: footer must match /This directory contains meta-data for the ASF mirroring system./ E: mirror-tests/ must exist W: its files must not have content-encoding: 1mb.img.7z 1mb.img.bz2 1mb.img.tar.gz 1mb.img.tgz 1mb.img.zip W: zzz/mirror-tests/redirect-test/ should redirect to http://www.apache.org/ (302) TODO - any more checks? =end URLMATCH = %r!^https?://[^/]+/(\S+/)?$!i HTTPDIRS = %w(zzz/ zzz/mirror-tests/) # must exist HDRMATCH = %r!Apache Software Foundation Distribution Meta-Directory! # must be on the zzz index page FTRMATCH = %r!This directory contains meta-data for the ASF mirroring system.! # must be on the zzz index page HASHDR = %r!]+)?>.+?!im HASFTR = %r!.*?!im HTTPDIR = 'zzz/' # must appear in index page HTTP404 = 'zzz/___'; # Non-existent URL; should generate 404 HTTPTEXT = 'zzz/README'; # text file (without extension) should generate Content-Type text/plain or none MIRRORTEST = 'zzz/mirror-tests/'; MIRRORTEST_FILES = %w(1mb.img.7z 1mb.img.bz2 1mb.img.tar.gz 1mb.img.tgz 1mb.img.zip) # no Content-Encoding ! # save the result of a test def test(severity, txt) @tests << {severity => txt} @fails+=1 unless severity == :I end def F(txt) test(:F, txt) end def E(txt) test(:E, txt) end def W(txt) test(:W, txt) end def I(txt) test(:I, txt) end # extract test entries with key k def tests(k) @tests.map{|t| t[k]}.compact end # get an HTTP URL def getHTTPHdrs(url) uri = URI.parse(url) http = Net::HTTP.new(uri.host, uri.port) http.use_ssl = uri.scheme == 'https' request = Net::HTTP::Head.new(uri.request_uri) http.request(request) end def check_redirect(base, page, expectedLocation, severity=:W, expectedStatus = '302', log=true) path = base + page response = getHTTPHdrs(path) if response.code != expectedStatus test severity, "HTTP status #{response.code} for '#{path}'" unless severity == nil return nil end if response['location'] != expectedLocation test severity, "HTTP location #{response['location']} for '#{path}' - expected '#{expectedLocation}'" unless severity == nil return nil end I "Fetched #{path} - redirected OK to #{response['location']}" if log response end def check_CT(base, page, severity=:E, expectedStatus = '200') path = base + page response = getHTTPHdrs(path) if response.code != expectedStatus test severity, "HTTP status #{response.code} for '#{path}'" unless severity == nil return nil end ct = response['Content-Type'] || 'unknown' ce = response['Content-Encoding'] # TODO also check CT - some mirrors return text/plain for img?? if ce W "Checking #{path} - Content-Type: #{ct} WARN: Content-Encoding: #{ce}" else I "Checking #{path} - Content-Type: #{ct}" end end # get an HTTP URL=> response def getHTTP(url) uri = URI.parse(url) http = Net::HTTP.new(uri.host, uri.port) http.use_ssl = uri.scheme == 'https' request = Net::HTTP::Get.new(uri.request_uri) http.request(request) end # check page can be read => body def check_page(base, page, severity=:E, expectedStatus='200', log=true) path = base + page response = getHTTP(path) code = response.code || '?' if code != expectedStatus test(severity, "Fetched #{path} - HTTP status: #{code} expected: #{expectedStatus}") unless severity == nil return nil end I "Fetched #{path} - OK" if log response.body end def checkIndex(page, type) asfData = @pages[type] links = parseIndexPage(page) if type == :tlps fav = links.index('favicon.ico') zzz = links.index('zzz') if fav and zzz if fav < zzz W "Index for #{type}: incorrect #{type} page order - found favicon.ico before zzz/; folders should be listed before files" else I "Index for #{type}: found favicon.ico and zzz/ in the page in the correct order (i.e. folders are listed before files)" end else W "Index for #{type}: expected to find favicon.ico #{fav} and zzz/ #{zzz} in the page, but at least one is missing" end end links.each {|l| W "Index for #{type}: the link #{l} is not shown on ASF site" unless asfData.include? l } asfData.each {|l| W "Index for #{type}: the link #{l} is not shown on the mirror site" unless links.include? l or l == 'openoffice' } end # nginx activemq/-2019-Nov-25 18:00 # ASF [DIR]

accumulo/2019-08-07 23:42 - # parse an HTTP server Index page => array of file/folder names def parseIndexPage(page) folders = [] # ASF main page references currently look like this: abdera/ # the Perl script looked for this match: m!> ?$dir/?([.a-z0-9-]+)/?}) links.each { |l| if l[1] == l[0] folders << l[1] end } folders end # Check page has sensible headers and footers def checkHdrFtr(path, body) hasHTMLhdr = HASHDR.match(body) hasHTMLftr = HASFTR.match(body) if hasHTMLhdr if hasHTMLftr I "#{path} has header and footer" else W "#{path} is incomplete - no footer found" end else # no header if hasHTMLftr W "#{path} is incomplete - no header found" else W "#{path} is incomplete - no header or footer found" end end end # Suite: perform all the HTTP checks def checkHTTP(base) # We don't check the pattern on the form for two reasons: # - not all browsers support it # - allows the input to be more flexible # Fix up the URL base.strip! base += '/' unless base.end_with? '/' base = 'http://' + base unless base.start_with? 'http' # Now check the syntax: I "Checking #{base} ..." unless URLMATCH.match(base) F "Invalid URL syntax: #{base}" return end setup response = getHTTPHdrs(base) server = response['server'] if server =~ /Apache/ I "Server: #{server}" else W "Server: '#{server}' - expected 'Apache' in server response" end # Check the mirror time (and that zzz/ is readable) time = check_page(base, 'zzz/time.txt', severity = :F) if time match = /^(\d+) \S+$/.match(time) if match now = Time.now.to_i stamp = match[1].to_i age = (now - stamp)/60 # minutes if age > 60*24 W "Mirror is over 1 day old: #{age} minutes" else I "Mirror is less than 1 day old: #{age} minutes" end else F "Invalid time.txt contents: #{time}" end else return # cannot process further (already recorded the error end # check the main body body = check_page(base, '') checkHdrFtr(base, body) if %r{<(img|IMG) (src|SRC)="/icons/}.match(body) I 'Index page has icons as expected' else W 'Missing or unexpected img icon tags' end checkIndex(body, :tlps) ibody = check_page(base, 'incubator/') checkHdrFtr(base+'incubator/', ibody) checkIndex(ibody, :podlings) check_page(base, 'harmony/', :E, expectedStatus='404') zbody = check_page(base, HTTPDIR) # Not sure this is useful on its own anymore # It was originally used to detect sites with advertising wrappers, # but most recent examples have been tables around directory listings # which is obviously OK as it does not affect the user experience. # if %r{ tlps, :podlings => podlings} end def showList(list, header) unless list.empty? _h2_ header _ul do list.each { |item| _li item } end end end def display fatals = tests(:F) errors = tests(:E) warns = tests(:W) if !fatals.empty? _h2_.bg_danger "The mirror at #@url failed our checks:" elsif !errors.empty? _h2_.bg_warning "The mirror at #@url has some problems:" elsif !warns.empty? _h2_.bg_warning "The mirror at #@url has some minor issues" else _h2_.bg_success "The mirror at #@url looks OK, thanks for using this service" end if @fails > 0 showList(fatals, 'Fatal errors:') showList(errors, 'Errors:') showList(warns, 'Warnings:') # Cannot easily copy/paste URLs; use layout suitable for copy/paste into e.g. JIRA issue/e-mail _p do _ 'Please see the Apache mirror configuration instructions [1] for further details on configuring your mirror server.' end _p do _ '[1] ' _a 'http://www.apache.org/info/how-to-mirror.html#Configuration', href: 'http://www.apache.org/info/how-to-mirror.html#Configuration' end end _h2_ 'Tests performed' _ol do @tests.each { |t| t.map{|k,v| _li "#{k}: - #{v}"}} end _h4_ 'F: fatal, E: Error, W: warning, I: info (success)' end # Called by GUI when POST is pushed def doPost(url) init checkHTTP(url) display end if __FILE__ == $0 init url = ''+ARGV[0] || 'localhost' # easier to test in an IDE checkHTTP(url) # display the test results @tests.each { |t| t.map{|k, v| puts "#{k}: - #{v}"}} if @fails > 0 puts "#{url} had #{@fails} errors" else puts "#{url} passed all the tests" end end