tools/download

#!/usr/bin/env ruby # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. =begin Checks a download page URL for compliance with ASF guidelines. Note: the GUI interface is currently at www/members/download_check.cgi =end $LOAD_PATH.unshift '/srv/whimsy/lib' require 'whimsy/asf' require 'wunderbar' require 'net/http' require 'nokogiri' require 'time' =begin Checks performed: (F=fatal, E=error, W=warn) TBA =end $CLI = false $VERBOSE = false $ARCHIVE_CHECK = false $ALWAYS_CHECK_LINKS = false $NO_CHECK_LINKS = false $NOFOLLOW = false # may be reset $ALLOW_HTTP = false # http links generate Warning, not Error $FAIL_FAST = false $SHOW_LINKS = false $VERSION = nil # Check archives have hash and sig $vercheck = {} # key = archive name, value = array of [type, hash/sig...] # collect versions for summary display $versions = Hash.new {|h1, k1| h1[k1] = Hash.new {|h2, k2| h2[k2] = Array.new} } # key = version, value = Hash, key = arch basename, value = array of [extensions] # match an artifact # TODO detect artifacts by URL as well if possible # $1 = base, $2 = extension # OOO SF links end in /download ARTIFACT_RE = %r{/([^/]+\.(pom|crate|tar|tar\.xz|tar\.gz|deb|nbm|dmg|sh|zip|tgz|far|tar\.bz2|jar|whl|war|msi|exe|rar|rpm|nar|xml|vsix))([&?]action=download|/download)?$} def init # build a list of validation errors @tests = [] @fails = 0 if $NO_CHECK_LINKS $NOFOLLOW = true I 'Will not check links' elsif $ALWAYS_CHECK_LINKS I 'Will check links even if download page has errors' else I 'Will check links if download page has no errors' end I 'Will %s archive.apache.org links in checks' % ($ARCHIVE_CHECK ? 'include' : 'not include') end # save the result of a test def test(severity, txt) @tests << {severity => txt} unless severity == :I or severity == :W @fails += 1 if $FAIL_FAST puts txt caller.each {|c| puts c} exit! end end end def F(txt) test(:F, txt) end def E(txt) test(:E, txt) end def W(txt) test(:W, txt) end def I(txt) test(:I, txt) end # extract test entries with key k def tests(k) @tests.map {|t| t[k]}.compact end # extract test entries with key k def testentries(k) @tests.select {|t| t[k]}.compact end def showList(list, header) unless list.empty? _h2_ header _ul do list.each { |item| _li item } end end end def displayHTML fatals = tests(:F) errors = tests(:E) warns = tests(:W) if !fatals.empty? _h2_.bg_danger "The page at #{@url} failed our checks:" elsif !errors.empty? _h2_.bg_warning "The page at #{@url} has some problems:" elsif !warns.empty? _h2_.bg_warning "The page at #{@url} has some minor issues" else _h2_.bg_success "The page at #{@url} looks OK, thanks for using this service" end if @fails > 0 showList(fatals, 'Fatal errors:') showList(errors, 'Errors:') end showList(warns, 'Warnings:') _h2_ 'Tests performed' _ol do @tests.each { |t| t.map {|k, v| _li "#{k}: - #{v}"}} end _h4_ 'F: fatal, E: Error, W: warning, I: info (success)' end def check_url(url) uri = URI.parse(url) unless uri.scheme W "No scheme for URL #{url}, assuming http" uri = URI.parse('http:' + url) end return uri if %w{http https}.include? uri.scheme raise ArgumentError.new("Unexpected url: #{url}") end # Return uri, code|nil, response|error def fetch_url(url, method=:head, depth=0, followRedirects=true) # string input uri = URI.parse(url) begin Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == 'https') do |https| case method when :head request = Net::HTTP::Head.new(uri.request_uri) when :get request = Net::HTTP::Get.new(uri.request_uri) else raise "Invalid method #{method}" end response = https.request(request) if followRedirects and response.code =~ /^3\d\d/ return uri, nil, "Too many redirects: #{depth} > 3" if depth > 3 fetch_url response['location'], method, depth + 1 # string else return uri, response.code, response end end rescue Exception => e return uri, nil, e end end # Head an HTTP URL => uri, code, response def HEAD(url) puts ">> HEAD #{url}" if $VERBOSE fetch_url(url, :head) end # get an HTTP URL => response def GET(url, followRedirects=true) puts ">> GET #{url}" if $VERBOSE fetch_url(url, :get, 0, followRedirects)[2] end # Check page exists => uri, code, response|nil def check_head_3(path, severity = :E, log=true) uri, code, response = HEAD(path) if code == '403' # someone does not like Whimsy? W "HEAD #{path} - HTTP status: #{code} - retry" uri, code, response = HEAD(path) end unless code == '200' test(severity, "HEAD #{path} - HTTP status: #{code}") unless severity.nil? return uri, code, nil end I "Checked HEAD #{path} - OK (#{code})" if log return uri, code, response end # Check page exists => response or nil def check_head(path, severity = :E, log=true) check_head_3(path, severity, log)[2] end # check page can be read => body or response or nil def check_page(path, severity=:E, log=true, returnRes=false, followRedirects=true) response = GET(path, followRedirects) code = response.code || '?' unless code == '200' or (!followRedirects and code =~ /^3\d\d/) test(severity, "GET #{path} - HTTP status: #{code}") unless severity.nil? return nil end I "Checked GET #{path} - OK (#{code})" if log puts "Fetched #{path} - OK (#{code})" if $CLI return returnRes ? response : response.body end def WE(msg) if $ALLOW_HTTP W msg else E msg end end # returns www|archive, stem and the hash extension def check_hash_loc(h, tlp) tlpQE = Regexp.escape(tlp) # in case of meta-chars tlpQE = "(?:ooo|#{tlpQE})" if tlp == 'openoffice' tlpQE = "(?:lucene|#{tlpQE})" if tlp == 'solr' # temporary override tlpQE = '(?:tubemq|inlong)' if tlp == 'inlong' # renamed tlpQE = '(?:hadoop/)?ozone' if tlp == 'ozone' # moved if h =~ %r{^(https?)://(?:(archive|www)\.)?apache\.org/dist/(?:incubator/)?#{tlpQE}/.*?([^/]+)\.(\w{3,6})$} WE "HTTPS! #{h}" unless $1 == 'https' return $2 || '', $3, $4 # allow for no host before apache.org # Allow // after .org (pulsar) elsif h =~ %r{^(https?)://(downloads)\.apache\.org//?(?:incubator/)?#{tlpQE}/.*?([^/]+)\.(\w{3,6})$} WE "HTTPS! #{h}" unless $1 == 'https' return $2, $3, $4 # https://repo1.maven.org/maven2/org/apache/shiro/shiro-spring/1.1.0/shiro-spring-1.1.0.jar.asc elsif h =~ %r{^(https?)://repo1?\.(maven)(?:\.apache)?\.org/maven2/org/apache/#{tlpQE}/.+/([^/]+\.(?:jar|xml))\.(\w{3,6})$} # Maven WE "HTTPS! #{h}" unless $1 == 'https' W "Unexpected hash location #{h} for #{tlp}" unless ($vercheck[$3][0] rescue '') == 'maven' return $2, $3, $4 else if h =~ %r{-bin-} W "Unexpected bin hash location #{h} for #{tlp}" else E "Unexpected hash location #{h} for #{tlp}" end nil end end # get the https? links as Array of [href, text] def get_links(path, body, checkSpaces=false) doc = Nokogiri::HTML(body) nodeset = doc.css('a[href]') # Get anchors w href attribute via css nodeset.map { |node| tmp = node.attribute('href').to_s href = tmp.strip if checkSpaces && tmp != href W "Spurious space(s) in '#{tmp}'" end if href =~ %r{^?Preferred=https?://} href = path + URI.decode_www_form_component(href) end # Strip spurious text from link (age, baremaps) text = node.text.gsub(/[[:space:]]+/, ' ').sub('(opens in a new tab)', '').sub('➚', '').strip [href, text] unless href =~ %r{/httpcomponents.+/xdoc/downloads.xml} # breadcrumb link to source }.select {|x, _y| x =~ %r{^(https?:)?//} } end VERIFY_TEXT = [ 'the integrity of the downloaded files', 'Verify Authenticity of Downloads', 'verify the integrity', # commons has this as a link; perhaps try converting page to text only? 'verify that checksums and signatures are correct', '#verifying-signature', 'check that the download has completed OK', 'You should verify your download', 'downloads can be verified', 'www.apache.org/info/verification', 'www.apache.org/dyn/closer.cgi#verify', 'verify your mirrored downloads', 'verify your downloads', 'verify the downloaded files', 'All downloads should be verified', 'verification instructions', ' encouraged to verify ', 'To check a GPG signature', 'To verify Hadoop', 'Instructions for verifying your mirrored downloads', # fineract 'How to verify the download?', # OOO ] ALIASES = { 'sig' => 'asc', 'pgp' => 'asc', 'gpg' => 'asc', 'pgpasc' => 'asc', 'sign' => 'asc', 'signature' => 'asc', 'signature(.asc)' => 'asc', 'ascsignature' => 'asc', 'pgpsignature' => 'asc', 'pgpsignatures' => 'asc', 'gpgsignature' => 'asc', 'openpgpsignature' => 'asc', } # Need to be able to check if download is for a PMC or podling # parameter is the website URL # Also want to convert site to TLP URL2TLP = {} # URL host to TLP conversion URL2TLP['jspwiki-wiki'] = 'jspwiki' # https://jspwiki-wiki.apache.org/Wiki.jsp?page=Downloads URL2TLP['xmlbeans'] = 'poi' # xmlbeans now being maintained by POI PMCS = Set.new # is this a TLP? ASF::Committee.pmcs.map do |p| name = p.name PMCS << name if p.site site = p.site[%r{//(.+?)\.apache\.org}, 1] URL2TLP[site] = name unless site == name else Wunderbar.warn "PMC has no site: #{name}" end end # Convert text reference to extension # e.g. SHA256 => sha256; [SIG] => asc def text2ext(txt) # need to strip twice to handle ' [ asc ] ' # TODO: perhaps just remove all white-space? tmp = txt.downcase.strip.sub(%r{^\.}, '').sub(%r{^\[(.+)\]$}, '\1').sub('-', ''). sub(/ ?(digest|checksum)/, '').sub(/ $tar\.gz$| $zip$| /, ''). sub('(opens new window)', ''). # doris strip return 'sha256' if tmp =~ %r{\A[A-Fa-f0-9]{64}\z} return 'sha512' if tmp =~ %r{\A[A-Fa-f0-9]{128}\z} ALIASES[tmp] || tmp end # Suite: perform all the HTTP checks def checkDownloadPage(path, tlp, version) begin _checkDownloadPage(path.strip, tlp, version) rescue Exception => e F e if $CLI p e puts e.backtrace end end end def _checkDownloadPage(path, tlp, version) isTLP = PMCS.include? tlp if version == '' I "Checking #{path} [#{tlp}] TLP #{isTLP} ..." else I "Checking #{path} [#{tlp}] TLP #{isTLP} for version #{version} only ..." end # check the main body if $ALLOW_JS body = `/srv/whimsy/tools/render-page.js #{path}` else body = check_page(path) end return unless body hasDisclaimer = body.gsub(%r{\s+}, ' ').include? 'Incubation is required of all newly accepted' if isTLP W "#{tlp} has Incubator disclaimer" if hasDisclaimer elsif hasDisclaimer I "#{tlp} has Incubator disclaimer" else E "#{tlp} does not have Incubator disclaimer" end # Some pages are mainly a single line (e.g. Hop) # This make matching the appropriate match context tricky without traversing the DOM if body =~ %r{nightly|snapshot}i # scan can be expensive, so skip if unneeded body.scan(%r{(^.*?([^<>]+?(nightly|snapshot)[^<>]+?)).*$}i) do |m| m.each do |n| if n.size < 160 if n =~ %r{API |/api/|-docs-} # snapshot docs Datasketches (Flink)? W "Found reference to NIGHTLY or SNAPSHOT docs?: #{n}" else # ignore trafficcontrol bugfix message unless n.include? 'Fixed TO log warnings when generating snapshots' or n.include? 'Kafka Raft support for snapshots' or n.include? 'zkSnapshotC' or # ZooKeepeer n.include? '/issues.apache.org/jira/browse/' # Daffodil W "Found reference to NIGHTLY or SNAPSHOT builds: #{n}" end end break end end end end if body.include? 'dist.apache.org' E 'Page must not link to dist.apache.org' else I 'Page does not reference dist.apache.org' end if body.include? 'repository.apache.org' E 'Page must not link to repository.apache.org' else I 'Page does not reference repository.apache.org' end deprecated = Time.parse('2018-01-01') links = get_links(path, body, true) if links.size < 6 # source+binary * archive+sig+hash E "Page does not have enough links: #{links.size} < 6 -- perhaps it needs JavaScript?" end if $CLI puts 'Checking link syntax' links.each do |h, t| if h =~ %r{^([a-z]{3,6})://} W 'scheme? %s %s' % [h, t] unless %w(http https).include? $1 else W 'syntax? %s %s' % [h, t] unless h.start_with? '//' end end end if $SHOW_LINKS links.each {|l| p l} end tlpQE = Regexp.escape(tlp) # in case of meta-chars tlpQE = "(?:lucene|#{tlpQE})" if tlp == 'solr' # temporary override # check KEYS link # TODO: is location used by hc allowed, e.g. # https://www.apache.org/dist/httpcomponents/httpclient/KEYS expurl = "https://[downloads.|www.]apache.org/[dist/][incubator/]#{tlp}/KEYS" expurlre = %r{^https://((www\.)?apache\.org/dist|downloads\.apache\.org)/(incubator/)?#{tlpQE}/KEYS$} keys = links.select {|h, _v| h =~ expurlre} if keys.size >= 1 keyurl = keys.first.first keytext = keys.first[1] if keytext.include? 'KEYS' I 'Found KEYS link' else W "Found KEYS: '#{keytext}'" end check_head(keyurl, :E) # log else keys = links.select {|h, v| h.end_with? 'KEYS' || v.strip == 'KEYS' || v == 'KEYS file' || v == '[KEYS]'} if keys.size >= 1 I 'Found KEYS link' keyurl = keys.first.first if keyurl =~ expurlre I "KEYS links to #{expurl} as expected" elsif keyurl =~ %r{^https://www\.apache\.org/dist/#{tlpQE}/[^/]+/KEYS$} W "KEYS: expected: #{expurl}\n actual: #{keyurl}" elsif keyurl =~ %r{^https://downloads\.apache\.org/#{tlpQE}/[^/]+/KEYS$} W "KEYS: expected: #{expurl}\n actual: #{keyurl}" else E "KEYS: expected: #{expurl}\n actual: #{keyurl}" end check_head(keyurl, :E) # log else E 'Could not find KEYS link' end end hasGPGverify = false # Check if GPG verify has two parameters body.scan(%r{gpg --verify.+$}) { |m| hasGPGverify = true # Hack to tidy matched text: drop spans and truncate at <br> or <div> m = m.gsub(%r{<span [^>]+>|</span>}, '').sub(%r{(<div|<br).+},'') # sub! returns nil if no change unless m =~ %r{gpg --verify\s+\S+\.asc\s+\S+} W "gpg verify should specify second param: #{m.strip} see:\nhttps://www.apache.org/info/verification.html#specify_both" end } # Look for incorrect gpg qualifiers body.scan(%r{(gpg[[:space:]]+(.+?)(?:import|verify))}) { |m| pfx = m[1] unless pfx.sub(%r{<span[^>]*>}, '') == '--' W "gpg requires -- before qualifiers, not #{pfx.inspect}: #{m[0].strip}" end } # check for verify instructions bodytext = body.gsub(/\s+/, ' ') # single line if VERIFY_TEXT.any? {|text| bodytext.include? text} I 'Found reference to download verification' elsif hasGPGverify W 'Found reference to GPG verify; assuming this is part of download verification statement' else E 'Could not find statement of the need to verify downloads' end # check if page refers to md5sum body.scan(%r{^.+md5sum.+$}) {|m| W "Found md5sum: #{m.strip}" } links.each do |h, t| # These might also be direct links to mirrors if h =~ ARTIFACT_RE base = File.basename($1) # puts "base: " + base if $vercheck[base] # might be two links to same archive W "Already seen link for #{base}" else ext = $2 # save for use after RE match $vercheck[base] = [h =~ %r{^https?://archive.apache.org/} ? 'archive' : (h =~ %r{https?://repo\d?\.maven(\.apache)?\.org/} ? 'maven' : 'live')] unless $vercheck[base].first == 'archive' stem = base[0..-(ext.size + 2)] # version must include '.', e.g. xxx-m.n.oyyy # Apache_OpenOffice-SDK_4.1.10_Linux_x86-64_install-deb_en-US if stem =~ %r{^.+?[-_]v?(\d+(?:\.\d+)+)(.*)$} # $1 = version # $2 any suffix, e.g. -bin, -src (or other) ver = $1 # main version suff = $2 # does version have a suffix such as beta1, M3 etc? # jmeter needs _ here; brpc uses rc02 if suff =~ %r{^(-RC\d+|-rc\d+|-incubating|-ALPHA|[-.]?M\d+|[-~]?(alpha|beta)\d?(?:-\d)?)} ver += $1 end $versions[ver][stem] << ext elsif stem =~ %r{netbeans-(\d+)-}i $versions[$1][stem] << ext else W "Cannot parse #{stem} for version" end end end # Text must include a '.' (So we don't check 'Source') if t.include?('.') and base != File.basename(t.sub(/[Mm]irrors? for /, '').strip) # text might be short version of link tmp = t.strip.sub(%r{.*/}, '') # if base == tmp W "Mismatch?: #{h} and '#{t}'" elsif base.end_with? tmp W "Mismatch?: #{h} and '#{tmp}'" elsif base.sub(/-bin\.|-src\./, '.').end_with? tmp W "Mismatch?: #{h} and '#{tmp}'" else W "Mismatch2: #{h}\n link: '#{base}'\n text: '#{tmp}'" end end end end links.each do |h, t| # Must occur before mirror check below # match all hashes and sigs here (invalid locations are detected later) if h =~ %r{^https?://.+?/([^/]+\.(asc|sha\d+|md5|sha|mds))$} base = File.basename($1) ext = $2 stem = base[0..-(2 + ext.length)] if $vercheck[stem] $vercheck[stem] << ext else E "Bug: found hash #{h} for missing artifact #{stem}" end next if t == '' # empire-db tmp = text2ext(t) next if ext == tmp # i.e. link is just the type or [TYPE] next if ext == 'sha' and tmp == 'sha1' # historic next if %w(sha256 md5 mds sha512 sha1).include?(ext) and %w(SHA digest Digest CheckSum checksums).include?(t) # generic next if ext == 'mds' and (tmp == 'hashes' or t == 'Digests') unless base == t or h == t # Allow for full path to sig/hash if t == 'Download' # MXNet W "Mismatch: #{h} and '#{t}'" elsif not %w{checksum Hash}.include? t if h =~ %r{^https?://archive\.apache\.org/dist/} # only warn for archives W "Mismatch: #{h} and '#{t}'" else E "Mismatch: #{h} and '#{t}'" end end end end end # did we find all required elements? $vercheck.each do |k, w| v = w.dup typ = v.shift unless v.include? 'asc' and v.any? {|e| e =~ /^sha\d+$/ or e == 'md5' or e == 'sha' or e == 'mds'} if typ == 'live' E "#{k} missing sig/hash: (found only: #{v.inspect})" elsif typ == 'archive' || typ == 'maven' # Maven does not include recent hash types; so warn only W "#{k} missing sig/hash: (found only: #{v.inspect})" else E "#{k} missing sig/hash: (found only: #{v.inspect}) TYPE=#{typ}" end end W "#{k} Prefer SHA* over MDS #{v.inspect}" if typ == 'live' && v.include?('mds') && v.none? {|e| e =~ /^sha\d+$/} end if @fails > 0 and not $ALWAYS_CHECK_LINKS W '** Not checking links **' $NOFOLLOW = true end # Still check links if versions not seen if $versions.size == 0 E 'Could not detect any artifact versions -- perhaps it needs JavaScript?' end # Check if the links can be read links.each do |h, t| if h =~ %r{\.(asc|sha256|sha512)$} host, _stem, _ext = check_hash_loc(h, tlp) if host == 'archive' if $ARCHIVE_CHECK check_head(h, :E) # log else I "Ignoring archived hash #{h}" end elsif host if $NOFOLLOW I "Skipping artifact hash #{h}" else uri, _code, _response = check_head_3(h, :E) # log unless uri.to_s == h h1 = h.sub(%r{//(www\.)?apache\.org/dist/}, '//downloads.apache.org/') unless uri.to_s == h1 W "Redirected hash: #{h} => #{uri}" end end end else # will have been reported by check_hash_loc end elsif h =~ ARTIFACT_RE name = $1 _ext = $2 if h =~ %r{https?://archive\.apache\.org/} unless $ARCHIVE_CHECK I "Ignoring archived artifact #{h}" next end end # Ideally would like to check for use of closer.lua/.cgi, but some projects pre-populate the pages # TODO: would it help to check host against mirrors.list? if h =~ %r{https?://(www\.)?apache\.org/dist} or h =~ %r{https?://downloads.apache.org/} E "Must use mirror system #{h}" next elsif h =~ %r{https?://repo\d\.maven\.org/.+(-src|-source)} E "Must use mirror system for source #{h}" next end if $NOFOLLOW I "Skipping artifact #{h}" next end res = check_head(h, :E, false) # nolog next unless res # if HEAD returns content_type and length it's probably a direct link ct = res.content_type cl = res.content_length if ct and cl I "#{h} OK: #{ct} #{cl}" else # need to try to download the mirror page path = nil # action=download needs special handling if h =~ %r{^https?://(www\.)?apache\.org/dyn/.*action=download} res = check_page(h, :E, false, true, false) next unless res unless res.code =~ /^3\d\d$/ E "Expected redirect, got #{res.code}" next end path = res['Location'] or E("Could not extract Location from #{h}") else bdy = check_page(h, :E, false) if bdy lks = get_links(path, bdy) lks.each do |l, _t| # Don't want to match archive server (closer.cgi defaults to it if file is not found) if l.end_with?(name) and l !~ %r{//archive\.apache\.org/} path = l break end end if bdy.include? 'The object is in our archive' W "File is archived: '#{name}' in page: '#{h}'" next end end end if path res = check_head(path, :E, false) # nolog next unless res ct = res.content_type cl = res.content_length if ct and cl I "OK: #{ct} #{cl} #{path}" elsif cl I "NAK: ct='#{ct}' cl='#{cl}' #{path}" else E "NAK: ct='#{ct}' cl='#{cl}' #{path}" end else E "Could not find link for '#{name}' in page: '#{h}' (missing)" end end elsif h =~ %r{\.(md5|sha\d*)$} host, stem, _ext = check_hash_loc(h, tlp) if $NOFOLLOW I "Skipping deprecated hash #{h}" next end if %w{www downloads archive maven}.include?(host) or host == '' next unless $ARCHIVE_CHECK or host != 'archive' res = check_head(h, :E, false) # nolog next unless res lastmod = res['last-modified'] date = Time.parse(lastmod) # Check if older than 2018? if date < deprecated I "Deprecated hash found #{h} #{t}; however #{lastmod} is older than #{deprecated}" # OK else unless host == 'maven' and stem.end_with? '.jar' # Maven has yet to be upgraded... W "Deprecated hash found #{h} #{t} - do not use for current releases #{lastmod}" end end else E "Unhandled host: #{host} in #{h}" end elsif h =~ %r{/KEYS$} or t == 'KEYS' # already handled elsif h =~ %r{^https?://www\.apache\.org/?(licenses/.*|foundation/.*|events/.*)?$} # standard links elsif h =~ %r{https?://people.apache.org/phonebook.html} elsif h.start_with? 'https://cwiki.apache.org/confluence/' # Wiki elsif h.start_with? 'https://wiki.apache.org/' # Wiki elsif h.start_with? 'https://svn.apache.org/' # E "Public download pages should not link to unreleased code: #{h}" # could be a sidebar/header link elsif h =~ %r{^https?://(archive|www)\.apache\.org/dist/} W "Not yet handled #{h} #{t}" unless h =~ /RELEASE[-_]NOTES/ or h =~ %r{^https?://archive.apache.org/dist/#{tlpQE}/} else # Ignore everything else? end end end def getTLP(url) # convert URL to TLP/podling if url =~ %r{^https?://cwiki\.apache\.org/confluence/display/(\S+)/} tlp = $1.downcase elsif url =~ %r{^https?://([^.]+)(\.incubator|\.us|\.eu)?\.apache\.org/} tlp = URL2TLP[$1] || $1 elsif url =~ %r{^https?://([^.]+)\.openoffice\.org/} tlp = 'openoffice' else tlp = nil F "Unknown TLP for URL #{url}" end tlp end # Called by GUI when POST is pushed def doPost(options) $ALWAYS_CHECK_LINKS = options[:checklinks] $NO_CHECK_LINKS = options[:nochecklinks] $ARCHIVE_CHECK = options[:archivecheck] init url = options[:url] tlp = options[:tlp] tlp = getTLP(url) if tlp == '' if tlp checkDownloadPage(url, tlp, options[:version]) end displayHTML end if __FILE__ == $0 $CLI = true $VERBOSE = true $ALWAYS_CHECK_LINKS = ARGV.delete '--always' $NO_CHECK_LINKS = ARGV.delete '--nolinks' $ARCHIVE_CHECK = ARGV.delete '--archivecheck' $ALLOW_HTTP = ARGV.delete '--http' $FAIL_FAST = ARGV.delete '--ff' $SHOW_LINKS = ARGV.delete '--show-links' $ALLOW_JS = ARGV.delete '--js-allow' # check for any unhandled options ARGV.each do |arg| if arg.start_with? '--' raise ArgumentError.new("Invalid option #{arg}; expecting always|nolinks|archivecheck|http|ff|show-links") end end init version = '' url = ARGV[0] if ARGV.size == 1 tlp = getTLP(url) else tlp = ARGV[1] version = ARGV[2] || '' end checkDownloadPage(url, tlp, version) # display the test results as text puts '' puts '=================' puts '' @tests.each { |t| t.map {|k, v| puts "#{k}: - #{v}"}} puts '' testentries(:W).each { |t| t.map {|k, v| puts "#{k}: - #{v}"}} testentries(:E).each { |t| t.map {|k, v| puts "#{k}: - #{v}"}} testentries(:F).each { |t| t.map {|k, v| puts "#{k}: - #{v}"}} puts '' # Only show in CLI version for now puts 'Version summary' $versions.sort.each do |k, v| puts k v.sort.each do |l, w| puts " #{l} #{w}" end end puts '' if @fails > 0 puts "NAK: #{url} had #{@fails} errors" else puts "OK: #{url} passed all the tests" end puts '' end

tools/download_check.rb (693 lines of code) (raw):