in tools/download_check.rb [371:775]
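# Check a single download page: Incubator disclaimer, KEYS link, verification
# instructions, artifact links and their signatures/hashes, and mirror usage.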
def _checkDownloadPage(path, tlp, version)
isTLP = PMCS.include? tlp
if version == ''
I "Checking #{path} [#{tlp}] TLP #{isTLP} ..."
else
I "Checking #{path} [#{tlp}] TLP #{isTLP} for version #{version} only ..."
end
if $ALLOW_JS
body = `/srv/whimsy/tools/render-page.js #{path}` # render with JavaScript enabled (assumes the script takes the page URL)
else
body = check_page(path)
end
return unless body
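# Podlings must carry the Incubator disclaimer; TLPs must not.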
hasDisclaimer = body.gsub(%r{\s+}, ' ').include? 'Incubation is required of all newly accepted'
if isTLP
W "#{tlp} has Incubator disclaimer" if hasDisclaimer
elsif hasDisclaimer
I "#{tlp} has Incubator disclaimer"
else
E "#{tlp} does not have Incubator disclaimer"
end
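# Release pages must not point at nightly or snapshot builds.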
if body =~ %r{nightly|snapshot}i
body.scan(%r{(^.*?([^<>]+?(nightly|snapshot)[^<>]+?)).*$}i) do |m|
m.each do |n|
if n.size < 160
if n =~ %r{API |/api/|-docs-}
W "Found reference to NIGHTLY or SNAPSHOT docs?: #{n}"
else
unless n.include? 'Fixed TO log warnings when generating snapshots' or
n.include? 'Kafka Raft support for snapshots' or
n.include? 'zkSnapshotC' or
n.include? '/issues.apache.org/jira/browse/'
W "Found reference to NIGHTLY or SNAPSHOT builds: #{n}"
end
end
break
end
end
end
end
if body.include? 'dist.apache.org'
E 'Page must not link to dist.apache.org'
else
I 'Page does not reference dist.apache.org'
end
if body.include? 'repository.apache.org'
E 'Page must not link to repository.apache.org'
else
I 'Page does not reference repository.apache.org'
end
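# md5/sha1 hashes are deprecated; they are only tolerated for files last modified before this date.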
deprecated = Time.parse('2018-01-01')
links = get_links(path, body, true)
if links.size < 6
E "Page does not have enough links: #{links.size} < 6 -- perhaps it needs JavaScript?"
end
if $CLI
puts 'Checking link syntax'
links.each do |h, t|
if h =~ %r{^([a-z]{3,6})://}
W 'scheme? %s %s' % [h, t] unless %w(http https).include? $1
else
W 'syntax? %s %s' % [h, t] unless h.start_with? '//'
end
end
end
if $SHOW_LINKS
links.each {|l| p l}
end
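# The page must link to the project's KEYS file at its canonical location.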
tlpQE = Regexp.escape(tlp)
tlpQE = "(?:lucene|#{tlpQE})" if tlp == 'solr'
expurl = "https://[downloads.|www.]apache.org/[dist/][incubator/]#{tlp}/KEYS"
expurlre = %r{^https://((www\.)?apache\.org/dist|downloads\.apache\.org)/(incubator/)?#{tlpQE}/KEYS$}
keys = links.select {|h, _v| h =~ expurlre}
if keys.size >= 1
keyurl = keys.first.first
keytext = keys.first[1]
if keytext.include? 'KEYS'
I 'Found KEYS link'
else
W "Found KEYS: '#{keytext}'"
end
check_head(keyurl, :E)
else
keys = links.select {|h, v| h.end_with?('KEYS') || v.strip == 'KEYS' || v == 'KEYS file' || v == '[KEYS]'}
if keys.size >= 1
I 'Found KEYS link'
keyurl = keys.first.first
if keyurl =~ expurlre
I "KEYS links to #{expurl} as expected"
elsif keyurl =~ %r{^https://www\.apache\.org/dist/}
W "KEYS: expected: #{expurl}\n actual: #{keyurl}"
elsif keyurl =~ %r{^https://downloads\.apache\.org/}
W "KEYS: expected: #{expurl}\n actual: #{keyurl}"
else
E "KEYS: expected: #{expurl}\n actual: #{keyurl}"
end
check_head(keyurl, :E)
else
E 'Could not find KEYS link'
end
end
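# Any 'gpg --verify' examples should name both the .asc file and the artifact being verified.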
hasGPGverify = false
body.scan(%r{gpg --verify.+$}) { |m|
hasGPGverify = true
m = m.gsub(%r{<span [^>]+>|</span>}, '').sub(%r{(<div|<br).+},'')
unless m =~ %r{gpg --verify\s+\S+\.asc\s+\S+}
W "gpg verify should specify second param: #{m.strip} see:\nhttps://www.apache.org/info/verification.html#specify_both"
end
}
body.scan(%r{(gpg[[:space:]]+(.+?)(?:import|verify))}) { |m|
pfx = m[1]
unless pfx.sub(%r{<span[^>]*>}, '') == '--'
W "gpg requires -- before qualifiers, not #{pfx.inspect}: #{m[0].strip}"
end
}
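# The page must explain how to verify downloads (signatures and hashes).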
bodytext = body.gsub(/\s+/, ' ')
if VERIFY_TEXT.any? {|text| bodytext.include? text}
I 'Found reference to download verification'
elsif hasGPGverify
W 'Found reference to GPG verify; assuming this is part of download verification statement'
else
E 'Could not find statement of the need to verify downloads'
end
body.scan(%r{^.+md5sum.+$}) {|m|
W "Found md5sum: #{m.strip}"
}
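# Collect artifact links, noting whether they are live, archived or on Maven, and which versions they cover.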
links.each do |h, t|
if h =~ ARTIFACT_RE
base = File.basename($1)
if $vercheck[base]
W "Already seen link for #{base}"
else
ext = $2
$vercheck[base] = [h =~ %r{^https?://archive.apache.org/} ? 'archive' : (h =~ %r{https?://repo\d?\.maven(\.apache)?\.org/} ? 'maven' : 'live')]
unless $vercheck[base].first == 'archive'
stem = base[0..-(ext.size + 2)]
if stem =~ %r{^.+?[-_]v?(\d+(?:\.\d+)+)(.*)$}
ver = $1
suff = $2
if suff =~ %r{^(-RC\d+|-rc\d+|-incubating|-ALPHA|[-.]?M\d+|[-~]?(alpha|beta)\d?(?:-\d)?)}
ver += $1
end
$versions[ver][stem] << ext
elsif stem =~ %r{netbeans-(\d+)-}i
$versions[$1][stem] << ext
else
W "Cannot parse #{stem} for version"
end
end
end
if t.include?('.') and base != File.basename(t.sub(/[Mm]irrors? for /, '').strip)
tmp = t.strip.sub(%r{.*/}, '')
if base == tmp
W "Mismatch?: #{h} and '#{t}'"
elsif base.end_with? tmp
W "Mismatch?: #{h} and '#{tmp}'"
elsif base.sub(/-bin\.|-src\./, '.').end_with? tmp
W "Mismatch?: #{h} and '#{tmp}'"
else
W "Mismatch2: #{h}\n link: '#{base}'\n text: '#{tmp}'"
end
end
end
end
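# Match signature and hash links back to the artifacts collected above.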
links.each do |h, t|
if h =~ %r{^https?://.+?/([^/]+\.(asc|sha\d+|md5|sha|mds))$}
base = File.basename($1)
ext = $2
stem = base[0..-(2 + ext.length)]
if $vercheck[stem]
$vercheck[stem] << ext
else
E "Bug: found hash #{h} for missing artifact #{stem}"
end
next if t == ''
tmp = text2ext(t)
next if ext == tmp
next if ext == 'sha' and tmp == 'sha1'
next if %w(sha256 md5 mds sha512 sha1).include?(ext) and %w(SHA digest Digest CheckSum checksums).include?(t)
next if ext == 'mds' and (tmp == 'hashes' or t == 'Digests')
unless base == t or h == t
if t == 'Download'
W "Mismatch: #{h} and '#{t}'"
elsif not %w{checksum Hash}.include? t
if h =~ %r{^https?://archive\.apache\.org/dist/}
W "Mismatch: #{h} and '#{t}'"
else
E "Mismatch: #{h} and '#{t}'"
end
end
end
end
end
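# Every artifact should have a signature (.asc) plus at least one hash; archived or Maven copies only warrant a warning.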
$vercheck.each do |k, w|
v = w.dup
typ = v.shift
unless v.include? 'asc' and v.any? {|e| e =~ /^sha\d+$/ or e == 'md5' or e == 'sha' or e == 'mds'}
if typ == 'live'
E "#{k} missing sig/hash: (found only: #{v.inspect})"
elsif typ == 'archive' || typ == 'maven'
W "#{k} missing sig/hash: (found only: #{v.inspect})"
else
E "#{k} missing sig/hash: (found only: #{v.inspect}) TYPE=#{typ}"
end
end
W "#{k} Prefer SHA* over MDS #{v.inspect}" if typ == 'live' && v.include?('mds') && v.none? {|e| e =~ /^sha\d+$/}
end
if @fails > 0 and not $ALWAYS_CHECK_LINKS
W '** Not checking links **'
$NOFOLLOW = true
end
if $versions.size == 0
E 'Could not detect any artifact versions -- perhaps it needs JavaScript?'
end
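# Follow the links: hashes get HEAD checks, artifacts must be served via the mirror system.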
links.each do |h, t|
if h =~ %r{\.(asc|sha256|sha512)$}
host, _stem, _ext = check_hash_loc(h, tlp)
if host == 'archive'
if $ARCHIVE_CHECK
check_head(h, :E)
else
I "Ignoring archived hash #{h}"
end
elsif host
if $NOFOLLOW
I "Skipping artifact hash #{h}"
else
uri, _code, _response = check_head_3(h, :E)
unless uri.to_s == h
h1 = h.sub(%r{//(www\.)?apache\.org/dist/}, '//downloads.apache.org/')
unless uri.to_s == h1
W "Redirected hash: #{h} => #{uri}"
end
end
end
else
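# Unrecognised hash host; check_hash_loc is assumed to have reported the problem already.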
end
elsif h =~ ARTIFACT_RE
name = $1
_ext = $2
if h =~ %r{https?://archive\.apache\.org/}
unless $ARCHIVE_CHECK
I "Ignoring archived artifact #{h}"
next
end
end
if h =~ %r{https?://(www\.)?apache\.org/dist} or h =~ %r{https?://downloads.apache.org/}
E "Must use mirror system #{h}"
next
elsif h =~ %r{https?://repo\d\.maven\.org/.+(-src|-source)}
E "Must use mirror system for source #{h}"
next
end
if $NOFOLLOW
I "Skipping artifact #{h}"
next
end
res = check_head(h, :E, false)
next unless res
ct = res.content_type
cl = res.content_length
if ct and cl
I "#{h} OK: #{ct} #{cl}"
else
path = nil
if h =~ %r{^https?://(www\.)?apache\.org/dyn/.*action=download}
res = check_page(h, :E, false, true, false)
next unless res
unless res.code =~ /^3\d\d$/
E "Expected redirect, got #{res.code}"
next
end
path = res['Location'] or E("Could not extract Location from #{h}")
else
bdy = check_page(h, :E, false)
if bdy
lks = get_links(path, bdy)
lks.each do |l, _t|
if l.end_with?(name) and l !~ %r{//archive\.apache\.org/}
path = l
break
end
end
if bdy.include? 'The object is in our archive'
W "File is archived: '#{name}' in page: '#{h}'"
next
end
end
end
if path
res = check_head(path, :E, false)
next unless res
ct = res.content_type
cl = res.content_length
if ct and cl
I "OK: #{ct} #{cl} #{path}"
elsif cl
I "NAK: ct='#{ct}' cl='#{cl}' #{path}"
else
E "NAK: ct='#{ct}' cl='#{cl}' #{path}"
end
else
E "Could not find link for '#{name}' in page: '#{h}' (missing)"
end
end
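# Deprecated hash formats (md5, sha1): tolerated only for files older than the cutoff or Maven-published jars.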
elsif h =~ %r{\.(md5|sha\d*)$}
host, stem, _ext = check_hash_loc(h, tlp)
if $NOFOLLOW
I "Skipping deprecated hash #{h}"
next
end
if %w{www downloads archive maven}.include?(host) or host == ''
next unless $ARCHIVE_CHECK or host != 'archive'
res = check_head(h, :E, false)
next unless res
lastmod = res['last-modified']
date = Time.parse(lastmod)
if date < deprecated
I "Deprecated hash found #{h} #{t}; however #{lastmod} is older than #{deprecated}"
else
unless host == 'maven' and stem.end_with? '.jar'
W "Deprecated hash found #{h} #{t} - do not use for current releases #{lastmod}"
end
end
else
E "Unhandled host: #{host} in #{h}"
end
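# KEYS links and standard ASF, wiki and svn pages need no further checking.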
elsif h =~ %r{/KEYS$} or t == 'KEYS'
elsif h =~ %r{^https?://www\.apache\.org/?(licenses/.*|foundation/.*|events/.*)?$}
elsif h =~ %r{https?://people.apache.org/phonebook.html}
elsif h.start_with? 'https://cwiki.apache.org/confluence/'
elsif h.start_with? 'https://wiki.apache.org/'
elsif h.start_with? 'https://svn.apache.org/'
elsif h =~ %r{^https?://(archive|www)\.apache\.org/dist/}
W "Not yet handled #{h} #{t}" unless h =~ /RELEASE[-_]NOTES/ or h =~ %r{^https?://archive.apache.org/dist/
else
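# Any other link is ignored.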
end
end
end