_checkDownloadPage

in tools/download_check.rb [371:775]


# Check a project download page and report each finding through the I, W and E
# reporters (defined elsewhere in this file; presumably info / warning / error).
#
# The checks run in this order: Incubator disclaimer, nightly/snapshot mentions,
# forbidden hosts, KEYS link, gpg/verification instructions, artifact links and
# their sig/hash links, and finally (unless suppressed) the link targets.
#
# Globals read:    $ALLOW_JS, $CLI, $SHOW_LINKS, $ALWAYS_CHECK_LINKS,
#                  $ARCHIVE_CHECK, $NOFOLLOW
# Globals written: $vercheck, $versions, $NOFOLLOW (set to true when @fails > 0
#                  and $ALWAYS_CHECK_LINKS is not set)
#
# @param path [String] URL of the download page to check
# @param tlp [String] project id; membership in PMCS decides whether the
#   Incubator disclaimer is expected
# @param version [String] '' for all versions; within this method it is only
#   used in the initial log message
# @return [nil] when the page body could not be obtained; otherwise the return
#   value is not meaningful
def _checkDownloadPage(path, tlp, version)
  isTLP = PMCS.include? tlp
  if version == ''
    I "Checking #{path} [#{tlp}] TLP #{isTLP} ..."
  else
    I "Checking #{path} [#{tlp}] TLP #{isTLP} for version #{version} only ..."
  end

  # Fetch the page body, via the JavaScript renderer when that is allowed
  if $ALLOW_JS
    # NOTE(review): the argument of this command was reconstructed as the page URL - confirm.
    # NOTE(review): path is interpolated into a shell command line; confirm it is trusted.
    body = `/srv/whimsy/tools/render-page.js #{path}`
  else
    body = check_page(path)
  end

  return unless body

  # Whitespace is collapsed first so the phrase matches even if wrapped in the HTML
  hasDisclaimer = body.gsub(%r{\s+}, ' ').include? 'Incubation is required of all newly accepted'

  if isTLP
    W "#{tlp} has Incubator disclaimer" if hasDisclaimer
  elsif hasDisclaimer
    I "#{tlp} has Incubator disclaimer"
  else
    E "#{tlp} does not have Incubator disclaimer"
  end

  # Look for mentions of nightly/snapshot; for each matching line, report the
  # first captured fragment that is shorter than 160 characters
  if body =~ %r{nightly|snapshot}i
    body.scan(%r{(^.*?([^<>]+?(nightly|snapshot)[^<>]+?)).*$}i) do |m|
      m.each do |n|
        if n.size < 160
          if n =~ %r{API |/api/|-docs-}
            W "Found reference to NIGHTLY or SNAPSHOT docs?: #{n}"
          else
            # Known phrases/links that mention snapshots but are not snapshot builds
            unless n.include? 'Fixed TO log warnings when generating snapshots' or
                  n.include? 'Kafka Raft support for snapshots' or
                  n.include? 'zkSnapshotC' or
                  n.include? '/issues.apache.org/jira/browse/'
              W "Found reference to NIGHTLY or SNAPSHOT builds: #{n}"
            end
          end
          break
        end
      end
    end
  end

  if body.include? 'dist.apache.org'
    E 'Page must not link to dist.apache.org'
  else
    I 'Page does not reference dist.apache.org'
  end

  if body.include? 'repository.apache.org'
    E 'Page must not link to repository.apache.org'
  else
    I 'Page does not reference repository.apache.org'
  end

  # md5/sha hash files last modified before this date are only reported as info (see below)
  deprecated = Time.parse('2018-01-01')

  links = get_links(path, body, true)
  if links.size < 6
    E "Page does not have enough links: #{links.size} < 6 -- perhaps it needs JavaScript?"
  end

  if $CLI
    puts 'Checking link syntax'
    links.each do |h, t|
      if h =~ %r{^([a-z]{3,6})://}
        W 'scheme? %s %s' % [h, t] unless %w(http https).include? $1
      else
        W 'syntax? %s %s' % [h, t] unless h.start_with? '//'
      end
    end
  end
  if $SHOW_LINKS
    links.each {|l| p l}
  end

  tlpQE = Regexp.escape(tlp) # in case of meta-chars
  tlpQE = "(?:lucene|#{tlpQE})" if tlp == 'solr' # solr is also accepted under lucene

  # Check the KEYS link
  expurl = "https://[downloads.|www.]apache.org/[dist/][incubator/]#{tlp}/KEYS"
  # NOTE(review): the tail of this pattern was reconstructed from expurl above - confirm.
  expurlre = %r{^https://((www\.)?apache\.org/dist|downloads\.apache\.org)/(incubator/)?#{tlpQE}/KEYS$}
  keys = links.select {|h, _v| h =~ expurlre}
  if keys.size >= 1
    keyurl = keys.first.first
    keytext = keys.first[1]
    if keytext.include? 'KEYS'
      I 'Found KEYS link'
    else
      W "Found KEYS: '#{keytext}'"
    end
    check_head(keyurl, :E)
  else
    # Fall back to anything that looks like a KEYS link, by URL or by link text.
    # The argument of end_with? must be parenthesised: without the parentheses
    # the whole || chain becomes the argument and the text checks never run.
    keys = links.select {|h, v| h.end_with?('KEYS') || v.strip == 'KEYS' || v == 'KEYS file' || v == '[KEYS]'}
    if keys.size >= 1
      I 'Found KEYS link'
      keyurl = keys.first.first
      # NOTE(review): the tails of the two per-directory patterns below were
      # reconstructed (KEYS one directory level below the project) - confirm.
      if keyurl =~ expurlre
        I "KEYS links to #{expurl} as expected"
      elsif keyurl =~ %r{^https://www\.apache\.org/dist/#{tlpQE}/[^/]+/KEYS$}
        W "KEYS: expected: #{expurl}\n             actual: #{keyurl}"
      elsif keyurl =~ %r{^https://downloads\.apache\.org/#{tlpQE}/[^/]+/KEYS$}
        W "KEYS: expected: #{expurl}\n             actual: #{keyurl}"
      else
        E "KEYS: expected: #{expurl}\n             actual: #{keyurl}"
      end
      check_head(keyurl, :E)
    else
      E 'Could not find KEYS link'
    end
  end

  hasGPGverify = false
  # Check that gpg --verify examples name both the signature and the artifact
  body.scan(%r{gpg --verify.+$}) { |m|
    hasGPGverify = true
    # Drop span markup and anything from a following <div or <br onwards
    m = m.gsub(%r{<span [^>]+>|</span>}, '').sub(%r{(<div|<br).+},'')
    unless m =~ %r{gpg --verify\s+\S+\.asc\s+\S+}
      W "gpg verify should specify second param: #{m.strip} see:\nhttps://www.apache.org/info/verification.html#specify_both"
    end
  }

  # Check that the gpg qualifier is introduced by exactly '--'
  body.scan(%r{(gpg[[:space:]]+(.+?)(?:import|verify))}) { |m|
    pfx = m[1]
    unless pfx.sub(%r{<span[^>]*>}, '') == '--'
      W "gpg requires -- before qualifiers, not #{pfx.inspect}: #{m[0].strip}"
    end
  }

  # Look for a statement about verifying downloads
  bodytext = body.gsub(/\s+/, ' ') # collapse whitespace so wrapped text still matches
  if VERIFY_TEXT.any? {|text| bodytext.include? text}
    I 'Found reference to download verification'
  elsif hasGPGverify
    W 'Found reference to GPG verify; assuming this is part of download verification statement'
  else
    E 'Could not find statement of the need to verify downloads'
  end

  # Report any line mentioning md5sum
  body.scan(%r{^.+md5sum.+$}) {|m|
    W "Found md5sum: #{m.strip}"
  }

  # First pass: record each artifact link in $vercheck and its version in $versions
  links.each do |h, t|
    if h =~ ARTIFACT_RE
      base = File.basename($1)

      if $vercheck[base]  # artifact already recorded
        W "Already seen link for #{base}"
      else
        ext = $2 # second capture of ARTIFACT_RE
        # First element records where the artifact is hosted; sig/hash extensions are appended later
        $vercheck[base] = [h =~ %r{^https?://archive.apache.org/} ? 'archive' : (h =~ %r{https?://repo\d?\.maven(\.apache)?\.org/} ? 'maven' : 'live')]
        unless $vercheck[base].first == 'archive'
          stem = base[0..-(ext.size + 2)] # basename without '.' + ext

          if stem =~ %r{^.+?[-_]v?(\d+(?:\.\d+)+)(.*)$}
            ver = $1 # dotted numeric version
            suff = $2
            # Keep a recognised pre-release/incubating marker as part of the version
            if suff =~ %r{^(-RC\d+|-rc\d+|-incubating|-ALPHA|[-.]?M\d+|[-~]?(alpha|beta)\d?(?:-\d)?)}
              ver += $1
            end
            $versions[ver][stem] << ext
          elsif stem =~ %r{netbeans-(\d+)-}i
            $versions[$1][stem] << ext
          else
            W "Cannot parse #{stem} for version"
          end
        end
      end
      # Compare the link text with the artifact name when the text looks like a file name
      if t.include?('.') and base != File.basename(t.sub(/[Mm]irrors? for /, '').strip)
        tmp = t.strip.sub(%r{.*/}, '') # link text with any leading path removed
        if base == tmp
          W "Mismatch?: #{h} and '#{t}'"
        elsif base.end_with? tmp
          W "Mismatch?: #{h} and '#{tmp}'"
        elsif base.sub(/-bin\.|-src\./, '.').end_with? tmp
          W "Mismatch?: #{h} and '#{tmp}'"
        else
          W "Mismatch2: #{h}\n link: '#{base}'\n text: '#{tmp}'"
        end
      end
    end
  end

  # Second pass: attach each sig/hash link to its artifact and check the link text
  links.each do |h, t|
    if h =~ %r{^https?://.+?/([^/]+\.(asc|sha\d+|md5|sha|mds))$}
      base = File.basename($1)
      ext = $2
      stem = base[0..-(2 + ext.length)] # artifact name: basename without '.' + ext
      if $vercheck[stem]
        $vercheck[stem] << ext
      else
        E "Bug: found hash #{h} for missing artifact #{stem}"
      end
      next if t == '' # nothing to compare
      tmp = text2ext(t)
      next if ext == tmp # text agrees with the extension
      next if ext == 'sha' and tmp == 'sha1'
      next if %w(sha256 md5 mds sha512 sha1).include?(ext) and %w(SHA digest Digest CheckSum checksums).include?(t) # generic text
      next if ext == 'mds' and (tmp == 'hashes' or t == 'Digests')
      unless base == t or h == t # text is the file name or the URL itself
        if t == 'Download'
          W "Mismatch: #{h} and '#{t}'"
        elsif not %w{checksum Hash}.include? t
          if h =~ %r{^https?://archive\.apache\.org/dist/} # archived links only warn
              W "Mismatch: #{h} and '#{t}'"
          else
              E "Mismatch: #{h} and '#{t}'"
          end
        end
      end
    end
  end

  # Every recorded artifact needs a signature (asc) and at least one hash
  $vercheck.each do |k, w|
    v = w.dup
    typ = v.shift # hosting type recorded in the first pass; the rest are extensions
    unless v.include? 'asc' and v.any? {|e| e =~ /^sha\d+$/ or e == 'md5' or e == 'sha' or e == 'mds'}
      if typ == 'live'
        E "#{k} missing sig/hash: (found only: #{v.inspect})"
      elsif typ == 'archive' || typ == 'maven'
        W "#{k} missing sig/hash: (found only: #{v.inspect})"
      else
        E "#{k} missing sig/hash: (found only: #{v.inspect}) TYPE=#{typ}"
      end
    end
    W "#{k} Prefer SHA* over MDS #{v.inspect}" if typ == 'live' && v.include?('mds') && v.none? {|e| e =~ /^sha\d+$/}
  end

  # Once failures have been recorded, skip following links unless explicitly requested
  if @fails > 0 and not $ALWAYS_CHECK_LINKS
    W '** Not checking links **'
    $NOFOLLOW = true
  end

  if $versions.size == 0
    E 'Could not detect any artifact versions -- perhaps it needs JavaScript?'
  end

  # Third pass: check the link targets themselves
  links.each do |h, t|
    if h =~ %r{\.(asc|sha256|sha512)$}
      host, _stem, _ext = check_hash_loc(h, tlp)
      if host == 'archive'
        if $ARCHIVE_CHECK
          check_head(h, :E)
        else
          I "Ignoring archived hash #{h}"
        end
      elsif host
        if $NOFOLLOW
          I "Skipping artifact hash #{h}"
        else
          uri, _code, _response = check_head_3(h, :E)
          unless uri.to_s == h
            # A redirect from (www.)apache.org/dist to downloads.apache.org is not reported
            h1 = h.sub(%r{//(www\.)?apache\.org/dist/}, '//downloads.apache.org/')
            unless uri.to_s == h1
              W "Redirected hash: #{h} => #{uri}"
            end
          end
        end
      else
        # no host returned by check_hash_loc: nothing further is checked here
      end
    elsif h =~ ARTIFACT_RE
      name = $1
      _ext = $2
      if h =~ %r{https?://archive\.apache\.org/}
        unless $ARCHIVE_CHECK
          I "Ignoring archived artifact #{h}"
          next
        end
      end
      # Artifacts must not link directly to the ASF hosts (nor Maven for sources)
      if h =~ %r{https?://(www\.)?apache\.org/dist} or h =~ %r{https?://downloads.apache.org/}
        E "Must use mirror system #{h}"
        next
      elsif h =~ %r{https?://repo\d\.maven\.org/.+(-src|-source)}
        E "Must use mirror system for source #{h}"
        next
      end
      if $NOFOLLOW
        I "Skipping artifact #{h}"
        next
      end
      res = check_head(h, :E, false)
      next unless res
      # A response with both content type and length is accepted as the artifact itself
      ct = res.content_type
      cl = res.content_length
      if ct and cl
        I "#{h} OK: #{ct} #{cl}"
      else # otherwise try to locate the real artifact link
        # NOTE(review): this reassigns the method parameter `path`; the nil value
        # is what get_links receives below - confirm that is intended.
        path = nil
        if h =~ %r{^https?://(www\.)?apache\.org/dyn/.*action=download}
          res = check_page(h, :E, false, true, false)
          next unless res
          unless res.code =~ /^3\d\d$/
            E "Expected redirect, got #{res.code}"
            next
          end
          path = res['Location'] or E("Could not extract Location from #{h}")
        else
          bdy = check_page(h, :E, false)
          if bdy
            lks = get_links(path, bdy)
            lks.each do |l, _t|
              # first non-archive link ending with the artifact name
              if l.end_with?(name) and l !~ %r{//archive\.apache\.org/}
                path = l
                break
              end
            end
            if bdy.include? 'The object is in our archive'
                W "File is archived: '#{name}' in page: '#{h}'"
                next
            end
          end
        end
        if path
          res = check_head(path, :E, false)
          next unless res
          ct = res.content_type
          cl = res.content_length
          if ct and cl
            I "OK: #{ct} #{cl} #{path}"
          elsif cl
            I "NAK: ct='#{ct}' cl='#{cl}' #{path}"
          else
            E "NAK: ct='#{ct}' cl='#{cl}' #{path}"
          end
        else
          E "Could not find link for '#{name}' in page: '#{h}' (missing)"
        end
      end
    elsif h =~ %r{\.(md5|sha\d*)$}
      host, stem, _ext = check_hash_loc(h, tlp)
      if $NOFOLLOW
        I "Skipping deprecated hash #{h}"
        next
      end
      if %w{www downloads archive maven}.include?(host) or host == ''
        next unless $ARCHIVE_CHECK or host != 'archive'
        res = check_head(h, :E, false)
        next unless res
        # NOTE(review): Time.parse raises if the Last-Modified header is absent - confirm it is always sent.
        lastmod = res['last-modified']
        date = Time.parse(lastmod)
        if date < deprecated
          I "Deprecated hash found #{h} #{t}; however #{lastmod} is older than #{deprecated}"
        else
          unless host == 'maven' and stem.end_with? '.jar' # no warning for Maven jar hashes
            W "Deprecated hash found #{h} #{t} - do not use for current releases #{lastmod}"
          end
        end
      else
        E "Unhandled host: #{host} in #{h}"
      end
    elsif h =~ %r{/KEYS$} or t == 'KEYS'
      # KEYS link was already checked above
    elsif h =~ %r{^https?://www\.apache\.org/?(licenses/.*|foundation/.*|events/.*)?$}
      # standard ASF links: not checked
    elsif h =~ %r{https?://people.apache.org/phonebook.html}
      # not checked
    elsif h.start_with? 'https://cwiki.apache.org/confluence/'
      # not checked
    elsif h.start_with? 'https://wiki.apache.org/'
      # not checked
    elsif h.start_with? 'https://svn.apache.org/'
      # not checked
    elsif h =~ %r{^https?://(archive|www)\.apache\.org/dist/}
      # NOTE(review): the tail of the last pattern was reconstructed (the project's archive directory) - confirm.
      W "Not yet handled #{h} #{t}" unless h =~ /RELEASE[-_]NOTES/ or h =~ %r{^https?://archive.apache.org/dist/#{tlpQE}/}
    else
      # any other link: not checked
    end
  end

end