in tools/site-scan.rb [68:260]
# Scan one project (or podling) web site and evaluate it against the ASF
# site standards: foundation/license/thanks/security/sponsorship/privacy/
# events links, trademark & copyright notices, the podling disclaimer,
# download pages, site image and externally-hosted resources.
#
# id      - project/podling id, used in log output and special-case checks
# site    - home page URL; an http: scheme is upgraded to https: in place
# name    - display name, copied into the result under :display_name
# podling - when truthy, each collected same-host subpage is also fetched
#           and checked for the incubator disclaimer text
#
# Returns a Hash keyed by check name (symbols). On DNS or fetch failure
# only :errors is populated (plus the pre-seeded nil checks) and the
# method returns early.
def parse(id, site, name, podling=false)
# Gate noisy warnings about known-bad links to the 08:00 GMT hour, so
# repeated cron scans only report them once a day.
show_anyway = Time.now.gmtime.strftime('%H') == '08'
data = {}
site.sub!(%r{^http:},'https:') # NOTE: mutates the caller's string argument
# Pre-seed every common check with nil so absent items are visible in output
SiteStandards::COMMON_CHECKS.each_key do |k|
data[k.to_sym] = nil
end
data[:display_name] = name
data[:uri] = site
uri = URI.parse(site)
# Fail fast with a readable error if the host does not resolve
begin
Socket.getaddrinfo(uri.host, uri.scheme)
rescue SocketError => se
data[:errors] = se.message
return data
end
# Fetch the page via the shared cache; uri is reassigned and may differ
# from site (presumably after redirects — see the subpage logging below)
begin
uri, response, status = $cache.get(site.to_s)
rescue IOError => ioe
data[:errors] = ioe.message
return data
end
puts "#{id} #{uri} #{status}"
# Treat HTTP 4xx/5xx responses as a scan error for this site
if response.respond_to? :code and response.code =~ /^[45]/
data[:errors] = "cache.get(#{site}) error code #{response.code}"
return data
end
doc = Nokogiri::HTML(response)
# Optionally dump the parsed document for offline debugging
if $saveparse
file = File.join('/tmp',"site-scan_#{$$}.txt")
File.write(file, doc.to_s)
$stderr.puts "Wrote parsed input to #{file}"
end
data[:uri] = uri.to_s # record the final (post-fetch) URI
subpages = Hash.new # same-host pages linked from the home page => anchor node
# First pass: walk every node, looking for event scripts and <a> links
doc.traverse do |a|
# Event pages are sometimes injected via a <script src=...> rather than a link
if a.name == 'script'
a_src = a['src'].to_s.strip
if a_src =~ SiteStandards::COMMON_CHECKS['events'][SiteStandards::CHECK_CAPTURE]
save_events data, uri + a_src
end
end
next unless a.name == 'a'
a_href = a['href'].to_s.strip
a_text = get_link_text(a)
$stderr.puts "#{a_text.inspect} #{a_href}" if $verbose
# Foundation link: prefer the wrapped image's title (or src) over link text
if a_href =~ SiteStandards::COMMON_CHECKS['foundation'][SiteStandards::CHECK_CAPTURE]
img = a.at('img')
if img
data[:foundation] = img['title'] ? squash(img['title']) : uri + img['src'].strip
else
data[:foundation] = squash(a_text)
end
end
if a_href =~ SiteStandards::COMMON_CHECKS['events'][SiteStandards::CHECK_CAPTURE]
# Skip links styled as home-page-only when we are not on the home page
save_events data, uri + a_href unless a['class'] == 'visible-home' and uri.path != '/'
end
a_text = a_text.downcase.strip
# License must match on BOTH the anchor text and the href
if (a_text =~ SiteStandards::COMMON_CHECKS['license'][SiteStandards::CHECK_TEXT]) and
(a_href =~ SiteStandards::COMMON_CHECKS['license'][SiteStandards::CHECK_CAPTURE])
begin
data[:license] = uri + a_href
rescue StandardError
data[:license] = a_href # keep the raw href when it cannot be resolved
end
end
# Remaining common checks are keyed off the anchor text alone
%w(thanks security sponsorship privacy).each do |check|
if a_text =~ SiteStandards::COMMON_CHECKS[check][SiteStandards::CHECK_CAPTURE]
begin
data[check.to_sym] = uri + a_href
rescue StandardError
data[check.to_sym] = a_href
end
end
end
# Collect same-host subpages for the later disclaimer/download checks.
# NOTE(review): the regex on the next line appears truncated in this copy
# (text was probably lost after a '#' character, e.g. an anchor/mailto
# exclusion) -- confirm against the repository before relying on it.
unless a_href =~ %r{^(
begin
if a_href =~ %r{^https?://}
site2 = URI.parse(a_href.gsub(' ','%20').gsub('|', '%7C'))
else
site2 = URI.join(site,a_href.gsub(' ','%20').gsub('|', '%7C'))
end
# Only remember links that stay on this host and are not trivial paths
if site2.host == uri.host and site2.path.size > 2
subpages[site2.to_s] = a
end
rescue StandardError => e
# Suppress a known-bad templated link except during the daily report hour
if show_anyway or !a_href.include?('fineract.gateway.scarf.sh/{version}')
$stderr.puts "#{id}: Bad a_href #{a_href} #{e}"
end
end
end
end
# Second pass: scan text nodes for trademark/copyright/disclaimer notices
doc.traverse do |node|
next unless node.is_a?(Nokogiri::XML::Text)
txt = squash(node.text)
# First trademark match wins, unless the explicit ASF wording appears later
if (txt =~ SiteStandards::COMMON_CHECKS['trademarks'][SiteStandards::CHECK_CAPTURE] and not data[:trademarks]) or
txt =~ /are trademarks of [Tt]he Apache Software/
t, p = getText(txt, node)
# Strip any leading copyright sentence so only the trademark text remains
data[:trademarks] = t.sub(/^.*?Copyright .+? Foundation[.]?/, '').strip
data[:tradeparent] = p if p
end
if txt =~ SiteStandards::COMMON_CHECKS['copyright'][SiteStandards::CHECK_CAPTURE]
t, p = getText(txt, node)
# Reduce the node text to just the copyright sentence
data[:copyright] = t.sub(/^.*?((Copyright|©) .+? Foundation[.]?).*/, '\1').strip
data[:copyparent] = p if p
end
# Incubator disclaimer (checked on every site; only reported when found)
if txt =~ SiteStandards::PODLING_CHECKS['disclaimer'][SiteStandards::CHECK_CAPTURE]
t, _p = getText(txt, node, / is an effort undergoing/)
data[:disclaimer] = t
end
end
hasdisclaimer = 0 # subpages that carry the disclaimer
nodisclaimer = [] # subpages missing it
# Podlings only: fetch each collected subpage and check for the disclaimer
subpages.each do |subpage, anchor|
if podling
begin
uri, response, status = $cache.get(subpage)
# Log redirected fetches distinctly from direct hits
if uri&.to_s == subpage or uri&.to_s == subpage + '/'
puts "#{id} #{uri} #{status}"
else
puts "#{id} #{subpage} => #{uri} #{status}"
end
unless status == 'error'
if response =~ SiteStandards::PODLING_CHECKS['disclaimer'][SiteStandards::CHECK_CAPTURE]
hasdisclaimer += 1
else
nodisclaimer << subpage
end
else
# nlpcraft is a known special case; skip its fetch-error noise
unless %w(nlpcraft).include? id
$stderr.puts "#{id} #{subpage} => #{uri} #{status} '#{anchor.text.strip}'"
end
end
rescue URI::InvalidURIError
# deliberately ignore unparseable subpage URLs
end
end
end
# Only report disclaimer stats when at least one subpage is missing one
if nodisclaimer.size > 0
data[:disclaimers] = [hasdisclaimer, nodisclaimer]
end
# Candidate download pages among the collected subpages
data[:downloads] = subpages.select{|k,_v| k =~ %r{download|release|install|dlcdn\.apache\.org|dyn/closer}i}
data[:image] = ASF::SiteImage.find(id)
# External-resource scan shells out to a Node script with a 60s timeout
if $skipresourcecheck
data[:resources] = 'Not checked'
else
cmd = ['node', '/srv/whimsy/tools/scan-page.js', site]
out, err, status = exec_with_timeout(cmd, 60)
if status
# Tally fetched URLs that are not served from ASF hosts
ext_urls = out.split("\n").reject {|x| ASFDOMAIN.asfhost? x}.tally
resources = ext_urls.values.sum
data[:resources] = "Found #{resources} external resources: #{ext_urls}"
else
data[:resources] = err # timeout or execution-failure message
end
end
return data
end