www/members/page-scanner.cgi (72 lines of code) (raw):
#!/usr/bin/env ruby
PAGETITLE = "ASF Page Asset Checker - ALPHA" # Wvisible:sites
# Very rudimentary page asset checker: reports the references a page makes to non-ASF assets.
# NOTE(review): the trailing `Wvisible:sites` marker on PAGETITLE is presumably read by
# other tooling - confirm before moving or rewording it.
require 'open3'
require_relative '../../tools/asf-site-check'
# usage: whimsy.apache.org/members/page-scanner?url=http://apache.org/
# Emit the CGI response header first: every outcome below is reported as plain text.
print "Content-type: text/plain; charset=UTF-8\r\n\r\n"

# Separator used when splitting a scanner output line into its two fields.
DIVIDER=' <= '

# Extract the page to scan and the optional scan option from a raw query string.
#
# Two forms are recognised:
#   url=http(s)://...[&option]  - the URL is used as given
#   host=name[&option]          - shorthand for https://name.apache.org/
#
# @param query [String, nil] raw query string as supplied by the web server
# @return [Array] two elements, url and option; option is nil when absent,
#   and both are nil when the query matches neither form
def parse_scan_query(query)
  # \A and \z anchor to the whole string. ^ and $ also match at an embedded
  # line break, which would accept a parameter hidden on a later line.
  case query
  when %r{\Aurl=(https?://[^&]+)(?:&(.+))?}
    [Regexp.last_match(1), Regexp.last_match(2)]
  when %r{\Ahost=([a-z0-9-]+)(?:&(.+))?\z}
    ["https://#{Regexp.last_match(1)}.apache.org/", Regexp.last_match(2)]
  else
    [nil, nil]
  end
end

qs = ENV['QUERY_STRING']
url, option = parse_scan_query(qs)
# Run the page scanner on the requested URL and print a plain-text report,
# or print a usage hint when no valid URL was supplied.
if url
  # Anything other than the two recognised options falls back to 'allref'.
  option = 'allref' unless %w{all showurl}.include? option
  puts <<~EOD
    ** ALPHA CODE **
    Checking the page: #{url}
    Using option: #{option}
    The following references were found to hosts other than apache.org, openoffice.org and apachecon.com
    The first column shows if the host is recognised as being under ASF control according to
    https://privacy.apache.org/policies/asf-domains
    Note: the script does not yet take account of sites with whom we have a DPA (Data Processing Agreement),
    so it may show some legitimate references
    ======
  EOD

  # Prints one reference, prefixed with whether ASFDOMAIN recognises it.
  show_ref = lambda do |ref|
    print ASFDOMAIN.asfurl?(ref) ? 'OK ' : 'NO '
    puts ref
  end

  # Array form: the arguments are handed to node as-is, without a shell.
  cmd = ['node', '/srv/whimsy/tools/scan-page.js', url, option]
  out, err, status = Open3.capture3(*cmd)
  if status.success?
    # Drop blank lines so that output made only of line breaks is reported
    # as "nothing found" and a stray empty line is not listed as a reference.
    lines = out.split(%r{\n+}).reject(&:empty?)
    if lines.empty?
      puts "No external references found"
    else
      puts "Top-level references:"
    end

    # loader => { reference => 1 }; the inner hash de-duplicates references.
    extras = Hash.new { |hash, key| hash[key] = {} }
    lines.each do |line|
      if line.start_with?('ERROR', 'WARN') # console error message (e.g. CSP)
        puts line
        next
      end
      ref, loader = line.split(DIVIDER)
      if loader
        # "ref <= loader": collected now and reported after the top-level list
        extras[loader][ref] = 1
      else
        show_ref.call(line)
      end
    end

    unless extras.empty?
      puts ""
      puts "Transitive references:"
      extras.each do |loader, refs|
        puts "" # separator
        puts "Loaded by: "+loader
        refs.each_key { |ref| show_ref.call(ref) }
      end
    end
  else
    puts err.scan(/^Error:.+/).first || err # Show only the Error line if present
  end
  print "=====\n"
else
  print "Expecting: ?url=http://.../[&showurl] (or ?host=abcd => ?url=https://abcd.apache.org/)\n"
end