parse

in tools/site-scan.rb [68:260]


def parse(id, site, name, podling=false)
  show_anyway = Time.now.gmtime.strftime('%H') == '08' # let suppressed errors through once a day, during the 08:xx UTC hour
  data = {}
  
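  # rewrite the scheme so every site is checked over https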
  site.sub!(%r{^http:},'https:')
  SiteStandards::COMMON_CHECKS.each_key do |k|
    data[k.to_sym] = nil
  end
  data[:display_name] = name
  data[:uri] = site
  uri = URI.parse(site)
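  # fail fast if the host name does not resolve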
  begin
    Socket.getaddrinfo(uri.host, uri.scheme)
  rescue SocketError => se
    data[:errors] = se.message
    return data
  end
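  # fetch the page through the shared cache; uri reflects any redirects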
  begin
    uri, response, status = $cache.get(site.to_s)
  rescue IOError => ioe
    data[:errors] = ioe.message
    return data
  end
  puts "#{id} #{uri} #{status}"
  
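  # treat an HTTP 4xx/5xx response as a scan error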
  if response.respond_to? :code and response.code =~ /^[45]/
    data[:errors] = "cache.get(#{site}) error code #{response.code}"
    return data
  end
  doc = Nokogiri::HTML(response)
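  # optionally dump the parsed document for debugging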
  if $saveparse
    file = File.join('/tmp',"site-scan_#{$$}.txt")
    File.write(file, doc.to_s)
    $stderr.puts "Wrote parsed input to #{file}"
  end
  data[:uri] = uri.to_s

  subpages = Hash.new
  
  
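  # first pass: scan <script> tags for the events script and anchors for the required links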
  doc.traverse do |a|

    if a.name == 'script'
      a_src = a['src'].to_s.strip
      if a_src =~ SiteStandards::COMMON_CHECKS['events'][SiteStandards::CHECK_CAPTURE]
        save_events data, uri + a_src
      end
    end

    next unless a.name == 'a'

    
    a_href = a['href'].to_s.strip
    a_text = get_link_text(a) 
    $stderr.puts "#{a_text.inspect} #{a_href}" if $verbose

    
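    # foundation link: prefer the logo's title, else its image URL, else the link text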
    if a_href =~ SiteStandards::COMMON_CHECKS['foundation'][SiteStandards::CHECK_CAPTURE]
      img = a.at('img')
      if img
        
        data[:foundation] = img['title'] ? squash(img['title']) : uri + img['src'].strip
      else
        data[:foundation] = squash(a_text)
      end
    end

    if a_href =~ SiteStandards::COMMON_CHECKS['events'][SiteStandards::CHECK_CAPTURE]
      
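      # ignore 'visible-home' links except on the home page itself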
      save_events data, uri + a_href unless a['class'] == 'visible-home' and uri.path != '/'
    end

    
    a_text = a_text.downcase.strip 
    
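    # a license link counts only if both its text and its href match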
    if (a_text =~ SiteStandards::COMMON_CHECKS['license'][SiteStandards::CHECK_TEXT]) and
        (a_href =~ SiteStandards::COMMON_CHECKS['license'][SiteStandards::CHECK_CAPTURE])
      begin
        data[:license] = uri + a_href
      rescue StandardError
        data[:license] = a_href
      end
    end

    %w(thanks security sponsorship privacy).each do |check|
      if a_text =~ SiteStandards::COMMON_CHECKS[check][SiteStandards::CHECK_CAPTURE]
        begin
          data[check.to_sym] = uri + a_href
        rescue StandardError
          data[check.to_sym] = a_href
        end
      end
    end
    unless a_href =~ %r{^(#|mailto:)} # ignore in-page anchors and mail links
      begin
        if a_href =~ %r{^https?://} # absolute link; no need to join with the base
          site2 = URI.parse(a_href.gsub(' ','%20').gsub('|', '%7C')) # escape spaces and pipes, which URI rejects
        else
          site2 = URI.join(site, a_href.gsub(' ','%20').gsub('|', '%7C'))
        end
        if site2.host == uri.host and site2.path.size > 2 # same host, and not merely the home page
          subpages[site2.to_s] = a
        end
      rescue StandardError => e
        if show_anyway or !a_href.include?('fineract.gateway.scarf.sh/{version}') # known-bad template URL; only report it in the daily window
          $stderr.puts "#{id}: Bad a_href #{a_href} #{e}"
        end
      end
    end
  end

  
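  # second pass: scan text nodes for trademark, copyright and disclaimer wording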
  doc.traverse do |node|
    next unless node.is_a?(Nokogiri::XML::Text)
    txt = squash(node.text)
    
    if (txt =~ SiteStandards::COMMON_CHECKS['trademarks'][SiteStandards::CHECK_CAPTURE] and not data[:trademarks]) or
        txt =~ /are trademarks of [Tt]he Apache Software/
      t, p = getText(txt, node)
      
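      # drop everything up to and including the copyright sentence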
      data[:trademarks] = t.sub(/^.*?Copyright .+? Foundation[.]?/, '').strip
      data[:tradeparent] = p if p
    end
    if txt =~ SiteStandards::COMMON_CHECKS['copyright'][SiteStandards::CHECK_CAPTURE]
      t, p = getText(txt, node)
      
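      # keep only the "Copyright ... Foundation" sentence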
      data[:copyright] = t.sub(/^.*?((Copyright|©) .+? Foundation[.]?).*/, '\1').strip
      data[:copyparent] = p if p
    end
    
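    # podling incubation disclaimer text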
    if txt =~ SiteStandards::PODLING_CHECKS['disclaimer'][SiteStandards::CHECK_CAPTURE]
      t, _p = getText(txt, node, / is an effort undergoing/)
      data[:disclaimer] = t
    end
  end

  
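  # for podlings, each collected subpage must also carry the incubation disclaimer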
  hasdisclaimer = 0
  nodisclaimer = []
  if podling # only podling sites need the disclaimer on subpages
    subpages.each do |subpage, anchor|
      begin
        uri, response, status = $cache.get(subpage)
        if uri&.to_s == subpage or uri&.to_s == subpage + '/'
          puts "#{id} #{uri} #{status}"
        else
          puts "#{id} #{subpage} => #{uri} #{status}"
        end
        if status == 'error'
          unless %w(nlpcraft).include? id # this podling's errors are deliberately not reported
            $stderr.puts "#{id} #{subpage} => #{uri} #{status} '#{anchor.text.strip}'"
          end
        elsif response =~ SiteStandards::PODLING_CHECKS['disclaimer'][SiteStandards::CHECK_CAPTURE]
          hasdisclaimer += 1
        else
          nodisclaimer << subpage
        end
      rescue URI::InvalidURIError
        # ignore subpages whose URL cannot be parsed
      end
    end
  end
  if nodisclaimer.size > 0
    data[:disclaimers] = [hasdisclaimer, nodisclaimer]
  end
  
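  # note subpages that look like download/release pages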
  data[:downloads] = subpages.select{|k,_v| k =~ %r{download|release|install|dlcdn\.apache\.org|dyn/closer}i}

  
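  # look up the project's registered site image (logo)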
  data[:image] = ASF::SiteImage.find(id)

  
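  # count resources fetched from non-ASF hosts (scan-page.js prints one URL per line)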
  if $skipresourcecheck
    data[:resources] = 'Not checked'
  else
    cmd = ['node', '/srv/whimsy/tools/scan-page.js', site]
    out, err, status = exec_with_timeout(cmd, 60)
    if status
      ext_urls = out.split("\n").reject {|x| ASFDOMAIN.asfhost? x}.tally
      resources = ext_urls.values.sum
      data[:resources] = "Found #{resources} external resources: #{ext_urls}"
    else
      data[:resources] = err
    end
  end

  
  
  return data
end
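
A minimal usage sketch with hypothetical values (in site-scan.rb the id, url and name come from the committee and podling records, and $cache must already be initialised):

site = 'https://community.apache.org/'.dup # dup: parse may rewrite the scheme in place
data = parse('comdev', site, 'Community Development')
podling = parse('foo', 'https://foo.incubator.apache.org/'.dup, 'Apache Foo', true) # also checks subpage disclaimers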