self.parse

in www/secretary/iclaparser.rb [152:285]


  # Extract whatever field data can be found in the PDF at +path+.
  #
  # Sources are tried in this order:
  # 1. PDF objects: FreeText annotations (grouped into rows of text) and
  #    objects that carry a field name (:T) and value (:V).
  # 2. If that produced little or nothing, the text gathered by walking every
  #    page with a Receiver.
  # 3. If that text is too short, the plain text of the first page, split
  #    into "label: value" entries starting at the "Full name:" line.
  #
  # @param path the PDF to read; handed unchanged to PDF::Reader.new
  # @return [Hash] values keyed by whatever canon_field_name returns, plus:
  #   :_meta     - reader properties (string keys 'pdf_version', 'info',
  #                'metadata', 'page_count') and :dataSource, a hash whose
  #                keys ('FreeText', 'Form', 'Text', 'Page') record which
  #                sources contributed
  #   :text      - rows of FreeText contents, or the Receiver's text
  #   :unmatched - first-page entries that did not look like "label: value"
  #   :error     - message and backtrace if a StandardError was raised;
  #                data gathered before the failure is still returned
  def self.parse(path)
    data = {}
    metadata = {}
    data[:_meta] = metadata
    metadata[:dataSource] = {}
    freetext = {} # keyed by position + contents, so exact duplicates collapse
    debug = {}    # raw object dumps for troubleshooting; not part of the result
    begin
      reader = PDF::Reader.new(path)
      %w(pdf_version info metadata page_count).each do |prop|
        metadata[prop] = reader.public_send(prop)
      end

      reader.objects.each do |_k, v|
        # Only dictionaries can carry :Type/:Subtype; skip strings, numbers,
        # arrays, streams etc. explicitly instead of rescuing their failures.
        next unless v.is_a?(Hash)

        type = v[:Type]
        subtype = v[:Subtype]

        if type == :Annot
          if subtype == :FreeText
            rect = v[:Rect]
            rect = reader.objects.deref(rect) if rect.is_a?(PDF::Reader::Reference)
            # Read before branching so the warning below can report it too
            contents = v[:Contents]
            if rect.is_a?(Array)
              # Ignore empty contents and a lone "\x14"
              # (presumably a placeholder for "no text" - TODO confirm)
              if contents && contents.length > 0 && contents != "\x14"
                id = rect.inspect + contents
                freetext[id] = {Contents: contents.strip, x: rect[0], y: rect[1]}
                metadata[:dataSource]['FreeText'] = true
              end
            else
              puts "warn: #{contents} Rect is #{rect.class} in #{path}"
            end
          else
            key = v[:T]
            next unless key

            val = v[:V].to_s # to_s copes with non-String values (and nil => "")
            debug[key] = v.inspect
            val = encode(val)
            if val.length > 0
              ckey = canon_field_name(key)
              # FamilyFirst is stored as a boolean; the other fields as text
              data[ckey] = (ckey == :FamilyFirst) ? %w(Yes On).include?(val) : val
            end
            # A named field counts as form data even when its value is empty
            metadata[:dataSource]['Form'] = true
          end
        elsif subtype == :Widget
          # Widget that is not typed as an annotation: value is stored as-is
          key = v[:T]
          val = v[:V].to_s
          debug[key] = v.inspect
          data[canon_field_name(key)] = val unless val.empty?
        end
      end

      unless freetext.empty?
        data[:text] = []
        # Annotations are sorted by descending y; a new row starts whenever
        # the y coordinate drops by more than this from one to the next.
        how_close = 3
        freetext.values
                .sort_by { |note| -note[:y] }
                .slice_when { |upper, lower| (upper[:y] - lower[:y]) > how_close }
                .each do |row|
          # Within a row, order by ascending x
          data[:text] << row.sort_by { |note| note[:x] }.map { |note| note[:Contents] }.join(", ")
        end
      end

      # Fall back to page text when the objects gave nothing, or next to nothing
      text_rows = (data[:text] || []).size
      if metadata[:dataSource].empty? || (text_rows <= 1 && data.size < 3)
        page1 = nil # text of the first page
        fontdict = {}
        receiver = Receiver.new(fontdict)
        reader.pages.each do |page|
          # fontdict is shared with the receiver; fill it in before walking
          page.fonts.each do |label, font|
            fontdict[label] ||= PDF::Reader::Font.new(page.objects, page.objects.deref(font))
          end
          page.walk(receiver)
          page1 ||= page.text
        end

        text = receiver.get_text
        debug[:lines] = receiver.get_lines
        if text.length > 3
          metadata[:dataSource]['Text'] = true
          data[:text] = text
        else
          page1.each_line.slice_before(/^\s+Full name:/).each_with_index do |lump, i|
            # Only the second lump is used
            # (the one starting at "Full name:" when other text precedes it)
            next unless i == 1

            metadata[:dataSource]['Page'] = true
            # The form ends at the first line that starts in column 0
            form = lump.slice_before(/^\S/).first
            # One entry per "label:" line plus any continuation lines
            form.slice_before(/^\s+.+:/).each do |field_lines|
              # Drop leading/trailing blanks and underscore fill
              line = field_lines.
                map { |l| l.sub(/^[ _]+/, '').sub(/[ _]+$/, '') }.
                reject(&:empty?).
                join(',')
              found = line.match(/^\s*(?:\(optional\) )?(.+):\s+(.*)/)
              if found
                value = found[2]
                data[canon_field_name(found[1])] = value unless value == ',' || value == ''
              else
                data[:unmatched] ||= []
                data[:unmatched] << line
              end
            end
          end
        end
      end
    rescue StandardError => e
      data[:error] = "Error processing #{path} => #{e.inspect}\n#{e.backtrace.join("\n")}"
    end

    data
  end