www/secretary/iclaparser.rb

#!/usr/bin/env ruby

# DRAFT DRAFT DRAFT
# DRAFT DRAFT DRAFT
# DRAFT DRAFT DRAFT
# DRAFT DRAFT DRAFT
# DRAFT DRAFT DRAFT

#
# ICLA PDF parsing support
#
# Try to extract user text from ICLA PDFs.
# The Gem is not 100% accurate in creating a text version of the page.
# Also it's tricky to extract the text accurately.
# So we try other methods first:
# - if there is a form, return its fields
# - if there are FreeText Annotations, return them in page order
# - use show_text_with_positioning as that seems to be used for PDF updates
# - where the PDF only uses show_text, the Gem is better at combining the data, so use page.text

require 'pdf-reader'

# TODO perhaps always extract all the data types then choose the best
# Should turn hash values into arrays?

module ICLAParser

  # Page-walk receiver that collects the text drawn by the content stream.
  # Text drawn with positioning operators is often used instead of providing form fields.
  class Receiver

    # Short pieces of the ICLA boilerplate that are not user data
    SKIP = [
      'Individual Contributor',
      'License Agreement',
      '("Agreement") V2.0',
      'as "Not a Contribution."',
      'inaccurate in any respect.',
      'for your records.',
      '1. Definitions.',
      'Contributions and such derivative works.',
      'litigation is filed.',
      'Contributions.'
    ].freeze

    # fontdict: Hash of font label => font object (must respond to unpack and to_utf8).
    # The caller may keep adding entries to the hash after construction.
    def initialize(fontdict)
      @texts = [] # show_text_with_positioning
      @lines = [] # show_text
      @tfs = nil # text font and size
      @fontdict = fontdict
    end

    # Some PDFs use show_text() multiple times in a line,
    # so the pieces are gathered per text object
    def begin_text_object
      @textobj = []
    end

    # The text object is complete: store its pieces as a single line
    def end_text_object
      @lines << @textobj.join
    end

    # Remember the current font label and size; the label is needed to decode later strings
    def set_text_font_and_size(*args)
      @tfs = args
    end

    # Plain text: decode it and add it to the current text object
    def show_text(string)
      font = @fontdict[@tfs.first]
      @textobj << string_to_utf8(string, font)
    end

    # Positioned text: keep it if it looks like user data.
    # args are Strings (in the current font encoding) interspersed with integer spacing
    # adjustments; only the strings are wanted.
    # We assume the positioning does not overlay characters so can be ignored
    def show_text_with_positioning(*args)
      font = @fontdict[@tfs.first]
      val = args.flatten.grep(String).map { |piece| string_to_utf8(piece, font) }.join.strip
      # some PDFs have the boilerplate text in this format, so skip long lines
      # which are unlikely to be user data
      # Could perhaps have full list of expected text lines instead.
      return if val.empty? || val.length > 50 || SKIP.include?(val)

      @texts << val
    end

    # The strings collected from show_text_with_positioning, in the order seen
    def get_text
      @texts
    end

    # One string per text object, built from the show_text calls
    def get_lines
      @lines
    end

    # Convert a string in the encoding of the given font into UTF-8
    def string_to_utf8(string, font)
      decoded = font.unpack(string).map do |glyph_code|
        char = font.to_utf8(glyph_code)
        # One pdf (yev) has spurious \t\r<sp>?<nbsp> translated from 36 => [9, 13, 32, 194, 160]
        if glyph_code == 36 && char =~ /^\t\r /
          ' '
        else
          char
        end
      end
      decoded.join
    end
  end

  # Standard form field names for other code to use
  NAME2FIELD = {
    'fullname' => :FullName,
    'publicname' => :PublicName,
    'familyfirst' => :FamilyFirst,
    'mailingaddress' => :MailingAddress,
    'mailingaddress2' => :MailingAddress2,
    'postaladdress' => :MailingAddress,
    'postaladdress2' => :MailingAddress2,
    'country' => :Country,
    'formattedfield1' => :Country, # fix up bad form name
    'telephone' => :Telephone,
    'e-mail' => :EMail,
    'preferredapacheid(s)' => :ApacheID,
    'notifyproject' => :Project,
    'date' => :Date,
    'signature' => :Signature,
  }.freeze

  # canonicalise the names found in the PDF:
  # spaces and case are ignored; names that are not recognised are returned unchanged
  def self.canon_field_name(pdfname)
    NAME2FIELD[pdfname.gsub(' ', '').downcase] || pdfname
  end

  # Convert a form value to stripped UTF-8.
  # Values starting with the bytes FE FF are treated as UTF-16;
  # values that cannot be converted as they stand are treated as ISO-8859-1.
  def self.encode(val)
    converted =
      if val.bytes[0..1] == [254, 255]
        val.encode('utf-8', 'utf-16').strip
      else
        begin
          val.encode('utf-8').strip
        rescue Encoding::UndefinedConversionError
          val.encode('utf-8', 'iso-8859-1').strip
        end
      end
    converted.gsub("\x7F", '') # Not sure where these originate
  end

  # parse the PDF
  #
  # path: the PDF to read (passed to PDF::Reader.new)
  #
  # Returns a Hash containing:
  #   :_meta     - document details, plus :dataSource listing which extraction methods found data
  #   field keys - canonical Symbols from NAME2FIELD, or the raw name if it was not recognised
  #   :text      - Array of free text strings (when no usable form was found)
  #   :unmatched - page text lines that could not be split into name and value
  #   :error     - message and backtrace if processing failed; anything found before that is kept
  #   :debug     - currently always an empty Hash
  def self.parse(path)
    data = {}
    metadata = {}
    data[:_meta] = metadata
    metadata[:dataSource] = {} # have we found anything
    freetext = {} # gather the free text details
    debug = {}
    begin
      reader = PDF::Reader.new(path)
      read_document_info(reader, metadata)
      scan_objects(reader, path, data, metadata, freetext)

      if freetext.size > 0
        data[:text] = []
        group_freetext(freetext.values) { |line| data[:text] << line } # no need for ids any more
      end

      text_count = (data[:text] || []).size
      if metadata[:dataSource].size == 0 || (text_count <= 1 && data.size < 3)
        # No annotations found or not useful
        extract_page_text(reader, data, metadata)
      end
    rescue StandardError => e
      data[:error] = "Error processing #{path} => #{e.inspect}\n#{e.backtrace.join("\n")}"
    end
    data[:debug] = debug
    # TODO attempt to classify data[:text] items?
    data
  end

  # Copy the document level details into metadata.
  # It looks like some of the values may not be UTF-8
  # In particular info[:Producer] may have odd characters
  def self.read_document_info(reader, metadata)
    %w(pdf_version info metadata page_count).each do |attr|
      val = reader.public_send(attr)
      metadata[attr] =
        if val.instance_of?(String)
          val.encode('utf-8', 'utf-8', :invalid => :replace)
        elsif val.instance_of?(Hash)
          # Entries need not be Strings (e.g. a PDF name arrives as a Symbol);
          # those are kept as they are rather than aborting the whole parse
          val.transform_values do |item|
            item.is_a?(String) ? item.encode('utf-8', 'utf-8', :invalid => :replace) : item
          end
        else
          # NOTE(review): this records the class of the value, not the value itself - confirm intended
          val.class
        end
    end
  end
  private_class_method :read_document_info

  # Look through all the PDF objects for FreeText annotations and form fields.
  # Field values are stored in data, free text entries in freetext
  # and the methods that found something are noted in metadata[:dataSource]
  def self.scan_objects(reader, path, data, metadata, freetext)
    reader.objects.each do |_k, v|
      next unless v.is_a?(Hash) # only dictionaries can have a Type or Subtype

      type = v[:Type]
      subtype = v[:Subtype]
      if type == :Annot
        if subtype == :FreeText # These are not directly associated with forms
          rect = v[:Rect]
          # rect can be a reference. If so, it seems there may be multiple copies
          # with different IDs but same Rect coords and contents
          rect = reader.objects.deref(rect) if rect.is_a?(PDF::Reader::Reference)
          contents = v[:Contents] # fetched here so that the warning below can report it
          if rect.is_a?(Array)
            if contents && contents.length > 0 && contents != "\x14" # ignore "\x14" == ASCII DC4
              # Entries may be duplicated, so use a hash to store them
              id = rect.inspect + contents # if the rect and contents match, then they overwrite each other
              freetext[id] = {Contents: contents.strip, x: rect[0], y: rect[1]}
              metadata[:dataSource]['FreeText'] = true
            end
          else
            puts "warn: #{contents} Rect is #{rect.class} in #{path}"
          end
        else
          key = v[:T]
          if key
            # debug[key] = v.inspect
            # The value might be a symbol, hence to_s
            # This is a hack; should really find the font def and use that
            val = encode(v[:V].to_s)
            if val.length > 0
              ckey = canon_field_name(key)
              if ckey == :FamilyFirst
                # convert the value to true/false
                # PDFs seem to use Yes and Off; also allow for On
                data[ckey] = %w(Yes On).include?(val) # default to false
              else
                data[ckey] = val
              end
            end
            # a named field counts as a form even if it has no value
            metadata[:dataSource]['Form'] = true
          end
        end
      elsif subtype == :Widget
        key = v[:T]
        val = v[:V].to_s # might be a symbol
        # debug[key] = v.inspect
        # A widget without a name cannot be stored (and must not abort the scan)
        data[canon_field_name(key)] = val if key && val.length > 0
      end
      # other objects (Catalog, Font, FontDescriptor etc) are of no interest
      # p [k,type,subtype,v]
    end
  end
  private_class_method :scan_objects

  # Yield the free text entries as lines in reading order:
  # sorted by Y descending (down the page) and X ascending (across).
  # Entries are on separate lines if the difference in Y is more than a few points;
  # entries on the same line are joined with ', '
  def self.group_freetext(entries)
    how_close = 3
    entries.
      sort_by { |entry| -entry[:y] }. # sort by Y desc
      slice_when { |upper, lower| (upper[:y] - lower[:y]) > how_close }. # gather nearby Y values
      each do |row|
        yield row.sort_by { |entry| entry[:x] }.map { |entry| entry[:Contents] }.join(', ')
      end
  end
  private_class_method :group_freetext

  # Fallback when there are no useful annotations: look at the text drawn on the pages.
  # Uses the positioned text if there is enough of it, otherwise the Gem's text of page 1
  def self.extract_page_text(reader, data, metadata)
    page1 = nil # cache for page 1
    fontdict = {}
    receiver = Receiver.new(fontdict)
    reader.pages.each do |page|
      # extract the fonts (needed for conversion to utf-8)
      page.fonts.each do |label, font|
        fontdict[label] ||= PDF::Reader::Font.new(page.objects, page.objects.deref(font))
      end
      page.walk(receiver)
      page1 ||= page.text
    end
    # pick up the collected strings
    text = receiver.get_text
    # p text
    if text.length > 3
      metadata[:dataSource]['Text'] = true
      data[:text] = text
    else
      parse_page_form(page1, data, metadata)
    end
  end
  private_class_method :extract_page_text

  # Extract "Name: value" entries from the text of the page.
  # Only the section starting with the 'Full name:' line is used
  def self.parse_page_form(page_text, data, metadata)
    page_text.each_line.slice_before(/^\s+Full name:/).each_with_index do |lump, i|
      next unless i == 1 # starts with Full name

      metadata[:dataSource]['Page'] = true
      # drop the postamble
      form = lump.slice_before(/^\S/).first
      # split into headers
      form.slice_before(/^\s+.+:/).each do |entry|
        # trim leading and trailing blanks and underscores and drop empty lines
        # (the line terminators are still present, so a blank line is left as a lone ',')
        line = entry.map { |l| l.sub(/^[ _]+/, '').sub(/[ _]+$/, '') }.select { |l| l.length > 0 }.join(',')
        # The parentheses must be escaped to match the literal '(optional) ' prefix
        match = /^\s*(?:\(optional\) )?(.+):\s+(.*)/.match(line)
        if match
          name = match[1]
          value = match[2]
          data[canon_field_name(name)] = value unless value == ',' || value == '' # empty line
        else
          data[:unmatched] ||= []
          data[:unmatched] << line
        end
      end
    end
  end
  private_class_method :parse_page_form
end

if __FILE__ == $0
  require 'pp'
  pp ICLAParser.parse(ARGV.first)
end

www/secretary/iclaparser.rb (227 lines of code) (raw):