in www/secretary/iclaparser.rb [152:295]
# Extract ICLA form data from the PDF at +path+.
#
# Data is gathered from up to four places; each one that contributed is
# recorded by name in data[:_meta][:dataSource]:
#   'FreeText' - FreeText annotations, grouped into lines in data[:text]
#   'Form'     - annotation objects that carry a field name (:T)
#   'Text'     - text collected by walking every page with a Receiver
#   'Page'     - "label: value" lines scraped from the first page's text
# 'Text' and 'Page' are only attempted when the first two found little
# or nothing.
#
# path - location of the PDF file; handed to PDF::Reader.new and quoted in
#        warning/error messages.
#
# Returns a Hash holding :_meta, :debug, any fields that were found and,
# if a StandardError was raised while parsing, :error (the exception's
# inspect output followed by its backtrace). StandardErrors are never
# propagated to the caller.
def self.parse(path)
  data = {}
  metadata = {}
  data[:_meta] = metadata
  metadata[:dataSource] = {}
  freetext = {}
  debug = {}
  begin
    reader = PDF::Reader.new(path)

    # Record document-level properties. Strings are cleaned of invalid
    # UTF-8; for anything that is neither a String nor a Hash only the
    # class is stored.
    # NOTE(review): storing val.class (rather than val) is kept from the
    # original code - confirm that this is intended.
    %w(pdf_version info metadata page_count).each do |i|
      val = reader.public_send(i)
      if val.instance_of? String
        metadata[i] = val.encode('utf-8', 'utf-8', :invalid => :replace)
      elsif val.instance_of? Hash
        # Hash values are not necessarily Strings (a Symbol has no #encode);
        # pass those through untouched instead of aborting the whole parse.
        metadata[i] = val.transform_values do |v|
          v.respond_to?(:encode) ? v.encode('utf-8', 'utf-8', :invalid => :replace) : v
        end
      else
        metadata[i] = val.class
      end
    end

    reader.objects.each do |_k, v|
      # Not every object supports [] with a Symbol key; treat those as untyped.
      type = (v[:Type] rescue nil)
      subtype = (v[:Subtype] rescue nil)

      if type == :Annot && subtype == :FreeText
        rect = v[:Rect]
        rect = reader.objects.deref(rect) if rect.is_a?(PDF::Reader::Reference)
        # Fetched before the Rect check so the warning below can show it.
        contents = v[:Contents]
        if rect.is_a?(Array)
          # "\x14" on its own is ignored
          if contents && contents.length > 0 && contents != "\x14"
            # Keyed on rect + contents, so identical duplicates collapse.
            id = rect.inspect + contents
            freetext[id] = {Contents: contents.strip, x: rect[0], y: rect[1]}
            metadata[:dataSource]['FreeText'] = true
          end
        else
          puts "warn: #{contents} Rect is #{rect.class} in #{path}"
        end
      elsif type == :Annot
        key = v[:T]
        if key
          val = encode(v[:V].to_s)
          unless val.empty?
            ckey = canon_field_name(key)
            # FamilyFirst is stored as a boolean: true for "Yes" or "On".
            data[ckey] = ckey == :FamilyFirst ? %w(Yes On).include?(val) : val
          end
          # Set whenever a field name is present, even if its value is empty.
          metadata[:dataSource]['Form'] = true
        end
      elsif subtype == :Widget
        # Widgets that are not typed as :Annot; the value is stored as-is.
        key = v[:T]
        val = v[:V].to_s
        data[canon_field_name(key)] = val unless val.empty?
      end
    end

    unless freetext.empty?
      data[:text] = []
      # Annotations whose y values differ by no more than this share a line.
      how_close = 3
      rows = freetext.values.
        sort_by {|e| -e[:y] }.
        slice_when {|i, j| (i[:y] - j[:y]) > how_close}
      rows.each do |row|
        data[:text] << row.sort_by {|e| e[:x] }.map {|e| e[:Contents] }.join(', ')
      end
    end

    # Fall back to the page text when nothing was found, or when there is at
    # most one line of FreeText and fewer than three entries in data.
    if metadata[:dataSource].size == 0 || (Array(data[:text]).size <= 1 && data.size < 3)
      page1 = nil
      fontdict = {}
      receiver = Receiver.new(fontdict)
      reader.pages.each do |page|
        page.fonts.each do |label, font|
          fontdict[label] ||= PDF::Reader::Font.new(page.objects, page.objects.deref(font))
        end
        page.walk(receiver)
        page1 ||= page.text
      end
      text = receiver.get_text()
      # NOTE(review): the result is unused; the call is retained only in case
      # get_lines has side effects - confirm and remove.
      receiver.get_lines()

      if text.length > 3
        metadata[:dataSource]['Text'] = true
        data[:text] = text
      else
        # Only the second lump is of interest: the one that starts at the
        # "Full name:" line.
        lump = page1.each_line.slice_before(/^\s+Full name:/).to_a[1]
        if lump
          metadata[:dataSource]['Page'] = true
          # The form ends at the first line that starts in column one.
          form = lump.slice_before(/^\S/).first
          form.slice_before(/^\s+.+:/).each do |chunk|
            # Strip leading/trailing blanks and underscores, then merge the
            # label line with any continuation lines.
            line = chunk.map {|l| l.sub(/^[ _]+/, '').sub(/[ _]+$/, '')}.
              select {|l| l.length > 0}.join(',')
            found = /^\s*(?:\(optional\) )?(.+):\s+(.*)/.match(line)
            if found
              value = found[2]
              data[canon_field_name(found[1])] = value unless value == ',' || value == ''
            else
              data[:unmatched] ||= []
              data[:unmatched] << line
            end
          end
        end
      end
    end
  rescue StandardError => e
    data[:error] = "Error processing #{path} => #{e.inspect}\n#{e.backtrace.join("\n")}"
  end
  data[:debug] = debug
  data
end