in src/chug/doc/doc_read_processor.py [0:0]
def _process_anno_pages(self, anno):
    """Select pages from a document annotation and build per-page text outputs.

    Pages are chosen according to ``self.page_sampling``:

    * ``'random'``    - one page, searching from a random start position
    * ``'first'``     - the first page accepted by ``get_next_valid_page_index``
    * ``'all_valid'`` - up to ``len(anno)`` pages returned by repeated calls to
      ``get_next_valid_page_index``
    * ``'all'``       - every page index, without any validity check

    For each selected page the lines in ``page["text"]`` (or
    ``page["lines"]["text"]`` when a ``'lines'`` entry is present) are joined
    with ``self.line_break``. If ``self.text_process_fn`` is set, the joined
    text is passed through it and the entries named ``self.text_input_name``
    / ``self.text_target_name`` are collected; otherwise the joined text is
    used as-is.

    Args:
        anno: list or tuple with one annotation entry per page.

    Returns:
        dict with
            ``self.text_input_name``: list with one processed (or raw joined)
                text entry per selected page,
            ``'_parse'``: dict holding ``'num_pages'``, ``'page_indices'`` and
                ``'page_text'`` (the un-joined lines of each selected page),
            ``self.text_target_name``: list of targets, present only when at
                least one target was produced.

    Raises:
        AssertionError: if ``anno`` is not a list/tuple, if the processed
            sample lacks ``self.text_input_name``, or if it lacks
            ``self.text_target_name`` while ``self.text_target_feat`` is set.
        RuntimeError: if ``anno`` is empty, ``self.page_sampling`` is not a
            supported mode, no page could be selected, or a selected page has
            no text.
    """
    assert isinstance(anno, (list, tuple)), "Annotation should be a list of pages"
    num_pages = len(anno)
    if not num_pages:
        raise RuntimeError("Empty annotation. Skipping...")

    # An unrecognised mode would otherwise select nothing and be reported as
    # "No valid annotated pages", hiding what is really a configuration error.
    if self.page_sampling not in ('random', 'first', 'all_valid', 'all'):
        raise RuntimeError(f"Unsupported page_sampling mode {self.page_sampling!r}. Skipping...")

    # FIXME for initial behaviour we will randomly sample one of N pages
    # TODO determine if we want to train in multi-page mode, use another sampling strategy?
    page_indices = []
    try:
        if self.page_sampling == 'random':
            n_wanted_pages = min(1, num_pages)  # TODO increase for multi-page processing, rand start+end?
            # Start in [-1, num_pages - 2]; the helper is handed the index *before*
            # the first candidate, matching the -1 start used by the other modes.
            current_index = self.generator.randrange(-1, num_pages - 1)
            for _ in range(n_wanted_pages):
                current_index = get_next_valid_page_index(current_index, num_pages, anno)
                page_indices.append(current_index)
        elif self.page_sampling == 'first':
            current_index = get_next_valid_page_index(-1, num_pages, anno)
            page_indices.append(current_index)
        elif self.page_sampling == 'all_valid':
            current_index = -1
            for _ in range(num_pages):
                current_index = get_next_valid_page_index(current_index, num_pages, anno)
                page_indices.append(current_index)
        elif self.page_sampling == 'all':
            page_indices = list(range(num_pages))
    except RuntimeError:
        # Deliberate: a RuntimeError here ends page selection and keeps whatever
        # indices were already collected (presumably the helper's "no further
        # valid page" signal - confirm against get_next_valid_page_index).
        pass

    if not page_indices:
        raise RuntimeError("No valid annotated pages. Skipping...")

    text_pages = []
    tokenized_text_pages = []
    target_pages = []
    for current_index in page_indices:
        # FIXME currently encoding each page separately with own start/end tokens.
        # For multi-page should consider encoding in one sequence w/ page-break tokens.
        anno_page = anno[current_index]

        # Two supported formats right now
        # {
        #     'pages': [
        #         {
        #             'text': [],  # these are lines
        #             'bbox': [],
        #         }
        #     ]
        # }
        #
        # OR
        #
        # {
        #     'pages': [
        #         {
        #             'lines': {
        #                 'text': [],
        #                 'bbox': [],
        #             },
        #             'words': {
        #                 'text': [],
        #                 'bbox': [],
        #             }
        #         }
        #     ]
        # }
        if 'lines' in anno_page:
            anno_page = anno_page['lines']

        # Currently page text is created by concatenating lines of text with a line break.
        # Additions could involve:
        #  * using different line-break tokens between lines
        #  * using word-level bbox anno information to mask words and construct partial lines
        #  * grouping lines into blocks (or using block annos) and treating them as
        #    blocks / paragraphs of text
        if not anno_page["text"]:
            raise RuntimeError("No text on page, skipping sample...")

        text = self.line_break.join(anno_page["text"])

        # FIXME cleanup, split process and decode for more flexibility
        # tokenize w/ and generate training target if enabled
        if self.text_process_fn is not None:
            processed = self.text_process_fn(text)
            assert self.text_input_name in processed, \
                f"Text input name '{self.text_input_name}' not found in processed sample."
            tokenized_text_pages.append(processed[self.text_input_name])
            if self.text_target_name in processed:
                target_pages.append(processed[self.text_target_name])
            elif self.text_target_feat is not None:
                # Raised explicitly (not `assert False`) so the check survives `python -O`.
                raise AssertionError(
                    f"Expected a text target named '{self.text_target_name}' in processed sample."
                )
        else:
            # FIXME warn assert that target not supported w/o text preprocessing?
            tokenized_text_pages.append(text)

        text_pages.append(anno_page["text"])  # unencoded text added as lines

    gt_parse = {
        'num_pages': num_pages,  # total # of pages in doc
        'page_indices': page_indices,  # page indices sampled
        'page_text': text_pages,  # text of sampled page indices pages[].lines[]
    }
    output = {
        self.text_input_name: tokenized_text_pages,
        '_parse': gt_parse,
    }
    if target_pages:
        output[self.text_target_name] = target_pages
    return output