in ocr/utils/iam_dataset.py [0:0]
def _process_data(self):
    ''' Iterate over the downloaded IAM xml files to gather the input images and the
    corresponding output.

    Every ``<self._root>/xml/*.xml`` file is parsed, and each element whose tag is the base
    parse method ("form", "line" or "word") is turned into one row, provided
    ``self._pre_process_image`` returns something other than ``None`` for its image file.
    The resulting dataframe is also handed to ``self._save_dataframe_chunks`` together with
    ``self.image_data_file_name``.

    Returns
    -------
    pd.DataFrame
        A pandas dataframe with the columns "subject" (the element's ``id`` attribute),
        "image" and "output". Rows follow the sorted order of the xml file names.
    '''
    # Only the base "form", "line", "word" is available in the IAM dataset (both as xml tag
    # and as image directory), so variants such as "form_bb" are reduced to their base name.
    # Both values are loop-invariant, hence computed once up front.
    base_method = self._parse_method.split("_")[0]
    is_form = self._parse_method in ("form", "form_bb", "form_original")

    # glob returns matches in a file-system dependent order; sorting keeps the row order of
    # the dataframe (and of the saved chunks) identical from one machine/run to the next.
    xml_files = sorted(glob.glob(os.path.join(self._root, "xml", "*.xml")))
    total_files = len(xml_files)

    print("Processing data:")
    logging.info("Processing data")

    image_data = []
    for i, xml_file in enumerate(xml_files):
        root = ET.parse(xml_file).getroot()
        # The height/width attributes are read from the root element of each xml file and
        # forwarded unchanged to the output helpers below.
        height, width = int(root.attrib["height"]), int(root.attrib["width"])

        for item in root.iter(base_method):
            item_id = item.attrib["id"]
            if is_form:
                image_id = item_id
            else:
                # Non-form images live in nested directories built from the id prefix:
                # "a01-000-00" -> a01/a01-000/a01-000-00
                id_parts = item_id.split("-")
                image_id = os.path.join(id_parts[0], id_parts[0] + "-" + id_parts[1], item_id)
            image_filename = os.path.join(self._root, base_method, image_id + ".png")

            image_arr = self._pre_process_image(image_filename)
            if image_arr is None:
                # No image available for this item; it contributes no row.
                continue

            output_data = self._get_output_data(item, height, width)
            if self._parse_method == "form_bb":
                image_arr, output_data = self._crop_and_resize_form_bb(
                    item, image_arr, output_data, height, width)
            image_data.append([item_id, image_arr, output_data])

        # The reported values depend on the file index only, so report once per file; this
        # also advances the progress for files whose items were all skipped.
        self._reporthook(i, 1, total_files)

    image_data = pd.DataFrame(image_data, columns=["subject", "image", "output"])
    self._save_dataframe_chunks(image_data, self.image_data_file_name)
    return image_data