in ocr/utils/noisy_forms_dataset.py [0:0]
def _get_data(self):
    '''
    Builds (noisy, clean) text pairs with one pair per form.

    For every form, each entry of the line dataset is tested with
    ``self._is_line_in_form``. Matching lines are passed through
    ``self.noise_source_transform`` and decoded with ``self.topK_decode``; the
    decoded lines are joined with single spaces so that the noisy passage keeps
    the context of the whole form.

    Returns
    -------
    train_data: [[str, str]]
        Two-element lists ``[noisy_form_text, clean_form_text]`` for the forms
        whose ``self.seed`` entry is below ``self.train_size``.
    test_data: [[str, str]]
        Same layout as train_data, for the remaining forms, so the samples are
        independent of those in train_data.
    '''
    train_data = []
    test_data = []
    num_forms = len(self.iam_dataset_form)
    num_lines = len(self.iam_dataset_line)
    for idx_form in range(num_forms):
        print("{}/{}".format(idx_form, num_forms))
        # The form is fetched once; the clean text is kept intact for the output
        # while a working copy is consumed during line matching.
        _, form_text = self.iam_dataset_form[idx_form]
        full_form_text = form_text[0].replace("\n", " ")
        remaining_text = full_form_text

        lines_in_form = []
        # Index-based access: only len() and [] are relied upon for the datasets.
        for idx_line in range(num_lines):
            # Iterates through every line data to check if it's within the form.
            image, line_text = self.iam_dataset_line[idx_line]
            line_text = line_text[0]
            if self._is_line_in_form(line_text, remaining_text):
                prob = self.noise_source_transform(image, line_text)
                # NOTE(review): argmax over axis 2 presumably selects the most
                # likely character per position - confirm against the transform.
                predicted_text = self.topK_decode(np.argmax(prob, axis=2))[0]
                lines_in_form.append(predicted_text)
                # Remove the matched line so it is not matched against again.
                remaining_text = remaining_text.replace(line_text, "")

        # A form with no matching lines yields an empty noisy passage.
        predicted_form_text = ' '.join(lines_in_form)
        sample = [predicted_form_text, full_form_text]
        if self.seed[idx_form] < self.train_size:
            train_data.append(sample)
        else:
            test_data.append(sample)
    return train_data, test_data