in ocr/utils/iam_dataset.py [0:0]
def _process_subjects(self, train_subject_lists = ["trainset", "validationset1", "validationset2"],
test_subject_lists = ["testset"]):
''' Function to organise the list of subjects to training and testing.
The IAM dataset provides 4 files: trainset, validationset1, validationset2, and testset each
with a list of subjects.
Parameters
----------
train_subject_lists: [str], default ["trainset", "validationset1", "validationset2"]
The filenames of the list of subjects to be used for training the model
test_subject_lists: [str], default ["testset"]
The filenames of the list of subjects to be used for testing the model
Returns
-------
train_subjects: [str]
A list of subjects used for training
test_subjects: [str]
A list of subjects used for testing
'''
train_subjects = []
test_subjects = []
for train_list in train_subject_lists:
subject_list = pd.read_csv(os.path.join(self._root, "subject", train_list+".txt"))
train_subjects.append(subject_list.values)
for test_list in test_subject_lists:
subject_list = pd.read_csv(os.path.join(self._root, "subject", test_list+".txt"))
test_subjects.append(subject_list.values)
train_subjects = np.concatenate(train_subjects)
test_subjects = np.concatenate(test_subjects)
if self._parse_method in ["form", "form_bb", "form_original"]:
# For the form method, the "subject names" do not match the ones provided
# in the file. This clause transforms the subject names to match the file.
new_train_subjects = []
for i in train_subjects:
form_subject_number = i[0].split("-")[0] + "-" + i[0].split("-")[1]
new_train_subjects.append(form_subject_number)
new_test_subjects = []
for i in test_subjects:
form_subject_number = i[0].split("-")[0] + "-" + i[0].split("-")[1]
new_test_subjects.append(form_subject_number)
train_subjects, test_subjects = new_train_subjects, new_test_subjects
return train_subjects, test_subjects