def _get_data()

in ocr/utils/noisy_forms_dataset.py [0:0]


    def _get_data(self):
        '''
        Builds (noisy, clean) text pairs, one pair per form.

        For every form in self.iam_dataset_form, all entries of self.iam_dataset_line
        are scanned; each line that self._is_line_in_form reports as belonging to the
        form is passed through self.noise_source_transform, decoded with
        self.topK_decode, and the decoded lines are joined with single spaces (to keep
        the context of the form consistent). The clean passage is the form text with
        newlines replaced by spaces.

        Progress ("<form index>/<number of forms>") is printed for each form. Note
        that every form triggers a full scan over the line dataset.

        Returns
        -------
        train_data: [[str, str]]
            List of two-element lists [noisy_form_text, clean_form_text] for the forms
            whose self.seed[idx_form] is smaller than self.train_size.

        test_data: [[str, str]]
            Same layout as train_data, for the remaining forms. These samples are
            independent of the ones in train_data.
        '''
        train_data = []
        test_data = []

        n_forms = len(self.iam_dataset_form)
        n_lines = len(self.iam_dataset_line)

        for idx_form in range(n_forms):
            print("{}/{}".format(idx_form, n_forms))

            # Fetch the form sample once; the clean text is kept untouched while a
            # working copy is consumed as lines are matched.
            _, form_text = self.iam_dataset_form[idx_form]
            full_form_text = form_text[0].replace("\n", " ")
            remaining_text = full_form_text

            lines_in_form = []
            for idx_line in range(n_lines):
                # Iterates through every line data to check if it's within the form.
                image, line_text = self.iam_dataset_line[idx_line]
                line_text = line_text[0]

                if self._is_line_in_form(line_text, remaining_text):
                    prob = self.noise_source_transform(image, line_text)
                    # argmax over axis 2 picks the most likely entry per position;
                    # only the first decoded string is used.
                    predicted_text = self.topK_decode(np.argmax(prob, axis=2))[0]
                    lines_in_form.append(predicted_text)
                    # Remove the matched text so it is not matched again by a
                    # later line of the dataset.
                    remaining_text = remaining_text.replace(line_text, "")

            # A form without any matching line yields an empty noisy passage.
            predicted_form_text = ' '.join(lines_in_form)

            if self.seed[idx_form] < self.train_size:
                train_data.append([predicted_form_text, full_form_text])
            else:
                test_data.append([predicted_form_text, full_form_text])

        return train_data, test_data