def _get_data()

in ocr/utils/ngram_dataset.py [0:0]


    def _get_data(self):
        ngrams = []
        for i in range(len(self.passage_ds)):
            noisy_text, text = self.passage_ds[i]
            noisy_text_arr, text_arr = noisy_text.split(" "), text.split(" ")

            # Heuristics
            noisy_text_arr = self._separate_word_breaks(noisy_text_arr)
            text_arr = self._separate_word_breaks(text_arr)
            
            noisy_text_arr = self._remove_empty_words(noisy_text_arr)
            text_arr = self._remove_empty_words(text_arr)

            for j in range(len(noisy_text_arr)):
                pre_values_j = self._get_n_grams(noisy_text_arr, j, pre=True, n=3)
                post_values_j = self._get_n_grams(noisy_text_arr, j, pre=False, n=3)

                for k in range(len(text_arr)):
                    pre_values_k = self._get_n_grams(text_arr, k, pre=True, n=3)
                    post_values_k = self._get_n_grams(text_arr, k, pre=False, n=3)
                    if self._is_ngram_similar(pre_values_j, pre_values_k) and self._is_ngram_similar(post_values_j, post_values_k):
                        noisy_value = noisy_text_arr[j]
                        actual_value = text_arr[k]
                        pre_values = self._get_n_grams(text_arr, k, pre=True, n=self.n)
                        post_values = self._get_n_grams(text_arr, k, pre=False, n=self.n)
                        if self.output_type == "word":
                            ngrams.append([pre_values, post_values, noisy_value, actual_value])
                        elif self.output_type == "character":
                            pre_values = [str(a) for a in pre_values]
                            post_values = [str(a) for a in post_values]

                            noisy_full_string = " ".join(pre_values) + " " + noisy_value + " " + " ".join(post_values)
                            actual_full_string = " ".join(pre_values) + " " + actual_value + " " + " ".join(post_values)
                            noisy_index = len(" ".join(pre_values)) + 1
                            for c in range(len(noisy_value)):
                                idx = c + noisy_index
                                new_pre_values = actual_full_string [idx-self.n:idx]
                                new_post_values = actual_full_string [idx+1:idx+self.n + 1]
                                new_noisy_values = noisy_full_string[idx]
                                new_actual_values = actual_full_string[idx]
                                ngrams.append([new_pre_values, new_post_values, new_noisy_values, new_actual_values])
        return ngrams