in ocr/utils/ngram_dataset.py [0:0]
def _get_data(self):
    """Build the list of n-gram samples from every passage in ``self.passage_ds``.

    Each item of ``self.passage_ds`` is unpacked as ``(noisy_text, text)``.
    Both strings are split on single spaces and then passed through
    ``_separate_word_breaks`` and ``_remove_empty_words``.  The noisy word at
    position ``j`` is paired with the reference word at position ``k`` when
    ``_is_ngram_similar`` accepts both their 3-word pre-contexts and their
    3-word post-contexts (as returned by ``_get_n_grams``).

    The 3-word contexts of the reference words are computed once per passage
    rather than once per noisy word; this assumes ``_get_n_grams`` is a pure
    function of its arguments (TODO confirm).

    Returns:
        A list of ``[pre, post, noisy, actual]`` entries.

        * ``self.output_type == "word"``: one entry per matched word pair.
          ``pre``/``post`` are the ``_get_n_grams`` results for the reference
          word with ``n=self.n``; ``noisy``/``actual`` are the two words.
        * ``self.output_type == "character"``: one entry per character of the
          noisy word.  ``pre``/``post`` are up to ``self.n`` characters of the
          reference string before/after that position; ``noisy``/``actual``
          are the characters at that position in the noisy and reference
          strings.
        * Any other ``output_type`` contributes no entries.
    """
    # Width of the word context used only to decide whether two words match.
    match_n = 3

    ngrams = []
    for i in range(len(self.passage_ds)):
        noisy_text, text = self.passage_ds[i]
        noisy_text_arr, text_arr = noisy_text.split(" "), text.split(" ")

        # Heuristics
        noisy_text_arr = self._separate_word_breaks(noisy_text_arr)
        text_arr = self._separate_word_breaks(text_arr)
        noisy_text_arr = self._remove_empty_words(noisy_text_arr)
        text_arr = self._remove_empty_words(text_arr)

        # The reference contexts do not depend on the noisy position, so
        # compute them once here instead of inside the loop over j.
        text_contexts = [
            (
                self._get_n_grams(text_arr, k, pre=True, n=match_n),
                self._get_n_grams(text_arr, k, pre=False, n=match_n),
            )
            for k in range(len(text_arr))
        ]

        for j, noisy_value in enumerate(noisy_text_arr):
            pre_values_j = self._get_n_grams(noisy_text_arr, j, pre=True, n=match_n)
            post_values_j = self._get_n_grams(noisy_text_arr, j, pre=False, n=match_n)

            for k, (pre_values_k, post_values_k) in enumerate(text_contexts):
                if not (
                    self._is_ngram_similar(pre_values_j, pre_values_k)
                    and self._is_ngram_similar(post_values_j, post_values_k)
                ):
                    continue

                actual_value = text_arr[k]
                pre_values = self._get_n_grams(text_arr, k, pre=True, n=self.n)
                post_values = self._get_n_grams(text_arr, k, pre=False, n=self.n)

                if self.output_type == "word":
                    ngrams.append([pre_values, post_values, noisy_value, actual_value])
                elif self.output_type == "character":
                    # Context entries may not be strings, hence str().
                    pre_text = " ".join(str(a) for a in pre_values)
                    post_text = " ".join(str(a) for a in post_values)
                    noisy_full_string = pre_text + " " + noisy_value + " " + post_text
                    actual_full_string = pre_text + " " + actual_value + " " + post_text
                    # Offset of the first character of the word in both strings.
                    noisy_index = len(pre_text) + 1

                    for c in range(len(noisy_value)):
                        idx = c + noisy_index
                        # Clamp the window start: a negative start would be
                        # counted from the end of the string and yield an
                        # empty or wrong pre-context near the beginning.
                        start = max(idx - self.n, 0)
                        # NOTE(review): characters are aligned purely by
                        # position.  When actual_value is shorter than
                        # noisy_value, later positions read the separator or
                        # post-context of actual_full_string, and indexing can
                        # raise IndexError if that string is too short --
                        # confirm this is intended.
                        ngrams.append([
                            actual_full_string[start:idx],
                            actual_full_string[idx + 1:idx + self.n + 1],
                            noisy_full_string[idx],
                            actual_full_string[idx],
                        ])
    return ngrams