in src/morphological_filtering.py [0:0]
def target_filter(self, source_input: str, target_input: str, source_output: str, target_output: str,
                  gender: str) -> Tuple[int, int]:
    """
    Run morphological target filtering for a parallel dataset.

    The source and target input files are read in lockstep, one sentence pair per line. A pair is
    written to the output files only if it passes the token-count heuristic below and
    ``self._matches_gender`` accepts the target sentence for the indicated gender.

    :param source_input: Name of the source file for the input parallel data.
    :param target_input: Name of the target file for the input parallel data; to be checked using target filtering.
    :param source_output: Name of the source output file after target filtering.
    :param target_output: Name of the target output file after target filtering.
    :param gender: Indicated gender to filter the data for.
    :return: Total count of sentence pairs read from the input, and count of sentence pairs kept after filtering.
    :raises ValueError: If the source and target input files do not contain the same number of lines.
    """
    count_total, count_kept = 0, 0
    # explicit UTF-8 so that reading/writing does not depend on the platform's default locale encoding
    with open(source_input, 'r', encoding='utf-8') as in_src, \
            open(target_input, 'r', encoding='utf-8') as in_trg, \
            open(source_output, 'w', encoding='utf-8') as out_src, \
            open(target_output, 'w', encoding='utf-8') as out_trg:
        for src_line, trg_line in zip_longest(in_src, in_trg):
            # zip_longest pads the shorter file with None; report the misalignment explicitly
            # instead of failing later with an AttributeError on None.split()
            if src_line is None or trg_line is None:
                shorter = source_input if src_line is None else target_input
                raise ValueError(
                    f'Parallel files are not aligned: {shorter} ended after {count_total} lines '
                    f'while the other input file has more lines')
            count_total += 1
            if count_total % 10000 == 0:
                sys.stderr.write(f'Processing line {count_total} from {target_input}\n')
            # heuristic: ignore translations that just repeat same tokens multiple times,
            # based on the number of tokens in translation
            if len(src_line.split()) * 2 < len(trg_line.split()):
                continue
            # only keep the sentence pair if the target sentence matches the indicated gender
            if self._matches_gender(trg_line, gender):
                count_kept += 1
                out_src.write(src_line)
                out_trg.write(trg_line)
    return count_total, count_kept