def target_filter()

in src/morphological_filtering.py [0:0]


    def target_filter(self, source_input: str, target_input: str, source_output: str, target_output: str, gender: str) \
            -> Tuple[int, int]:
        """
        Run morphological target filtering for a parallel dataset.

        The two input files are read in lockstep, one sentence pair per line. A pair is written to the output
        files only if it passes the token-count heuristic and its target sentence matches the indicated gender.

        :param source_input: Name of the source file for the input parallel data.
        :param target_input: Name of the target file for the input parallel data; to be checked using target filtering.
        :param source_output: Name of the source output file after target filtering.
        :param target_output: Name of the target output file after target filtering.
        :param gender: Indicated gender to filter the data for.
        :return: Total count of sentences in the file, and count of sentences kept after filtering.
        :raises ValueError: If the source and target input files do not contain the same number of lines.
            Sentence pairs kept before the mismatch was reached have already been written to the output files.
        """
        count_total, count_kept = 0, 0
        # NOTE: no explicit encoding is passed, so all four files use the platform default text encoding.
        with open(source_input, 'r') as in_src, open(target_input, 'r') as in_trg, \
                open(source_output, 'w') as out_src, open(target_output, 'w') as out_trg:
            for src_line, trg_line in zip_longest(in_src, in_trg):
                # zip_longest pads the shorter file with None; a missing line means the inputs are not
                # parallel, so fail with a clear message instead of an AttributeError on None.split()
                if src_line is None or trg_line is None:
                    shorter = source_input if src_line is None else target_input
                    raise ValueError(
                        f'Parallel files {source_input} and {target_input} differ in length: '
                        f'{shorter} ended after {count_total} lines')
                count_total += 1
                if count_total % 10000 == 0:
                    sys.stderr.write(f'Processing line {count_total} from {target_input}\n')
                # heuristic: ignore translations that just repeat same tokens multiple times,
                # based on the number of tokens in translation (target has more than twice the source tokens)
                if len(src_line.split()) * 2 < len(trg_line.split()):
                    continue
                # only keep the sentence pair if the target sentence matches the indicated gender
                if self._matches_gender(trg_line, gender):
                    count_kept += 1
                    out_src.write(src_line)
                    out_trg.write(trg_line)
        return count_total, count_kept