in src/target_filter.py [0:0]
def main():
    """Command-line entry point for target-side gender filtering.

    Parses the command-line arguments, fills in default output paths
    (`<input>.target_filtered`) when none are given, picks the morphological
    filterer matching the requested target language, runs its
    `target_filter` method on the parallel input files, and reports the
    number of lines read and written on stderr.

    Raises:
        NotImplementedError: if no filterer is registered for the requested
            target language.
    """
    # --- command-line interface -------------------------------------------
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        '--source-input', '-s', type=str, required=True,
        help='Source input file for target-filtering. Should be parallel to `target_input` and '
             'contain one sentence per line.',
    )
    arg_parser.add_argument(
        '--target-input', '-t', type=str, required=True,
        help='Target input file for target-filtering. Should be parallel to `source_input` and '
             'contain one sentence per line.',
    )
    arg_parser.add_argument(
        '--target-language', '-l', type=str, required=True, choices=SUPPORTED_LANGUAGES,
        help='Two-letter code for target language.',
    )
    arg_parser.add_argument(
        '--gender', '-g', type=str, required=True, choices=SUPPORTED_GENDERS,
        help="Gender to filter for on the target side.",
    )
    arg_parser.add_argument(
        '--source-output', '-so', type=str, required=False, default=None,
        help='Output file for the source side of the target-filtered corpus. '
             'Default: `[source_input].target_filtered`.',
    )
    arg_parser.add_argument(
        '--target-output', '-to', type=str, required=False, default=None,
        help='Output file for the target side of the target_filtered corpus. '
             'Default: `[target_input].target_filtered`.',
    )
    args = arg_parser.parse_args()

    # Output paths fall back to the corresponding input path plus a suffix.
    src_out_path = (
        args.source_output
        if args.source_output is not None
        else f'{args.source_input}.target_filtered'
    )
    tgt_out_path = (
        args.target_output
        if args.target_output is not None
        else f'{args.target_input}.target_filtered'
    )

    # --- choose the filterer for the target language ------------------------
    # For each sentence pair, the filterer checks whether all morphologically
    # gendered words in the target sentence match the specified gender; pairs
    # that match are written out, the others are dropped.
    #
    # Every entry is a lambda so that only the class for the selected language
    # is looked up and instantiated.
    target_lang = args.target_language
    filterer_factories = {
        "de": lambda: GermanMorphFilterer(),
        "fr": lambda: SpacyMorphFilterer(lang=target_lang),
        "it": lambda: SpacyMorphFilterer(lang=target_lang),
        "he": lambda: HebrewMorphFilterer(),
        "ru": lambda: RussianMorphFilterer(),
    }
    make_filterer = filterer_factories.get(target_lang)
    if make_filterer is None:
        raise NotImplementedError(
            f'Unrecognized language {target_lang}. Supported languages: {SUPPORTED_LANGUAGES}'
        )
    morph_filterer = make_filterer()

    # --- run the filter and report line counts ------------------------------
    count_total, count_kept = morph_filterer.target_filter(
        args.source_input,
        args.target_input,
        src_out_path,
        tgt_out_path,
        args.gender,
    )
    report = sys.stderr.write
    report(f'Read {count_total} lines from {args.source_input} and {args.target_input}\n')
    report(f'Wrote {count_kept} lines to {src_out_path} and {tgt_out_path} for gender {args.gender}\n')