def main()

in src/source_filter.py [0:0]


def main():
    """Split the gender-specific sentences of an input file into two output files.

    Command-line arguments:
        --input / -i: input text file, one sentence per line (required).
        --feminine-output / -f: destination for lines labelled ``FEM_LABEL``.
            Defaults to ``<input>.<FEM_LABEL>``.
        --masculine-output / -m: destination for lines labelled ``MSC_LABEL``.
            Defaults to ``<input>.<MSC_LABEL>``.

    Each line is classified with ``_get_gender``. Lines labelled ``FEM_LABEL``
    or ``MSC_LABEL`` are copied verbatim to the matching output file; every
    other line is dropped. Lines longer than 1000 characters (trailing newline
    included) are skipped without being classified, but still count towards
    the total. Progress and summary counts are written to stderr.

    All files are opened as UTF-8 so the result does not depend on the
    platform's default locale encoding.

    Exits through ``parser.error`` (status 2) if an output path refers to the
    same regular file as the input, because opening it for writing would
    truncate the input before it is read.
    """
    # local import: only needed for the input/output collision check below
    import os

    # read in arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', '-i', type=str, required=True,
                        help='Input file to be source filtered. Should contain one sentence per line.')
    parser.add_argument('--feminine-output', '-f', type=str, required=False, default=None,
                        help='Output file for feminine-specific sentences. Default: `[input].fem`.')
    parser.add_argument('--masculine-output', '-m', type=str, required=False, default=None,
                        help='Output file for masculine-specific sentences. Default: `[input].msc`.')
    args = parser.parse_args()
    feminine_output, masculine_output = args.feminine_output, args.masculine_output
    if feminine_output is None:
        feminine_output = f'{args.input}.{FEM_LABEL}'
    if masculine_output is None:
        masculine_output = f'{args.input}.{MSC_LABEL}'

    # refuse to overwrite the input: opening it with 'w' would empty it before a single line is read.
    # only regular files are compared, so device paths such as /dev/stdin and /dev/stdout are not rejected.
    if os.path.isfile(args.input):
        for output in (feminine_output, masculine_output):
            if os.path.isfile(output) and os.path.samefile(args.input, output):
                parser.error(f'output file {output} is the same file as the input {args.input}')

    # for each sentence in the input file, check if it is gender-specific and if so output to the relevant file.
    # we define feminine-specific sentences as sentences containing at least one feminine pronoun and no masculine
    # words; masculine-specific is defined similarly.
    count_fem, count_msc, count_total = 0, 0, 0
    with open(args.input, 'r', encoding='utf-8') as infile, \
            open(feminine_output, 'w', encoding='utf-8') as fem_out, \
            open(masculine_output, 'w', encoding='utf-8') as msc_out:
        # count_total keeps its initial 0 when the input file is empty
        for count_total, line in enumerate(infile, start=1):
            if count_total % 10000 == 0:
                sys.stderr.write(f'Processing line {count_total} from {args.input}\n')
            # for efficiency, skip very long lines
            if len(line) > 1000:
                continue
            # note that lines classified as "other" are ignored
            gender = _get_gender(line)
            if gender == FEM_LABEL:
                count_fem += 1
                fem_out.write(line)
            elif gender == MSC_LABEL:
                count_msc += 1
                msc_out.write(line)

    sys.stderr.write(f'Read {count_total} lines from {args.input}\n')
    sys.stderr.write(f'Wrote {count_fem} feminine lines to {feminine_output}\n')
    sys.stderr.write(f'Wrote {count_msc} masculine lines to {masculine_output}\n')