in src/source_filter.py [0:0]
def main():
    """Split a one-sentence-per-line file into gender-specific output files.

    Command-line arguments:
        --input / -i: path of the file to filter (required).
        --feminine-output / -f: destination for feminine-specific lines;
            defaults to ``<input>.<FEM_LABEL>``.
        --masculine-output / -m: destination for masculine-specific lines;
            defaults to ``<input>.<MSC_LABEL>``.

    Every input line is classified with ``_get_gender``. Lines labelled
    ``FEM_LABEL`` or ``MSC_LABEL`` are copied verbatim to the matching output
    file; lines with any other label are dropped. Lines longer than 1000
    characters are skipped without being classified. Progress messages and
    the final counts are written to stderr.

    All files are read and written as UTF-8 so results do not depend on the
    locale of the machine running the script. The program exits through
    ``parser.error`` if an output path resolves to the input file, because
    opening it for writing would truncate the input before it is read.
    """
    # Local import so this function does not rely on `os` being imported at
    # module level.
    import os

    # read in arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', '-i', type=str, required=True,
                        help='Input file to be source filtered. Should contain one sentence per line.')
    parser.add_argument('--feminine-output', '-f', type=str, required=False, default=None,
                        help='Output file for feminine-specific sentences. Default: `[input].fem`.')
    parser.add_argument('--masculine-output', '-m', type=str, required=False, default=None,
                        help='Output file for masculine-specific sentences. Default: `[input].msc`.')
    args = parser.parse_args()

    feminine_output, masculine_output = args.feminine_output, args.masculine_output
    if feminine_output is None:
        feminine_output = f'{args.input}.{FEM_LABEL}'
    if masculine_output is None:
        masculine_output = f'{args.input}.{MSC_LABEL}'

    # Opening an output with mode 'w' truncates it immediately, so an output
    # path that points at the input would destroy the data before reading it.
    input_path = os.path.realpath(args.input)
    for out_path in (feminine_output, masculine_output):
        if os.path.realpath(out_path) == input_path:
            parser.error(f'Output file {out_path} would overwrite the input file {args.input}.')

    # Length is measured with len(line), i.e. in characters and including the
    # trailing newline.
    max_line_length = 1000

    # for each sentence in the input file, check if it is gender-specific and if so output to the relevant file.
    # we define feminine-specific sentences as sentences containing at least one feminine pronoun and no masculine
    # words; masculine-specific is defined similarly.
    count_fem, count_msc, count_total = 0, 0, 0
    with open(args.input, 'r', encoding='utf-8') as infile, \
            open(feminine_output, 'w', encoding='utf-8') as fem_out, \
            open(masculine_output, 'w', encoding='utf-8') as msc_out:
        # count_total keeps its initial 0 when the input file is empty.
        for count_total, line in enumerate(infile, start=1):
            if count_total % 10000 == 0:
                sys.stderr.write(f'Processing line {count_total} from {args.input}\n')
            # for efficiency, skip very long lines
            if len(line) > max_line_length:
                continue
            # note that lines classified as "other" are ignored
            gender = _get_gender(line)
            if gender == FEM_LABEL:
                count_fem += 1
                fem_out.write(line)
            elif gender == MSC_LABEL:
                count_msc += 1
                msc_out.write(line)

    sys.stderr.write(f'Read {count_total} lines from {args.input}\n')
    sys.stderr.write(f'Wrote {count_fem} feminine lines to {feminine_output}\n')
    sys.stderr.write(f'Wrote {count_msc} masculine lines to {masculine_output}\n')