def main()

in floresv1/scripts/indic_norm_tok.py [0:0]


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--indic-nlp-path", required=True,
                        help="path to Indic NLP Library root")
    parser.add_argument("--language", required=True)
    parser.add_argument("--remove-nuktas", default=False, action="store_true")
    parser.add_argument("input", help="input file; use - for stdin")
    args = parser.parse_args()

    try:
        sys.path.extend([
            args.indic_nlp_path,
            os.path.join(args.indic_nlp_path, "src"),
        ])
        from indicnlp.tokenize import indic_tokenize
        from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
    except:
        raise Exception(
            "Cannot load Indic NLP Library, make sure --indic-nlp-path is correct"
        )

    # create normalizer
    factory = IndicNormalizerFactory()
    normalizer = factory.get_normalizer(
        args.language, remove_nuktas=args.remove_nuktas,
    )

    # normalize and tokenize
    for line in fileinput.input([args.input], openhook=fileinput.hook_compressed):
        line = normalizer.normalize(line.decode("utf-8"))
        line = " ".join(indic_tokenize.trivial_tokenize(line, args.language))
        sys.stdout.write(line.encode("utf-8"))