pipeline/clean/tools/langid_fasttext.py (24 lines of code) (raw):

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Prefix every input line with the language code predicted by fastText.
#
# Usage:
#   ./langid_fasttext.py < sents.txt > code-tab-sents.txt
#
# Installation:
#   pip3 install pybind11 fasttext --user
#
# Parallelize:
#   cat sents.txt | parallel --pipe -k -j16 --block 20M ./langid_fasttext.py > code-tab-sents.txt

import argparse
import os
import sys

import fasttext

BIN = "lid.176.bin"
URL = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/{}".format(BIN)

# Prefix that fastText puts in front of every predicted label.
_LABEL_PREFIX = "__label__"


def main():
    """Read lines from stdin and write ``<code>TAB<original line>`` to stdout.

    The text to classify is the tab-separated column selected by ``--field``.
    The model file is looked up next to this script and downloaded on first
    use. A line with fewer columns than ``--field`` requires raises
    ``IndexError``.
    """
    args = parse_user_args()

    mpath = os.path.join(os.path.dirname(os.path.realpath(__file__)), BIN)
    _ensure_model(mpath)

    model = fasttext.load_model(mpath)
    for line in sys.stdin:
        # Remove only the line terminator: stripping all surrounding
        # whitespace would drop leading tabs (i.e. empty leading columns) and
        # shift the index of the requested field.
        fields = line.rstrip("\r\n").split("\t")
        lid = model.predict(fields[args.field])
        # lid is (labels, probabilities); the first label is the best guess.
        sys.stdout.write("{}\t{}".format(_label_to_code(lid[0][0]), line))


def _ensure_model(mpath):
    """Download the model to ``mpath`` unless a file already exists there."""
    if os.path.exists(mpath):
        return

    sys.stderr.write("Downloading model {} ...\n".format(URL))
    import urllib.request

    # Download to a per-process temporary name and rename atomically, so an
    # interrupted download or a concurrently running instance never leaves a
    # truncated file at the final path.
    tmp_path = "{}.{}.tmp".format(mpath, os.getpid())
    try:
        urllib.request.urlretrieve(URL, tmp_path)
        os.replace(tmp_path, mpath)
    finally:
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


def _label_to_code(label):
    """Return the language code of a label such as ``__label__en``.

    The whole code after the prefix is kept; taking only the last two
    characters would turn three-letter codes into unrelated two-letter ones.
    """
    if label.startswith(_LABEL_PREFIX):
        return label[len(_LABEL_PREFIX):]
    return label


def parse_user_args():
    """Parse the command line and return the resulting namespace."""
    parser = argparse.ArgumentParser()
    parser.add_argument("-f", "--field", default=0, type=int, help="text field, default: 0")
    return parser.parse_args()


if __name__ == "__main__":
    main()