recipes/lexicon_free/utilities/utils.py (65 lines of code) (raw):

""" Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. This source code is licensed under the MIT-style license found in the LICENSE file in the root directory of this source tree. """ from __future__ import absolute_import, division, print_function, unicode_literals import re import numpy EOS = "</s>" def convert_words_to_letters_asg_rep2(fin_name, fout_name): with open(fin_name, "r") as fin, open(fout_name, "w") as fout: for line in fin: words = line.strip().split(" ") for word in words: word = re.sub("[^a-z'.]+", "", word) if len(word) == 0: continue new_word = transform_asg(word) + "|" fout.write(" ".join(list(new_word)) + " ") fout.write("\n") def transform_asg(word): if word == "": return "" new_word = word[0] prev = word[0] repetition = 0 for letter in word[1:]: if letter == prev: repetition += 1 else: if repetition != 0: new_word += "1" if repetition == 1 else "2" repetition = 0 new_word += letter prev = letter if repetition != 0: new_word += "1" if repetition == 1 else "2" return new_word def transform_asg_back(word): new_word = "" for letter in word: if letter == "|": continue if letter == "1": new_word += new_word[-1] elif letter == "2": new_word += new_word[-1] + new_word[-1] else: new_word += letter return new_word def prepare_vocabs(path): # read dictionary of words with open(path, "r") as f: words = f.readline().strip().split(" ") words = [re.sub("[^a-z'.]+", "", word) for word in words] known_words = set(list(map(lambda x: transform_asg(x) + "|", words))) - {""} words.append("</s>") known_words_original = set(words) - {""} known_words_original = numpy.array(list(known_words_original)) return known_words, known_words_original def prepare_vocabs_convlm(path): # read dictionary of words words = [] with open(path, "r") as f: for line in f: word = line.strip().split(" ")[0] words.append(re.sub("[^a-z'.]+", "", word)) known_words = set(list(map(lambda x: transform_asg(x) + "|", words))) - {""} words.append("</s>") known_words_original = set(words) - {""} known_words_original = numpy.array(list(known_words_original)) return known_words, known_words_original