recipes/self_training/pseudo_labeling/synthetic_lexicon_utils.py (42 lines of code) (raw):

from __future__ import absolute_import, division, print_function, unicode_literals

import itertools


class LexiconEntry(object):
    """
    A single word of the lexicon together with its candidate spellings.

    ``sorted_spellings`` is an ordered list; this class never re-sorts it, so
    the order is whatever the caller supplies (and whatever
    ``combine_entries`` produces). The helpers in this module treat each
    spelling as a sequence of string tokens.
    """

    def __init__(self, word, sorted_spellings):
        self.word = word
        # Stored by reference (not copied): add_spelling mutates this list.
        self.sorted_spellings = sorted_spellings

    def add_spelling(self, spelling):
        """Append ``spelling`` after all existing spellings."""
        self.sorted_spellings.append(spelling)

    def combine_entries(self, other):
        """
        Merge the spellings of ``other`` into this entry, in place.

        The two lists are walked position by position. When both entries hold
        the same spelling at a position it is kept once; otherwise this
        entry's spelling comes first, followed by ``other``'s. Once the
        shorter list runs out, the remainder of the longer one is appended.
        Spellings that appear at different positions in the two lists are not
        de-duplicated. ``other`` is left unmodified.
        """
        new_spellings = []
        for this, that in itertools.zip_longest(
            self.sorted_spellings, other.sorted_spellings
        ):
            if this == that:
                new_spellings.append(this)
                continue
            # zip_longest pads the shorter list with None. Compare against
            # None explicitly so that a falsy-but-real spelling (e.g. an
            # empty list) is not mistaken for padding and silently dropped.
            if this is not None:
                new_spellings.append(this)
            if that is not None:
                new_spellings.append(that)
        self.sorted_spellings = new_spellings


def write_spellings_to_file(spellings, outfile):
    """
    Write an iterable of LexiconEntry objects to ``outfile`` in lexicon format.

    Entries are written in ascending order of ``word``. Each spelling of an
    entry gets its own line of the form ``<word> <tok1> <tok2> ...``, in the
    order the spellings are stored on the entry. An existing file at
    ``outfile`` is overwritten.
    """
    sorted_spellings = sorted(spellings, key=lambda spelling: spelling.word)
    with open(outfile, "w") as o:
        for entry in sorted_spellings:
            word = entry.word.strip()
            for spelling in entry.sorted_spellings:
                o.write(word + " " + " ".join(spelling).strip() + "\n")


def read_spellings_from_file(infile):
    """
    Read a lexicon file and return its entries as a list of LexiconEntry.

    Each line is split at its first space: the text before it is the word and
    the rest, split on single spaces, is one spelling of that word. Lines that
    share a word are collected into one entry, with spellings kept in file
    order. Blank or whitespace-only lines are skipped. A line without any
    space yields the word with itself as the single spelling token, whether or
    not the line ends with a newline.

    The returned list is sorted by word.
    """
    spellings = {}  # maps word (str) to its LexiconEntry
    with open(infile, "r") as handle:
        for raw_line in handle:
            # Drop only the line terminator up front, so the split below
            # never depends on whether the last line of the file has one.
            line = raw_line.rstrip("\n")
            if not line.strip():
                continue
            word, sep, rest = line.partition(" ")
            word = word.strip()
            spelling = rest.strip().split(" ") if sep else [word]
            if word not in spellings:
                spellings[word] = LexiconEntry(word, [])
            spellings[word].add_spelling(spelling)
    return [spellings[key] for key in sorted(spellings)]