in torchmoji/create_vocab.py [0:0]
def populate_master_vocab(self, vocab_path, min_words=1, force_appearance=None):
    """ Populates the master vocabulary using all vocabularies found in the
        given path. Vocabularies should be named *.npz. Expects the
        vocabularies to be numpy arrays with counts. Normalizes the counts
        and combines them.

    # Arguments:
        vocab_path: Path containing vocabularies to be combined.
        min_words: Minimum amount of occurences a word must have in order
            to be included in the master vocabulary.
        force_appearance: Optional vocabulary filename that will be added
            to the master vocabulary no matter what. This vocabulary must
            be present in vocab_path.

    # Raises:
        ValueError: If force_appearance is given but no vocabulary file in
            vocab_path matches it.
    """
    paths = glob.glob(vocab_path + '*.npz')
    sizes = {path: 0 for path in paths}
    dicts = {path: {} for path in paths}

    # Set up and get sizes of individual dictionaries, dropping rare words
    # and special tokens so they never enter the master vocabulary.
    for path in paths:
        np_data = np.load(path)['data']
        for entry in np_data:
            word, count = entry
            if count < min_words:
                continue
            if is_special_token(word):
                continue
            dicts[path][word] = count

        sizes[path] = sum(dicts[path].values())
        print('Overall word count for {} -> {}'.format(path, sizes[path]))
        print('Overall word number for {} -> {}'.format(path, len(dicts[path])))

    vocab_of_max_size = max(sizes, key=sizes.get)
    max_size = sizes[vocab_of_max_size]
    # NOTE: label fixed -- these are all sizes, the largest vocab, and its size.
    print('Sizes: {}, largest vocab: {}, max size: {}'.format(
        sizes, vocab_of_max_size, max_size))

    # Can force one vocabulary to always be present in the master vocab.
    if force_appearance is not None:
        matches = [p for p in paths if force_appearance in p]
        if not matches:
            raise ValueError('force_appearance vocabulary {!r} not found '
                             'in {}'.format(force_appearance, vocab_path))
        force_appearance_path = matches[0]
        force_appearance_vocab = deepcopy(dicts[force_appearance_path])
        print(force_appearance_path)
    else:
        force_appearance_path, force_appearance_vocab = None, None

    # Normalize word counts before inserting into the master dict so every
    # corpus contributes proportionally to the largest one.
    for path in paths:
        if sizes[path] == 0:
            # Vocabulary emptied by the min_words/special-token filter:
            # nothing to contribute (also avoids a ZeroDivisionError).
            continue
        normalization_factor = max_size / sizes[path]
        print('Norm factor for path {} -> {}'.format(path, normalization_factor))

        for word in dicts[path]:
            if is_special_token(word):
                print("SPECIAL - ", word)
                continue
            # When a vocabulary is forced, only words it contains may enter
            # the master vocabulary.
            if (force_appearance_vocab is not None
                    and word not in force_appearance_vocab):
                continue
            normalized_count = dicts[path][word] * normalization_factor
            self.master_vocab[word] = (
                self.master_vocab.get(word, 0) + normalized_count)

    print('Size of master_dict {}'.format(len(self.master_vocab)))
    print("Hashes for master dict: {}".format(
        len([w for w in self.master_vocab if '#' in w[0]])))