in torchbenchmark/util/torchtext_legacy/vocab.py [0:0]
def __init__(self, counter, max_size=None, min_freq=1, specials=('<unk>', '<pad>'),
             vectors=None, unk_init=None, vectors_cache=None, specials_first=True):
    """Build the vocabulary tables from a token-frequency counter.

    Populates ``freqs`` (the counter as passed in), ``itos`` (index -> token),
    ``stoi`` (token -> index), ``unk_index`` and ``vectors``.

    Args:
        counter: collections.Counter object mapping each token found in the
            data to its frequency. It is stored unchanged as ``freqs``; the
            filtering below works on a copy.
        max_size: Maximum number of non-special tokens to keep, or None
            for no limit. Special tokens never count against this limit,
            whether they are placed first or last. Default: None.
        min_freq: Minimum frequency a token needs to be included. Values
            less than 1 are raised to 1. Default: 1.
        specials: Sequence of special tokens (e.g., padding or eos) added
            to the vocabulary regardless of their frequency.
            Default: ('<unk>', '<pad>').
        vectors: One of either the available pretrained vectors or custom
            pretrained vectors (see Vocab.load_vectors), or a list of the
            aforementioned vectors. Forwarded to ``load_vectors`` when not
            None. Default: None.
        unk_init (callback): Forwarded to ``load_vectors``. By default,
            out-of-vocabulary word vectors are initialized to zero
            vectors; can be any function that takes in a Tensor and
            returns a Tensor of the same size. Must be None when
            ``vectors`` is None.
        vectors_cache: Directory for cached vectors, forwarded to
            ``load_vectors`` (documented default: '.vector_cache'). Must
            be None when ``vectors`` is None.
        specials_first: If True, the special tokens occupy the first
            indices of the vocabulary; if False, they are appended after
            all regular tokens. Default: True.
    """
    self.freqs = counter
    remaining = counter.copy()
    min_freq = max(min_freq, 1)

    self.unk_index = None
    if specials_first:
        self.itos = list(specials)
        # The size cap applies to regular tokens only, so make room for the
        # specials that already sit at the front of the list.
        if max_size is not None:
            max_size = max_size + len(specials)
    else:
        self.itos = list()

    # Specials get their slots unconditionally; drop them from the working
    # copy so they are not ranked (and added a second time) by frequency.
    for special in specials:
        del remaining[special]

    # Two-pass stable sort: alphabetical first, then by descending count,
    # so tokens with equal counts stay in alphabetical order.
    ranked = sorted(remaining.items(), key=lambda pair: pair[0])
    ranked.sort(key=lambda pair: pair[1], reverse=True)

    for token, count in ranked:
        if count < min_freq:
            # Counts are descending, so nothing further can qualify.
            break
        if len(self.itos) == max_size:
            break
        self.itos.append(token)

    if Vocab.UNK in specials:  # the unknown token is hard-coded for now
        unk_position = specials.index(Vocab.UNK)
        if specials_first:
            self.unk_index = unk_position
        else:
            # Specials are appended below, right after the regular tokens.
            self.unk_index = len(self.itos) + unk_position
        self.stoi = defaultdict(self._default_unk_index)
    else:
        # No default factory: looking up a missing token raises KeyError.
        self.stoi = defaultdict()

    if not specials_first:
        self.itos.extend(list(specials))

    # stoi is simply the inverse mapping of itos.
    self.stoi.update((token, index) for index, token in enumerate(self.itos))

    self.vectors = None
    if vectors is None:
        # Vector-related options are meaningless without vectors.
        assert unk_init is None and vectors_cache is None
    else:
        self.load_vectors(vectors, unk_init=unk_init, cache=vectors_cache)