in pytext/fields/text_field_with_special_unk.py [0:0]
def build_vocab(self, *args, min_freq=1, **kwargs):
    """
    Build ``self.vocab`` from one or more data sources, UNKifying rare tokens.

    The counting code is the same as torchtext.legacy.data.Field.build_vocab()
    up to the UNKification logic. The reason super().build_vocab() cannot be
    called is that the Counter object computed there is required for
    UNKification, and that object cannot be recovered after the
    super().build_vocab() call is made.

    Args:
        *args: Data sources. A ``torchtextdata.Dataset`` contributes every
            attribute whose field is this very field object; any other
            argument is iterated directly as a collection of examples.
        min_freq: When ``self.unkify_func`` is set, every token counted fewer
            than ``min_freq`` times is replaced by ``self.unkify_func(token)``
            (its count is carried over) before the vocabulary is built, and
            ``min_freq`` is not passed on. When ``self.unkify_func`` is not
            set, ``min_freq`` is forwarded to ``self.vocab_cls`` so the
            caller's threshold is not silently ignored.
        **kwargs: Remaining keyword arguments, passed to ``self.vocab_cls``.
    """
    counter = Counter()
    sources = []
    for arg in args:
        if isinstance(arg, torchtextdata.Dataset):
            sources += [
                getattr(arg, name)
                for name, field in arg.fields.items()
                if field is self
            ]
        else:
            sources.append(arg)
    for data in sources:
        for x in data:
            if not self.sequential:
                x = [x]
            # Numbers are kept out of the counts: all numbers are mapped to
            # self.unk_num_token, which is registered as a special below.
            x = [item for item in x if not is_number(item)]
            try:
                counter.update(x)
            except TypeError:
                # Counting failed on the items themselves (e.g. unhashable
                # nested lists); count their flattened elements instead.
                counter.update(chain.from_iterable(x))
    # OrderedDict.fromkeys drops duplicate special tokens while keeping the
    # first-seen order; unset (None) specials are skipped.
    specials = list(
        OrderedDict.fromkeys(
            tok
            for tok in [
                self.unk_token,
                self.pad_token,
                self.init_token,
                self.eos_token,
                self.unk_num_token,
            ]
            if tok is not None
        )
    )
    if self.unkify_func:
        # Special UNKification logic: fold each rare token's count into the
        # token returned by unkify_func. Adding the count directly avoids
        # building a list as long as the token's frequency.
        new_counter = Counter()
        for item, count in counter.items():
            new_item = self.unkify_func(item) if count < min_freq else item
            new_counter[new_item] += count
        counter = new_counter
    else:
        # No UNKification requested: let the vocab class apply the frequency
        # threshold itself instead of dropping the caller's min_freq.
        kwargs["min_freq"] = min_freq
    self.vocab = self.vocab_cls(counter, specials=specials, **kwargs)