in data.py
def tokenize(self, path):
    """Tokenizes a text file into a flat tensor of token ids."""
    # Requires module-level `import os` and `import torch` in data.py.
    print("tokenizing " + path)
    assert os.path.exists(path)
    # First pass: count tokens so the id tensor can be preallocated.
    with open(path, "r", encoding="utf8") as f:
        tokens = 0
        for line in f:
            words = self._split_line(line)
            if self.include_eos:
                words += ["<eos>"]
            tokens += len(words)
    # Second pass: fill the preallocated tensor with dictionary indices.
    ids = torch.IntTensor(tokens)
    with open(path, "r", encoding="utf8") as f:
        token = 0
        for line in f:
            words = self._split_line(line)
            if self.include_eos:
                words += ["<eos>"]
            for word in words:
                ids[token] = self.dictionary.getidx(word)
                token += 1
    return ids
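
tokenize relies on a _split_line helper, an include_eos flag, and a dictionary object exposing getidx. None of that scaffolding appears in this excerpt, so the sketch below is an assumption about roughly what the rest of data.py provides, modeled on the common word-level corpus pattern; only the names referenced by tokenize itself come from the code above, everything else is illustrative.

import os

import torch


class Dictionary:
    """Assumed minimal word <-> id mapping backing getidx()."""

    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def getidx(self, word):
        # Assumption: unseen words are added and assigned the next free id.
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]


class Corpus:
    """Assumed host class for tokenize(); only the pieces it touches."""

    def __init__(self, include_eos=True):
        self.dictionary = Dictionary()
        self.include_eos = include_eos

    def _split_line(self, line):
        # Assumption: plain whitespace splitting.
        return line.split()

    # tokenize(self, path) as defined above would live here.

Under those assumptions, usage would look like:

corpus = Corpus(include_eos=True)
ids = corpus.tokenize("data/train.txt")  # 1-D tensor, one id per token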