def tokenize()

in data.py


    # Note: data.py must import os and torch at module level for this method.
    def tokenize(self, path):
        """Tokenizes a text file into a flat 1-D tensor of token ids."""
        print("tokenizing " + path)
        assert os.path.exists(path)
        # First pass: count tokens so the id tensor can be sized up front
        with open(path, "r", encoding="utf8") as f:
            tokens = 0
            for line in f:
                words = self._split_line(line)
                if self.include_eos:
                    words += ["<eos>"]
                tokens += len(words)
        ids = torch.IntTensor(tokens)
        # Second pass: map each word (plus the optional <eos>) to its dictionary id
        with open(path, "r", encoding="utf8") as f:
            token = 0
            for line in f:
                words = self._split_line(line)
                if self.include_eos:
                    words += ["<eos>"]
                for word in words:
                    ids[token] = self.dictionary.getidx(word)
                    token += 1
        return ids
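
The method depends on two collaborators this snippet does not show: self._split_line and self.dictionary.getidx. The sketch below is a minimal, hypothetical scaffolding for running the method end to end; it assumes getidx assigns the next free id the first time a word is seen and that _split_line is plain whitespace splitting. Neither assumption is confirmed by this codebase, and the Dictionary/Corpus names here are placeholders.

    import os

    import torch


    class Dictionary:
        """Hypothetical word-to-id map matching the getidx() call above."""

        def __init__(self):
            self.word2idx = {}

        def getidx(self, word):
            # Assumption: unseen words receive the next free id
            if word not in self.word2idx:
                self.word2idx[word] = len(self.word2idx)
            return self.word2idx[word]


    class Corpus:
        """Hypothetical host class with only the attributes tokenize() touches."""

        def __init__(self, include_eos=True):
            self.dictionary = Dictionary()
            self.include_eos = include_eos

        def _split_line(self, line):
            # Assumption: whitespace tokenization (also strips the newline)
            return line.split()

        def tokenize(self, path):
            # Same two-pass logic as the data.py method above
            assert os.path.exists(path)
            with open(path, "r", encoding="utf8") as f:
                tokens = sum(
                    len(self._split_line(line)) + int(self.include_eos)
                    for line in f
                )
            ids = torch.IntTensor(tokens)
            with open(path, "r", encoding="utf8") as f:
                token = 0
                for line in f:
                    words = self._split_line(line)
                    if self.include_eos:
                        words += ["<eos>"]
                    for word in words:
                        ids[token] = self.dictionary.getidx(word)
                        token += 1
            return ids


    # Usage: two lines of text -> one flat tensor of ids
    with open("tiny.txt", "w", encoding="utf8") as f:
        f.write("hello world\nhello again\n")
    print(Corpus().tokenize("tiny.txt").tolist())  # [0, 1, 2, 0, 3, 2]

The two-pass design trades a second read of the file for knowing the exact token count before allocation, so the id tensor is sized once instead of being grown incrementally.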