def __init__()

in model/data_utils.py [0:0]


    def __init__(self, data_dir, cfg):
        """Load the music corpus
        Args:
            data_dir: The base folder of the preprocessed music dataset
        """
        self._vocab_path = os.path.join(data_dir, "vocab.txt")
        self._train_folder = os.path.join(data_dir, "train")
        self._valid_folder = os.path.join(data_dir, "valid")
        self._test_folder = os.path.join(data_dir, "test")
        all_tokens = []
        with open(self._vocab_path, "r") as f:
            for token in f:
                token = token.strip()
                all_tokens.append(token)
        self._vocab = BaseVocab(all_tokens)

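        # load_cache_data is assumed to return one numpy array of token ids per
        # piece in the given split folder.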
        self._train_data = self.load_cache_data(self._train_folder)
        self._valid_data = self.load_cache_data(self._valid_folder)
        self._test_data = self.load_cache_data(self._test_folder)
        self.cfg = cfg

        # Insert start tokens
        if self.cfg.TRAIN.replace_start_with_pad:
            print("USING PAD TOKEN AS START!")
            insert_token = self._vocab.pad_id
        else:
            insert_token = self._vocab.bos_id
        self._train_data = [
            torch.from_numpy(np.insert(arr, 0, insert_token))
            for arr in self._train_data
        ]
        self._valid_data = [
            torch.from_numpy(np.insert(arr, 0, insert_token))
            for arr in self._valid_data
        ]
        self._test_data = [
            torch.from_numpy(np.insert(arr, 0, insert_token))
            for arr in self._test_data
        ]

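        # Per-sequence lengths, counting the inserted start token.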
        self._train_seq_length = np.array(
            [ele.shape[0] for ele in self._train_data], dtype=np.int32
        )
        self._valid_seq_length = np.array(
            [ele.shape[0] for ele in self._valid_data], dtype=np.int32
        )
        self._test_seq_length = np.array(
            [ele.shape[0] for ele in self._test_data], dtype=np.int32
        )
        print(
            "Loaded Data, #Samples Train/Val/Test:{}/{}/{}".format(
                len(self._train_data), len(self._valid_data), len(self._test_data)
            )
        )
        print(
            "             #Avg Length:{}/{}/{}".format(
                np.mean([len(ele) for ele in self._train_data]),
                np.mean([len(ele) for ele in self._valid_data]),
                np.mean([len(ele) for ele in self._test_data]),
            )
        )
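        # Subtract 1 per sequence so the inserted start token is excluded from
        # the valid/test token counts.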
        print(
            "             #Total Number of Valid/Test Tokens: {}/{}".format(
                (self._valid_seq_length - 1).sum(), (self._test_seq_length - 1).sum()
            )
        )
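        # If note status is appended (cfg.TRAIN.append_note_status), build the
        # vocabulary's note mapping.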
        if cfg.TRAIN.append_note_status:
            self._vocab.notes_mapping()
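
A minimal usage sketch based only on this constructor: data_dir must contain vocab.txt plus train/, valid/ and test/ subfolders, and cfg must expose the two TRAIN flags read above. The class name (MusicCorpus) and the SimpleNamespace-based config are placeholders for illustration, not the repository's actual names.

    from types import SimpleNamespace

    from model.data_utils import MusicCorpus  # hypothetical class name for this __init__

    # Minimal config exposing only the fields this constructor reads.
    cfg = SimpleNamespace(
        TRAIN=SimpleNamespace(
            replace_start_with_pad=False,  # prepend BOS (True would prepend PAD instead)
            append_note_status=False,      # skip the vocab notes_mapping() step
        )
    )

    corpus = MusicCorpus("path/to/preprocessed_dataset", cfg)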