def load_data()

in simulation/decai/simulation/data/news_data_loader.py [0:0]

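Loads the news training and test sets as NumPy arrays. Pre-processed splits are cached on disk, keyed by the requested split sizes and the entity-replacement setting; a cache less than a day old is reused.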

    def load_data(self, train_size: Optional[int] = None, test_size: Optional[int] = None) -> \
            Tuple[Tuple[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray]]:
        self._logger.info("Loading news data.")
        # Resolve the shared training data folder relative to this module's location.
        data_folder_path = os.path.join(os.path.dirname(__file__), '../../../training_data/news')

        # Look for cached data.
        file_identifier = f'news-data-{train_size}-{test_size}-replace_ents_{self._replace_entities_enabled}.npy'
        base_path = Path(os.path.dirname(__file__)) / 'cached_data'
        os.makedirs(base_path, exist_ok=True)
        cache_paths = {
            'x_train': base_path / f'x_train-{file_identifier}',
            'y_train': base_path / f'y_train-{file_identifier}',
            'x_test': base_path / f'x_test-{file_identifier}',
            'y_test': base_path / f'y_test-{file_identifier}'
        }
        # Use the cached arrays only if every file exists and was modified within the last day.
        if all(p.exists() for p in cache_paths.values()) and \
                all(time.time() - p.stat().st_mtime < 60 * 60 * 24 for p in cache_paths.values()):
            self._logger.info("Loaded cached news data from %s.", cache_paths)
            return (np.load(cache_paths['x_train']), np.load(cache_paths['y_train'])), \
                   (np.load(cache_paths['x_test']), np.load(cache_paths['y_test']))

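        # Cache miss (or stale cache): load the raw news data from disk.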
        data = self._load_kaggle_data(data_folder_path)

        # Split into train and test sets, filling in whichever size was not given.
        if train_size is None:
            if test_size is None:
                train_size = int(self._train_split * len(data))
            else:
                train_size = len(data) - test_size
        if test_size is None:
            test_size = len(data) - train_size
        if train_size + test_size > len(data):
            raise ValueError("There is not enough data for the requested sizes."
                             f"\n  data size: {len(data)}"
                             f"\n  train size: {train_size}"
                             f"\n  test size: {test_size}")

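        # Pre-process and split the data, then cache the resulting arrays for reuse.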
        (x_train, y_train), (x_test, y_test) = self._pre_process(data, train_size, test_size)
        np.save(cache_paths['x_train'], x_train, allow_pickle=False)
        np.save(cache_paths['y_train'], y_train, allow_pickle=False)
        np.save(cache_paths['x_test'], x_test, allow_pickle=False)
        np.save(cache_paths['y_test'], y_test, allow_pickle=False)
        self._logger.info("Done loading news data.")
        return (x_train, y_train), (x_test, y_test)
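
For reference, the one-day cache check above reduces to an mtime comparison against the current time. The sketch below restates that pattern as a self-contained helper; the helper name, the TTL constant, and the file names are illustrative and are not part of the loader.

    import time
    from pathlib import Path

    # One-day TTL, matching the 60 * 60 * 24 threshold used in load_data above.
    CACHE_TTL_SECONDS = 60 * 60 * 24

    def cache_is_fresh(paths) -> bool:
        # True only if every cache file exists and was modified within the TTL.
        now = time.time()
        return all(p.exists() and now - p.stat().st_mtime < CACHE_TTL_SECONDS for p in paths)

    # Illustrative paths; the real loader derives them from the split sizes and settings.
    print(cache_is_fresh([Path('x_train.npy'), Path('y_train.npy')]))

The helper folds the existence and freshness checks into one pass, which is equivalent to the two separate all(...) checks in the method above.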