in simulation/decai/simulation/data/news_data_loader.py [0:0]
def load_data(self, train_size: int = None, test_size: int = None) -> \
        Tuple[Tuple[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray]]:
    """
    Load the news data, preferring an on-disk cache when it is fresh.

    :param train_size: Number of training samples. Defaults to
        ``self._train_split`` of the data, or the remainder after
        ``test_size`` when only ``test_size`` is given.
    :param test_size: Number of test samples. Defaults to the remainder
        after ``train_size``.
    :return: ``((x_train, y_train), (x_test, y_test))`` as NumPy arrays.
    :raises ValueError: If ``train_size + test_size`` exceeds the data size.
    """
    self._logger.info("Loading news data.")
    # BUG FIX: the previous code joined '..' segments directly onto __file__,
    # producing a path that traverses *through* a regular file
    # (".../news_data_loader.py/.."), which fails with NotADirectoryError on
    # POSIX if used unnormalized. Anchor at the containing directory and
    # normalize; the resolved target is the same.
    data_folder_path = os.path.normpath(os.path.join(
        os.path.dirname(__file__), '..', '..', '..', 'training_data', 'news'))
    # Cache files are keyed on the requested sizes and pre-processing options,
    # so differently-sized requests never collide.
    file_identifier = f'news-data-{train_size}-{test_size}-replace_ents_{self._replace_entities_enabled}.npy'
    base_path = Path(os.path.dirname(__file__)) / 'cached_data'
    os.makedirs(base_path, exist_ok=True)
    cache_paths = {
        'x_train': base_path / f'x_train-{file_identifier}',
        'y_train': base_path / f'y_train-{file_identifier}',
        'x_test': base_path / f'x_test-{file_identifier}',
        'y_test': base_path / f'y_test-{file_identifier}',
    }
    # Use the cache only if every file exists and was written within the last
    # day (hoist time.time() so the freshness reference is a single instant).
    now = time.time()
    max_age_seconds = 60 * 60 * 24
    if all(p.exists() and now - p.stat().st_mtime < max_age_seconds
           for p in cache_paths.values()):
        self._logger.info("Loaded cached News data from %s.", cache_paths)
        return (np.load(cache_paths['x_train']), np.load(cache_paths['y_train'])), \
               (np.load(cache_paths['x_test']), np.load(cache_paths['y_test']))
    data = self._load_kaggle_data(data_folder_path)
    # Fill in whichever sizes were not supplied. When neither is given, the
    # configured train split decides; otherwise the remainder is used.
    if train_size is None:
        if test_size is None:
            train_size = int(self._train_split * len(data))
        else:
            train_size = len(data) - test_size
    if test_size is None:
        test_size = len(data) - train_size
    if train_size + test_size > len(data):
        # ValueError is more precise than a bare Exception and remains
        # backward compatible for callers catching Exception.
        raise ValueError("There is not enough data for the requested sizes."
                         f"\n data size: {len(data)}"
                         f"\n train size: {train_size}"
                         f"\n test size: {test_size}")
    (x_train, y_train), (x_test, y_test) = self._pre_process(data, train_size, test_size)
    # allow_pickle=False keeps the cached files safe to load later.
    np.save(cache_paths['x_train'], x_train, allow_pickle=False)
    np.save(cache_paths['y_train'], y_train, allow_pickle=False)
    np.save(cache_paths['x_test'], x_test, allow_pickle=False)
    np.save(cache_paths['y_test'], y_test, allow_pickle=False)
    self._logger.info("Done loading news data.")
    return (x_train, y_train), (x_test, y_test)