in simulation/decai/simulation/data/news_data_loader.py [0:0]
def _pre_process(self, news_articles: Collection[News], train_size: int, test_size: int) -> \
        Tuple[Tuple[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray]]:
    """
    Vectorize news articles into train/test feature matrices with matching label vectors.

    The first `train_size` articles become the training split; the last `test_size`
    articles become the test split. Texts are optionally entity-replaced via the
    spaCy pipeline (when `self._replace_entities_enabled`) or cleaned with
    `self._pre_process_text`, then turned into bigram count features.

    :param news_articles: The full collection of articles to draw both splits from.
    :param train_size: Number of leading articles to use for training.
    :param test_size: Number of trailing articles to use for testing.
    :return: ((x_train, y_train), (x_test, y_test)) as dense numpy arrays.
    """
    self._logger.info("Getting features for %d articles.", len(news_articles))
    # Vocabulary is restricted to bigrams only.
    ngram_range = (2, 2)
    # norm=None with use_idf=False makes the vectorizer emit raw (integer) term
    # counts rather than IDF-weighted floats.
    vectorizer = TfidfVectorizer(max_features=1000, ngram_range=ngram_range, norm=None, use_idf=False)

    test_start = len(news_articles) - test_size
    # Lazy text streams for each split; consumed exactly once by the vectorizer below.
    train_texts = (article.text for article in itertools.islice(news_articles, train_size))
    test_texts = (article.text for article in
                  itertools.islice(news_articles, test_start, len(news_articles)))
    if self._replace_entities_enabled:
        self._logger.debug("Will replace entities.")
        train_texts = self._nlp.pipe(train_texts, batch_size=128)
        test_texts = self._nlp.pipe(test_texts, batch_size=128)
    else:
        self._logger.debug("Replacing entities is disabled.")
        train_texts = map(self._pre_process_text, train_texts)
        test_texts = map(self._pre_process_text, test_texts)

    # Shared tqdm progress-bar options for both splits.
    progress_opts = dict(unit_scale=True, mininterval=2, unit=" articles")
    x_train = vectorizer.fit_transform(tqdm(train_texts,
                                            desc="Processing training data",
                                            total=train_size,
                                            **progress_opts)).toarray()
    x_test = vectorizer.transform(tqdm(test_texts,
                                       desc="Processing testing data",
                                       total=test_size,
                                       **progress_opts)).toarray()

    # Labels are re-sliced from the source collection in the same order as the features.
    y_train = np.array([article.label.value
                        for article in itertools.islice(news_articles, train_size)], np.int8)
    y_test = np.array([article.label.value
                       for article in itertools.islice(news_articles, test_start, len(news_articles))],
                      np.int8)
    self._logger.debug("Training labels: %s", Counter(y_train))
    self._logger.debug("Test labels: %s", Counter(y_test))
    self._logger.info("Done getting features.")
    return (x_train, y_train), (x_test, y_test)