def _pre_process()

in simulation/decai/simulation/data/news_data_loader.py


    def _pre_process(self, news_articles: Collection[News], train_size: int, test_size: int) -> \
            Tuple[Tuple[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray]]:
        self._logger.info("Getting features for %d articles.", len(news_articles))
        # Only use bigram features.
        ngram_range = (2, 2)
        # Don't use IDF weighting or normalization because we need integer (count) features.
        t = TfidfVectorizer(max_features=1000, ngram_range=ngram_range, norm=None, use_idf=False)
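        # The first `train_size` articles form the training split; the last `test_size` articles form the test split.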
        test_start = len(news_articles) - test_size

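        # Lazily pull the raw text out of each article for both splits.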
        x_train = map(lambda news: news.text, itertools.islice(news_articles, train_size))
        x_test = map(lambda news: news.text, itertools.islice(news_articles, test_start, len(news_articles)))
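        # Optionally run the articles through the NLP pipeline so that _pre_process_text can replace named entities.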
        if self._replace_entities_enabled:
            self._logger.debug("Will replace entities.")
            x_train = self._nlp.pipe(x_train, batch_size=128)
            x_test = self._nlp.pipe(x_test, batch_size=128)
        else:
            self._logger.debug("Replacing entities is disabled.")

        x_train = map(self._pre_process_text, x_train)
        x_test = map(self._pre_process_text, x_test)

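        # Fit the vectorizer on the training split only, then reuse the learned vocabulary on the test split.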
        x_train = t.fit_transform(tqdm(x_train,
                                       desc="Processing training data",
                                       total=train_size,
                                       unit_scale=True, mininterval=2,
                                       unit=" articles"
                                       )).toarray()
        x_test = t.transform(tqdm(x_test,
                                  desc="Processing testing data",
                                  total=test_size,
                                  unit_scale=True, mininterval=2,
                                  unit=" articles"
                                  )).toarray()

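        # Labels come from the same slices used for the features above.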
        y_train = np.array([news.label.value for news in itertools.islice(news_articles, train_size)], np.int8)
        y_test = np.array([news.label.value for news in itertools.islice(news_articles,
                                                                         test_start, len(news_articles))], np.int8)
        self._logger.debug("Training labels: %s", Counter(y_train))
        self._logger.debug("Test labels: %s", Counter(y_test))
        self._logger.info("Done getting features.")
        return (x_train, y_train), (x_test, y_test)
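
As a point of reference, here is a minimal standalone sketch of the same vectorizer configuration (the toy texts are hypothetical, not from the data set). It illustrates why norm=None together with use_idf=False leaves the bigram features as plain integer counts rather than TF-IDF weights:

    from sklearn.feature_extraction.text import TfidfVectorizer

    # Hypothetical toy articles, only for illustration.
    train_texts = [
        "the economy is strong the economy is growing",
        "analysts said the economy is strong",
    ]
    test_texts = ["analysts said the economy is weak"]

    vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(2, 2), norm=None, use_idf=False)
    x_train = vectorizer.fit_transform(train_texts).toarray()
    x_test = vectorizer.transform(test_texts).toarray()

    print(vectorizer.get_feature_names_out())  # bigrams such as 'the economy', 'economy is', ...
    print(x_train)  # raw bigram counts (stored as floats), no IDF weighting or normalization

The same reasoning applies in _pre_process: fit_transform learns the bigram vocabulary from the training articles only, and transform maps the test articles onto that fixed vocabulary.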