def load_data()

in simulation/decai/simulation/data/offensive_data_loader.py


    def load_data(self, train_size: int = None, test_size: int = None) -> (Tuple, Tuple):
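        """
        Load the labeled hate speech / offensive language tweets, downloading the CSV on first use,
        and convert each tweet into sparse token-count features restricted to the top TF-IDF tokens.

        :param train_size: Number of training samples, or None to derive it from the configured train split.
        :param test_size: Number of test samples, or None to use the remaining samples.
        :return: ((x_train, y_train), (x_test, y_test)) with sparse feature matrices and NumPy label arrays.
        """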
        self._logger.info("Loading data.")

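        # Resolve the dataset directory relative to this module.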
        data_folder_path = Path(__file__,
                                '../../../../training_data/offensive/hate-speech-and-offensive-language').resolve()

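        # Cap the number of rows to load when both split sizes are given.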
        if train_size is not None and test_size is not None:
            max_num_samples = train_size + test_size
        else:
            max_num_samples = None
        data_path = data_folder_path / 'labeled_data.csv'

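        # Download the labeled data CSV on first use.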
        if not data_path.exists():
            data_url = 'https://github.com/t-davidson/hate-speech-and-offensive-language/raw/master/data/labeled_data.csv'
            self._logger.info("Downloading data from \"%s\" to \"%s\".", data_url, data_path)
            r = requests.get(data_url, allow_redirects=True)
            r.raise_for_status()
            os.makedirs(data_folder_path, exist_ok=True)
            with open(data_path, 'wb') as f:
                f.write(r.content)

        loaded_data = pd.read_csv(data_path)

        data = []
        labels = []
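        # itertuples() yields the DataFrame index at position 0, so the column index is shifted by one.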
        class_index = list(loaded_data.columns).index('class') + 1
        assert class_index > 0
        for row in tqdm(loaded_data.itertuples(),
                        desc="Loading data",
                        unit_scale=True, mininterval=2, unit=" samples",
                        total=max_num_samples or len(loaded_data),
                        ):
            # Stop once the requested number of samples has been collected.
            if max_num_samples is not None and len(data) >= max_num_samples:
                break
            text = row.tweet
            text = self._pre_process(text)
            data.append(text)
            labels.append(self._class_mapping[row[class_index]])

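        # Fill in whichever split size was not given, falling back to the configured train split.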
        if train_size is None:
            if test_size is None:
                train_size = int(self._train_split * len(data))
            else:
                train_size = len(data) - test_size
        if test_size is None:
            test_size = len(data) - train_size

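        # Shuffle deterministically so the split is reproducible; the first train_size texts form the training set.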
        data, labels = shuffle(data, labels, random_state=self._seed)
        x_train = itertools.islice(data, train_size)

        # Compute the top token features by fitting a TF-IDF vectorizer (capped at max_num_features) on the training texts.
        t = TfidfVectorizer(max_features=self.max_num_features, norm=None)
        t.fit(tqdm(x_train,
                   desc="Computing top token features",
                   total=train_size,
                   unit_scale=True, mininterval=2,
                   unit=" texts"
                   ))
        top_tokens = t.get_feature_names()
        self._logger.debug("Some top feature names: %s", top_tokens[:30])

        tokenize = t.build_analyzer()
        feature_tokens = set(top_tokens)

        def _featurize(text: str) -> Dict[int, int]:
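            # Map each token that survived feature selection to its hashed id and its count in the text.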
            result = Counter(tokenize(text))

            return {self._token_hash.hash(token): count
                    for token, count in result.items()
                    if token in feature_tokens}

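        # Featurize the leading train_size texts for training; the trailing test_size texts form the test set.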
        x_train = map(_featurize, itertools.islice(data, train_size))
        x_train = self._build_sparse_matrix(x_train)
        y_train = np.array(labels[:train_size])

        x_test = map(_featurize, itertools.islice(data, len(data) - test_size, len(data)))
        # TODO Might have to make sure it has the same number of columns as x_train.
        x_test = self._build_sparse_matrix(x_test)
        y_test = np.array(labels[-test_size:])

        self._logger.info("Done loading data.")
        return (x_train, y_train), (x_test, y_test)
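
A minimal usage sketch (hypothetical: assumes an already-constructed instance of the surrounding loader class, referred to here as loader, with illustrative split sizes):

    # Hypothetical usage: `loader` is a constructed instance of the class that defines load_data().
    (x_train, y_train), (x_test, y_test) = loader.load_data(train_size=10_000, test_size=2_000)
    # x_train and x_test are sparse feature matrices; y_train and y_test are NumPy label arrays.
    print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)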