in simulation/decai/simulation/data/offensive_data_loader.py [0:0]
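# The method below assumes the following names are imported at module level (not shown in
# this excerpt): itertools, os, requests, numpy as np, pandas as pd, Counter (collections),
# Path (pathlib), Dict / Optional / Tuple (typing), TfidfVectorizer
# (sklearn.feature_extraction.text), shuffle (sklearn.utils), and tqdm.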
def load_data(self, train_size: Optional[int] = None, test_size: Optional[int] = None) -> Tuple[Tuple, Tuple]:
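    """
    Load the hate-speech-and-offensive-language dataset, downloading the CSV if it is not
    already cached locally.

    :param train_size: If set, the number of samples to use for training.
    :param test_size: If set, the number of samples to use for testing.
    :return: ((x_train, y_train), (x_test, y_test)) where the x's are sparse matrices of
        hashed token counts and the y's are NumPy arrays of class labels.
    """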
self._logger.info("Loading data.")
data_folder_path = Path(__file__,
'../../../../training_data/offensive/hate-speech-and-offensive-language').resolve()
if train_size is not None and test_size is not None:
max_num_samples = train_size + test_size
else:
max_num_samples = None
data_path = data_folder_path / 'labeled_data.csv'
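    # Download the labeled CSV from the source repository on first use and cache it locally.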
    if not data_path.exists():
        data_url = 'https://github.com/t-davidson/hate-speech-and-offensive-language/raw/master/data/labeled_data.csv'
        self._logger.info("Downloading data from \"%s\" to \"%s\".", data_url, data_path)
        r = requests.get(data_url, allow_redirects=True)
        r.raise_for_status()
        os.makedirs(data_folder_path, exist_ok=True)
        with open(data_path, 'wb') as f:
            f.write(r.content)
    loaded_data = pd.read_csv(data_path)
    data = []
    labels = []
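    # pandas itertuples() yields the row index at position 0, so the 'class' column's
    # position is shifted by one.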
    class_index = list(loaded_data.columns).index('class') + 1
    assert class_index > 0
    for row in tqdm(loaded_data.itertuples(),
                    desc="Loading data",
                    unit_scale=True, mininterval=2, unit=" samples",
                    total=max_num_samples or len(loaded_data),
                    ):
        # Stop once the requested number of samples has been collected.
        if max_num_samples is not None and len(data) >= max_num_samples:
            break
        text = row.tweet
        text = self._pre_process(text)
        data.append(text)
        labels.append(self._class_mapping[row[class_index]])
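    # Fill in whichever of train_size / test_size was not provided, falling back to the
    # configured train split fraction.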
    if train_size is None:
        if test_size is None:
            train_size = int(self._train_split * len(data))
        else:
            train_size = len(data) - test_size
    if test_size is None:
        test_size = len(data) - train_size
    data, labels = shuffle(data, labels, random_state=self._seed)
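    # `x_train` here is a lazy iterator over the first `train_size` shuffled texts; it is
    # consumed once below to pick the vocabulary and rebuilt later for featurization.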
    x_train = itertools.islice(data, train_size)

    # Compute the top features.
    t = TfidfVectorizer(max_features=self.max_num_features, norm=None)
    t.fit(tqdm(x_train,
               desc="Computing top token features",
               total=train_size,
               unit_scale=True, mininterval=2,
               unit=" texts"))
    # Note: get_feature_names() was removed in newer scikit-learn; use get_feature_names_out() there.
    top_tokens = t.get_feature_names()
    self._logger.debug("Some top feature names: %s", top_tokens[:30])
    tokenize = t.build_analyzer()
    feature_tokens = set(top_tokens)
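
    # Convert a text into a sparse {hashed token: count} mapping, keeping only the selected
    # top tokens.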
    def _featurize(text: str) -> Dict[int, int]:
        result = Counter(tokenize(text))
        return {self._token_hash.hash(token): count
                for token, count in result.items()
                if token in feature_tokens}

    x_train = map(_featurize, itertools.islice(data, train_size))
    x_train = self._build_sparse_matrix(x_train)
    y_train = np.array(labels[:train_size])
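    # Featurize the last `test_size` shuffled texts the same way for the test split.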
    x_test = map(_featurize, itertools.islice(data, len(data) - test_size, len(data)))
    # TODO Might have to make sure it has the same number of columns as x_train.
    x_test = self._build_sparse_matrix(x_test)
    y_test = np.array(labels[-test_size:])
self._logger.info("Done loading data.")
return (x_train, y_train), (x_test, y_test)
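
# Illustrative usage (the surrounding data loader class and its construction are not part of
# this excerpt; `loader` is a hypothetical instance):
#     (x_train, y_train), (x_test, y_test) = loader.load_data(train_size=20_000, test_size=4_000)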