in simulation/decai/simulation/data/news_data_loader.py [0:0]
def _load_kaggle_data(self, data_folder_path: str) -> Collection[News]:
    """
    Load the labeled training data from https://www.kaggle.com/c/fake-news/data.

    The file is expected at `<data_folder_path>/fake-news/train.csv`.
    Rows with empty text are skipped. A row whose `label` is 0 becomes `Label.RELIABLE`;
    any other value becomes `Label.UNRELIABLE`.

    :param data_folder_path: The folder that contains the "fake-news" folder.
    :return: The loaded news items in a deterministic (non-random) shuffled order.
    :raises Exception: If the training CSV file does not exist.
    """
    # Don't use the test data because it has no labels.
    fake_news_data_path = os.path.join(data_folder_path, 'fake-news', 'train.csv')
    if not os.path.exists(fake_news_data_path):
        raise Exception(f"Could not find the Fake News dataset at \"{fake_news_data_path}\"."
                        "\nYou must obtain it from https://www.kaggle.com/c/fake-news/data.")
    # Default NaN parsing is disabled and no NaN markers are given for `text`,
    # presumably so that empty text cells stay strings and `len()` works below.
    data = pd.read_csv(fake_news_data_path, na_values=dict(text=[]), keep_default_na=False)
    result = [
        News(row.text, Label.RELIABLE if row.label == 0 else Label.UNRELIABLE)
        for row in data.itertuples()
        if len(row.text) > 0
    ]
    # Consistent shuffle to aim for a mostly even distribution of labels.
    # This used to be `random.shuffle(result, lambda: 0.618)`, but the `random` argument of
    # `random.shuffle` was removed in Python 3.11. The loop below is the same swap loop
    # that call performed with a constant "random" value, so the order is unchanged.
    for i in reversed(range(1, len(result))):
        j = int(0.618 * (i + 1))
        result[i], result[j] = result[j], result[i]
    return result