in preprocess/input_data_etl.py
import numpy as np
import pandas as pd

# `cleanup`, `npy_to_s3`, and `src_bucket` are assumed to be defined elsewhere in
# this module (text-cleaning helper, S3 upload helper, and the target bucket name).

def read_data(inputcsv):
    '''
    Read input data for text preprocessing.
    '''
    df = pd.read_csv(inputcsv)
    # Clean the raw comment text column and keep the first 100,000 rows.
    df_processed = cleanup(df, "comment_text")
    df_processed = df_processed[0:100000]
    # Split into a feature array (comments) and a multi-label target matrix.
    train_data = np.array(df_processed['comment_text'])
    train_labels = df_processed[['toxic', 'severe_toxic', 'obscene', 'threat',
                                 'insult', 'identity_hate']].to_numpy()
    # Persist both arrays to S3 as .npy objects for downstream training.
    npy_to_s3(train_data, src_bucket, "toxic_comments", "train_data")
    npy_to_s3(train_labels, src_bucket, "toxic_comments", "train_labels")
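
A minimal usage sketch, assuming the function is called directly with a path to the raw comments CSV (the filename below is only illustrative, not part of the original module):

    read_data("train.csv")  # cleans the comments, truncates to 100k rows, uploads train_data / train_labels to S3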