def read_data()

in preprocess/input_data_etl.py [0:0]


import numpy as np
import pandas as pd


def read_data(inputcsv):
    '''
    Read and clean the input CSV for text pre-processing, then upload the
    resulting training arrays to S3.
    '''
    df = pd.read_csv(inputcsv)
    # Clean the free-text column before building the training arrays
    df_processed = cleanup(df, "comment_text")
    # Keep only the first 100,000 rows
    df_processed = df_processed[0:100000]
    train_data = np.array(df_processed['comment_text'])
    train_labels = df_processed[['toxic', 'severe_toxic', 'obscene', 'threat',
                                 'insult', 'identity_hate']].to_numpy()
    # Persist both arrays to S3 (src_bucket is module-level configuration)
    npy_to_s3(train_data, src_bucket, "toxic_comments", "train_data")
    npy_to_s3(train_labels, src_bucket, "toxic_comments", "train_labels")
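The helpers cleanup and npy_to_s3, and the src_bucket value, are defined elsewhere in the module and are not shown here. A minimal sketch of what they might look like, assuming a simple regex-based text clean-up and a boto3 upload of an in-memory .npy buffer (the function bodies and the bucket name are assumptions for illustration, not the project's actual implementation):

import io
import re

import boto3
import numpy as np

src_bucket = "example-etl-bucket"  # assumed bucket name for illustration


def cleanup(df, text_col):
    # Assumed clean-up: lowercase the text and replace non-alphanumeric
    # characters with spaces; the real implementation may differ
    df = df.copy()
    df[text_col] = (
        df[text_col]
        .astype(str)
        .str.lower()
        .apply(lambda s: re.sub(r"[^a-z0-9' ]+", " ", s))
    )
    return df


def npy_to_s3(array, bucket, prefix, name):
    # Serialize the array to an in-memory .npy buffer and upload it to
    # s3://<bucket>/<prefix>/<name>.npy
    buffer = io.BytesIO()
    np.save(buffer, array)
    buffer.seek(0)
    boto3.client("s3").upload_fileobj(buffer, bucket, f"{prefix}/{name}.npy")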