in source/sagemaker/data-preprocessing/graph_data_preprocessor.py [0:0]
def load_data(data_dir, transaction_data, identity_data, train_data_ratio, output_dir):
    """Load the transaction and identity CSVs and set aside the test transactions.

    The first ``int(n_rows * train_data_ratio)`` rows of the transaction file
    (in file order) are treated as training data; the ``TransactionID`` values
    of the remaining rows are written, one per line, to ``test.csv`` inside
    ``output_dir``.

    Args:
        data_dir: Directory that contains both input CSV files.
        transaction_data: File name of the transaction CSV. It must provide
            the ``TransactionID`` and ``isFraud`` columns.
        identity_data: File name of the identity CSV.
        train_data_ratio: Fraction of transaction rows assigned to training.
        output_dir: Directory in which ``test.csv`` is created (overwritten
            if it already exists).

    Returns:
        A tuple ``(transaction_df, identity_df, test_ids)`` holding the two
        loaded DataFrames and the array of held-out ``TransactionID`` values.
    """

    def fraud_percentage(labels):
        # Series.mean() ignores missing labels and yields NaN for an empty
        # slice. The previous sum()/len() form raised ZeroDivisionError when a
        # split was empty (e.g. train_data_ratio == 1.0) and reported NaN as
        # soon as a single label was missing.
        return 100 * labels.mean()

    transaction_df = pd.read_csv(os.path.join(data_dir, transaction_data))
    logging.info("Shape of transaction data is {}".format(transaction_df.shape))
    # count() returns the number of non-null labels, i.e. the tagged rows.
    logging.info("# Tagged transactions: {}".format(transaction_df.isFraud.count()))

    identity_df = pd.read_csv(os.path.join(data_dir, identity_data))
    logging.info("Shape of identity data is {}".format(identity_df.shape))

    # Everything after the first n_train rows is held out for test/validation.
    n_train = int(transaction_df.shape[0] * train_data_ratio)
    test_ids = transaction_df.TransactionID.values[n_train:]

    labels = transaction_df.isFraud
    logging.info("Percent fraud for train transactions: {}".format(fraud_percentage(labels[:n_train])))
    logging.info("Percent fraud for test transactions: {}".format(fraud_percentage(labels[n_train:])))
    logging.info("Percent fraud for all transactions: {}".format(fraud_percentage(labels)))

    test_path = os.path.join(output_dir, 'test.csv')
    with open(test_path, 'w') as f:
        f.writelines(str(test_id) + "\n" for test_id in test_ids)
    logging.info("Wrote test to file: {}".format(test_path))

    return transaction_df, identity_df, test_ids