def load_data()

in source/sagemaker/data-preprocessing/graph_data_preprocessor.py [0:0]


def load_data(data_dir, transaction_data, identity_data, train_data_ratio, output_dir):
    """Load the transaction and identity CSVs and write out the test-set IDs.

    The transaction rows are split by position, in file order: the first
    ``int(n_rows * train_data_ratio)`` rows are treated as training data and
    the remaining rows as test data. The ``TransactionID`` of every test row
    is written, one per line, to ``<output_dir>/test.csv``.

    Args:
        data_dir: Directory that contains both input CSV files.
        transaction_data: File name of the transaction CSV inside ``data_dir``.
            It must provide the ``TransactionID`` and ``isFraud`` columns.
        identity_data: File name of the identity CSV inside ``data_dir``.
        train_data_ratio: Fraction of transaction rows assigned to training.
        output_dir: Existing directory in which ``test.csv`` is created
            (an existing file of that name is overwritten).

    Returns:
        A tuple ``(transaction_df, identity_df, test_ids)`` holding the two
        loaded DataFrames and the array of test ``TransactionID`` values.
    """

    def fraud_percent(labels):
        # An empty split (e.g. train_data_ratio of 0 or 1) has no defined
        # fraud rate; report NaN instead of dividing by zero. The explicit
        # len() check is required because Series truthiness is ambiguous.
        if len(labels) == 0:
            return float("nan")
        # skipna=False keeps NaN propagation: a split that contains untagged
        # rows is reported as NaN rather than silently ignoring those rows.
        return 100 * labels.sum(skipna=False) / len(labels)

    transaction_df = pd.read_csv(os.path.join(data_dir, transaction_data))
    logging.info("Shape of transaction data is {}".format(transaction_df.shape))
    n_tagged = len(transaction_df) - transaction_df.isFraud.isnull().sum()
    logging.info("# Tagged transactions: {}".format(n_tagged))

    identity_df = pd.read_csv(os.path.join(data_dir, identity_data))
    logging.info("Shape of identity data is {}".format(identity_df.shape))

    # Positional split: everything from row n_train onwards is held out
    # for test/validation.
    n_train = int(transaction_df.shape[0] * train_data_ratio)
    test_ids = transaction_df.TransactionID.values[n_train:]

    is_fraud = transaction_df.isFraud
    logging.info("Percent fraud for train transactions: {}".format(fraud_percent(is_fraud.iloc[:n_train])))
    logging.info("Percent fraud for test transactions: {}".format(fraud_percent(is_fraud.iloc[n_train:])))
    logging.info("Percent fraud for all transactions: {}".format(fraud_percent(is_fraud)))

    test_path = os.path.join(output_dir, 'test.csv')
    with open(test_path, 'w') as f:
        f.writelines(str(test_id) + "\n" for test_id in test_ids)
    logging.info("Wrote test to file: {}".format(test_path))

    return transaction_df, identity_df, test_ids