fraud-detection-notebook/credit-card-fraud-detection-v1.ipynb (313 lines of code) (raw):
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import uuid\n",
"import itertools\n",
"import numpy as np\n",
"import pandas as pd\n",
"import tensorflow as tf\n",
"import json\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.utils import shuffle\n",
"from sklearn.metrics import confusion_matrix"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"get_ipython().system('gsutil cp gs://financial_fraud_detection/fraud_data_kaggle.csv .')\n",
"data = pd.read_csv('fraud_data_kaggle.csv')\n",
"data.head() "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fraud = data[data['isFraud'] == 1]\n",
"not_fraud = data[data['isFraud'] == 0]\n",
"# Take a random sample of non fraud rows\n",
"not_fraud_sample = not_fraud.sample(random_state=2, frac=.005)\n",
"\n",
"# Put it back together and shuffle\n",
"df = pd.concat([not_fraud_sample,fraud])\n",
"df = shuffle(df, random_state=2)\n",
"\n",
"# Remove a few columns (isFraud is the label column we'll use, not isFlaggedFraud)\n",
"df = df.drop(columns=['nameOrig', 'nameDest', 'isFlaggedFraud'])\n",
"# Adding transaction id to identify back after prediction\n",
"df['transactionId'] = [str(uuid.uuid4()) for _ in range(len(df.index))]\n",
"# Preview the updated dataset\n",
"df.head()\n",
"train_test_split = int(len(df) * .8)\n",
"\n",
"train_set = df[:train_test_split]\n",
"test_set = df[train_test_split:]\n",
"\n",
"train_labels = train_set.pop('isFraud')\n",
"test_labels = test_set.pop('isFraud')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fc = tf.feature_column\n",
"CATEGORICAL_COLUMNS = ['type']\n",
"NUMERIC_COLUMNS = ['step', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']\n",
"KEY_COLUMN = 'transactionId'\n",
"def one_hot_cat_column(feature_name, vocab):\n",
" return tf.feature_column.indicator_column(tf.feature_column.categorical_column_with_vocabulary_list(feature_name, vocab))\n",
"\n",
"feature_columns = []\n",
"\n",
"for feature_name in CATEGORICAL_COLUMNS:\n",
" vocabulary = train_set[feature_name].unique()\n",
" feature_columns.append(one_hot_cat_column(feature_name, vocabulary))\n",
"\n",
"for feature_name in NUMERIC_COLUMNS:\n",
" feature_columns.append(tf.feature_column.numeric_column(feature_name,\n",
" dtype=tf.float32))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"NUM_EXAMPLES = len(train_labels)\n",
"def make_input_fn(X, y, n_epochs=None, shuffle=True):\n",
" def input_fn():\n",
" dataset = tf.data.Dataset.from_tensor_slices((dict(X), y))\n",
" if shuffle:\n",
" dataset = dataset.shuffle(NUM_EXAMPLES)\n",
" dataset = dataset.repeat(n_epochs)\n",
" dataset = dataset.batch(NUM_EXAMPLES)\n",
" return dataset\n",
" return input_fn\n",
"\n",
"# Define training and evaluation input functions\n",
"train_input_fn = make_input_fn(train_set, train_labels)\n",
"eval_input_fn = make_input_fn(test_set, test_labels, shuffle=False, n_epochs=1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"n_batches = 1\n",
"model = tf.estimator.BoostedTreesClassifier(feature_columns,\n",
" n_batches_per_layer=n_batches)\n",
"model = tf.contrib.estimator.forward_features(model,KEY_COLUMN)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model.train(train_input_fn, max_steps=100)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"result = model.evaluate(eval_input_fn)\n",
"print(pd.Series(result))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pred_dicts = list(model.predict(eval_input_fn))\n",
"probabilities = pd.Series([pred['logistic'][0] for pred in pred_dicts])\n",
"\n",
"for i,val in enumerate(probabilities[:30]):\n",
" print('Predicted: ', round(val), 'Actual: ', test_labels.iloc[i])\n",
" print()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"GCP_PROJECT = '' # @param {type:\"string\"}\n",
"MODEL_BUCKET = '' # @param {type:\"string\"}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def json_serving_input_fn():\n",
" feature_placeholders = {\n",
" 'type': tf.placeholder(tf.string, [None]),\n",
" 'step': tf.placeholder(tf.float32, [None]),\n",
" 'amount': tf.placeholder(tf.float32, [None]),\n",
" 'oldbalanceOrg': tf.placeholder(tf.float32, [None]),\n",
" 'newbalanceOrig': tf.placeholder(tf.float32, [None]),\n",
" 'oldbalanceDest': tf.placeholder(tf.float32, [None]),\n",
" 'newbalanceDest': tf.placeholder(tf.float32, [None]),\n",
" KEY_COLUMN: tf.placeholder_with_default(tf.constant(['nokey']), [None])\n",
" }\n",
" features = {key: tf.expand_dims(tensor, -1)\n",
" for key, tensor in feature_placeholders.items()}\n",
" return tf.estimator.export.ServingInputReceiver(features,feature_placeholders)\n",
"export_path = model.export_saved_model(\n",
" MODEL_BUCKET + '/explanations-with-key',\n",
" serving_input_receiver_fn=json_serving_input_fn\n",
").decode('utf-8')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!saved_model_cli show --dir $export_path --all"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"MODEL = 'fraud_detection_with_key'\n",
"REGION = 'us-central1'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!gcloud services enable ml.googleapis.com\n",
"!gcloud ai-platform models create $MODEL --region $REGION --quiet"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"VERSION = 'v1'\n",
"!gcloud beta ai-platform versions create $VERSION \\\n",
"--model $MODEL \\\n",
"--origin $export_path \\\n",
"--runtime-version 1.15 \\\n",
"--framework TENSORFLOW \\\n",
"--python-version 3.7 \\\n",
"--machine-type n1-standard-4 \\\n",
"--num-paths 10"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fraud_indices = []\n",
"\n",
"for i,val in enumerate(test_labels):\n",
" if val == 1:\n",
" fraud_indices.append(i)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"num_test_examples = 10\n",
"import numpy as np \n",
"\n",
"def convert(o):\n",
" if isinstance(o, np.generic): return o.item() \n",
" raise TypeError\n",
"\n",
"for i in range(num_test_examples):\n",
" test_json = {}\n",
" ex = test_set.iloc[fraud_indices[i]]\n",
" keys = ex.keys().tolist()\n",
" vals = ex.values.tolist()\n",
" for idx in range(len(keys)):\n",
" test_json[keys[idx]] = vals[idx]\n",
"\n",
" print(test_json)\n",
" with open('data.txt', 'a') as outfile:\n",
" json.dump(test_json, outfile, default=convert)\n",
" outfile.write('\\n')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!gcloud ai-platform predict --model $MODEL \\\n",
"--version $VERSION \\\n",
"--json-instances='data.txt' \\\n",
"--signature-name='predict'"
]
}
],
"metadata": {
"environment": {
"name": "tf-gpu.1-15.m46",
"type": "gcloud",
"uri": "gcr.io/deeplearning-platform-release/tf-gpu.1-15:m46"
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}