fraud-detection-notebook/credit-card-fraud-detection-v1.ipynb (313 lines of code) (raw):

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Credit-card fraud detection with TensorFlow Boosted Trees\n",
    "\n",
    "Trains a `BoostedTreesClassifier` on the Kaggle synthetic financial fraud\n",
    "dataset, exports the model with a forwarded transaction-id key, deploys it\n",
    "to Cloud AI Platform (runtime 1.15), and sends test predictions."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import uuid\n",
    "import itertools\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import tensorflow as tf\n",
    "import json\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.utils import shuffle\n",
    "from sklearn.metrics import confusion_matrix"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Download and load the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fetch the synthetic fraud dataset from the public GCS bucket.\n",
    "!gsutil cp gs://financial_fraud_detection/fraud_data_kaggle.csv .\n",
    "data = pd.read_csv('fraud_data_kaggle.csv')\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Balance the classes, clean, and split\n",
    "\n",
    "The raw data is heavily imbalanced, so the non-fraud majority class is\n",
    "downsampled to roughly match the number of fraud rows."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fraud = data[data['isFraud'] == 1]\n",
    "not_fraud = data[data['isFraud'] == 0]\n",
    "\n",
    "# Take a small random sample of non-fraud rows to balance the labels.\n",
    "not_fraud_sample = not_fraud.sample(random_state=2, frac=.005)\n",
    "\n",
    "# Put it back together and shuffle so splits are not ordered by class.\n",
    "df = pd.concat([not_fraud_sample, fraud])\n",
    "df = shuffle(df, random_state=2)\n",
    "\n",
    "# Remove ID-like columns (isFraud is the label column we'll use, not isFlaggedFraud).\n",
    "df = df.drop(columns=['nameOrig', 'nameDest', 'isFlaggedFraud'])\n",
    "\n",
    "# Add a transaction id so predictions can be matched back to input rows.\n",
    "df['transactionId'] = [str(uuid.uuid4()) for _ in range(len(df.index))]\n",
    "\n",
    "# 80/20 train/test split. Use .copy() so pop() below mutates a real frame\n",
    "# rather than a slice view (avoids SettingWithCopyWarning).\n",
    "train_test_split = int(len(df) * .8)\n",
    "train_set = df[:train_test_split].copy()\n",
    "test_set = df[train_test_split:].copy()\n",
    "\n",
    "train_labels = train_set.pop('isFraud')\n",
    "test_labels = test_set.pop('isFraud')\n",
    "\n",
    "# Preview the prepared dataset (last expression so it actually renders).\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fc = tf.feature_column\n",
    "CATEGORICAL_COLUMNS = ['type']\n",
    "NUMERIC_COLUMNS = ['step', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']\n",
    "KEY_COLUMN = 'transactionId'\n",
    "\n",
    "def one_hot_cat_column(feature_name, vocab):\n",
    "    \"\"\"One-hot encode a categorical feature via an indicator column.\"\"\"\n",
    "    return fc.indicator_column(\n",
    "        fc.categorical_column_with_vocabulary_list(feature_name, vocab))\n",
    "\n",
    "feature_columns = []\n",
    "\n",
    "for feature_name in CATEGORICAL_COLUMNS:\n",
    "    # Vocabulary is taken from the training set only.\n",
    "    vocabulary = train_set[feature_name].unique()\n",
    "    feature_columns.append(one_hot_cat_column(feature_name, vocabulary))\n",
    "\n",
    "for feature_name in NUMERIC_COLUMNS:\n",
    "    feature_columns.append(fc.numeric_column(feature_name, dtype=tf.float32))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "NUM_EXAMPLES = len(train_labels)\n",
    "\n",
    "def make_input_fn(X, y, n_epochs=None, shuffle=True):\n",
    "    \"\"\"Return an input_fn feeding (features, label) to the estimator.\n",
    "\n",
    "    The whole dataset is used as a single batch, which matches\n",
    "    n_batches_per_layer=1 for BoostedTreesClassifier.\n",
    "    \"\"\"\n",
    "    def input_fn():\n",
    "        dataset = tf.data.Dataset.from_tensor_slices((dict(X), y))\n",
    "        if shuffle:\n",
    "            dataset = dataset.shuffle(NUM_EXAMPLES)\n",
    "        # n_epochs=None repeats indefinitely; training is bounded by max_steps.\n",
    "        dataset = dataset.repeat(n_epochs)\n",
    "        dataset = dataset.batch(NUM_EXAMPLES)\n",
    "        return dataset\n",
    "    return input_fn\n",
    "\n",
    "# Define training and evaluation input functions\n",
    "train_input_fn = make_input_fn(train_set, train_labels)\n",
    "eval_input_fn = make_input_fn(test_set, test_labels, shuffle=False, n_epochs=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train and evaluate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "n_batches = 1\n",
    "model = tf.estimator.BoostedTreesClassifier(feature_columns,\n",
    "                                            n_batches_per_layer=n_batches)\n",
    "# Forward the transaction id through to the prediction output so each\n",
    "# prediction can be matched back to its input row (TF 1.x contrib API).\n",
    "model = tf.contrib.estimator.forward_features(model, KEY_COLUMN)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.train(train_input_fn, max_steps=100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "result = model.evaluate(eval_input_fn)\n",
    "pd.Series(result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pred_dicts = list(model.predict(eval_input_fn))\n",
    "probabilities = pd.Series([pred['logistic'][0] for pred in pred_dicts])\n",
    "\n",
    "# Spot-check the first 30 predictions against the true labels.\n",
    "for i, val in enumerate(probabilities[:30]):\n",
    "    print('Predicted: ', round(val), 'Actual: ', test_labels.iloc[i])\n",
    "    print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Export the model and deploy to AI Platform"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "GCP_PROJECT = ''  # @param {type:\"string\"}\n",
    "MODEL_BUCKET = ''  # @param {type:\"string\"}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def json_serving_input_fn():\n",
    "    \"\"\"Serving input_fn: map scalar JSON fields to rank-2 feature tensors.\"\"\"\n",
    "    feature_placeholders = {\n",
    "        'type': tf.placeholder(tf.string, [None]),\n",
    "        'step': tf.placeholder(tf.float32, [None]),\n",
    "        'amount': tf.placeholder(tf.float32, [None]),\n",
    "        'oldbalanceOrg': tf.placeholder(tf.float32, [None]),\n",
    "        'newbalanceOrig': tf.placeholder(tf.float32, [None]),\n",
    "        'oldbalanceDest': tf.placeholder(tf.float32, [None]),\n",
    "        'newbalanceDest': tf.placeholder(tf.float32, [None]),\n",
    "        # The key defaults to 'nokey' when a request omits the transaction id.\n",
    "        KEY_COLUMN: tf.placeholder_with_default(tf.constant(['nokey']), [None])\n",
    "    }\n",
    "    features = {key: tf.expand_dims(tensor, -1)\n",
    "                for key, tensor in feature_placeholders.items()}\n",
    "    return tf.estimator.export.ServingInputReceiver(features, feature_placeholders)\n",
    "\n",
    "export_path = model.export_saved_model(\n",
    "    MODEL_BUCKET + '/explanations-with-key',\n",
    "    serving_input_receiver_fn=json_serving_input_fn\n",
    ").decode('utf-8')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!saved_model_cli show --dir $export_path --all"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "MODEL = 'fraud_detection_with_key'\n",
    "REGION = 'us-central1'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!gcloud services enable ml.googleapis.com\n",
    "!gcloud ai-platform models create $MODEL --region $REGION --quiet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "VERSION = 'v1'\n",
    "!gcloud beta ai-platform versions create $VERSION \\\n",
    "--model $MODEL \\\n",
    "--origin $export_path \\\n",
    "--runtime-version 1.15 \\\n",
    "--framework TENSORFLOW \\\n",
    "--python-version 3.7 \\\n",
    "--machine-type n1-standard-4 \\\n",
    "--num-paths 10"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Send test predictions to the deployed model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Positional indices of the fraudulent examples in the test set.\n",
    "fraud_indices = [i for i, val in enumerate(test_labels) if val == 1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "num_test_examples = 10\n",
    "\n",
    "def convert(o):\n",
    "    \"\"\"json.dump fallback serializer for numpy scalar types.\"\"\"\n",
    "    if isinstance(o, np.generic):\n",
    "        return o.item()\n",
    "    raise TypeError\n",
    "\n",
    "# Write one JSON instance per line. Open in 'w' (not 'a') so re-running\n",
    "# this cell does not append duplicate instances to the file.\n",
    "with open('data.txt', 'w') as outfile:\n",
    "    for i in range(num_test_examples):\n",
    "        ex = test_set.iloc[fraud_indices[i]]\n",
    "        test_json = dict(zip(ex.keys().tolist(), ex.values.tolist()))\n",
    "        print(test_json)\n",
    "        json.dump(test_json, outfile, default=convert)\n",
    "        outfile.write('\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!gcloud ai-platform predict --model $MODEL \\\n",
    "--version $VERSION \\\n",
    "--json-instances='data.txt' \\\n",
    "--signature-name='predict'"
   ]
  }
 ],
 "metadata": {
  "environment": {
   "name": "tf-gpu.1-15.m46",
   "type": "gcloud",
   "uri": "gcr.io/deeplearning-platform-release/tf-gpu.1-15:m46"
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}