in courses/DSL/challenge-mlprep/fraud_detection/trainer/model.py [0:0]
import logging
from functools import partial

import tensorflow as tf
from tensorflow.keras import Input
from tensorflow.keras.layers import (Discretization, Hashing, IntegerLookup,
                                     StringLookup)

# INT_COLS, CAT_COLS, and FLOAT_COLS are the column-name lists defined
# elsewhere in this module.


def get_input_and_transform(ds, num_bins, hash_bkts):
    inputs = {}
    # Create an Input layer with the appropriate dtype for each column
    for col in INT_COLS:
        inputs[col] = Input(name=col, shape=(1,), dtype=tf.int32)
    for col in CAT_COLS:
        inputs[col] = Input(name=col, shape=(1,), dtype=tf.string)
    for col in FLOAT_COLS:
        inputs[col] = Input(name=col, shape=(1,), dtype=tf.float32)

    # Strip the labels so the preprocessing layers adapt on features only
    feature_ds = ds.map(lambda x, y: x)

    # Helper function to pull a single column out of a batch of features
    def _get_col(features, col):
        return features[col]

    transformed = {}

    # One-hot encode the integer-valued 'step' column
    layer = IntegerLookup(output_mode='one_hot')
    logging.warning("Adapting 'step' IntegerLookup layer.")
    layer.adapt(feature_ds.map(partial(_get_col, col='step')), steps=100)
    transformed['step'] = layer(inputs['step'])

    # One-hot encode the categorical 'action' column
    layer = StringLookup(output_mode='one_hot')
    logging.warning("Adapting 'action' StringLookup layer.")
    layer.adapt(feature_ds.map(partial(_get_col, col='action')), steps=100)
    transformed['action'] = layer(inputs['action'])

    # Bucketize float-valued columns into num_bins one-hot buckets
    for col in FLOAT_COLS:
        layer = Discretization(num_bins=num_bins, output_mode='one_hot')
        logging.warning(f"Adapting '{col}' Discretization layer.")
        layer.adapt(feature_ds.map(partial(_get_col, col=col)), steps=100)
        transformed[col] = layer(inputs[col])

    # Use hash buckets for the idOrig and idDest features to minimize sparsity
    for col in ['idOrig', 'idDest']:
        layer = Hashing(hash_bkts, output_mode='one_hot')
        transformed[col] = layer(inputs[col])

    return inputs, transformed
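

# --- Example usage --------------------------------------------------------
# A minimal sketch (not part of the original module) showing how the
# returned (inputs, transformed) dicts might be wired into a Keras model.
# The synthetic columns below assume INT_COLS = ['step'],
# CAT_COLS = ['action', 'idOrig', 'idDest'], and FLOAT_COLS = ['amount'];
# the real column lists are defined elsewhere in this module.
if __name__ == '__main__':
    features = {
        'step': tf.constant([[1], [2], [3], [4]], dtype=tf.int32),
        'action': tf.constant([['PAYMENT'], ['TRANSFER'],
                               ['CASH_OUT'], ['PAYMENT']]),
        'idOrig': tf.constant([['C100'], ['C200'], ['C300'], ['C400']]),
        'idDest': tf.constant([['M100'], ['C900'], ['C800'], ['M200']]),
        'amount': tf.constant([[9.2], [181.0], [181.0], [11.7]],
                              dtype=tf.float32),
    }
    labels = tf.constant([[0], [1], [1], [0]], dtype=tf.float32)
    train_ds = tf.data.Dataset.from_tensor_slices((features, labels)).batch(2)

    # .repeat() keeps adapt(steps=100) from exhausting the tiny dataset.
    inputs, transformed = get_input_and_transform(
        train_ds.repeat(), num_bins=10, hash_bkts=100)

    # Concatenate the one-hot blocks into one wide feature vector and
    # attach a small binary-classification head.
    x = tf.keras.layers.Concatenate()(list(transformed.values()))
    x = tf.keras.layers.Dense(64, activation='relu')(x)
    output = tf.keras.layers.Dense(1, activation='sigmoid')(x)

    model = tf.keras.Model(inputs=inputs, outputs=output)
    model.compile(optimizer='adam', loss='binary_crossentropy')
    model.fit(train_ds, epochs=1)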