# get_input_and_transform()
#
# From: courses/DSL/challenge-mlprep/fraud_detection/trainer/model.py

def get_input_and_transform(ds, num_bins, hash_bkts):
    """Build Keras Input layers and adapted preprocessing layers for a dataset.

    Args:
        ds: A tf.data.Dataset yielding (features, label) tuples, where
            features is a dict keyed by column name.
        num_bins: Number of buckets for Discretization of float columns.
        hash_bkts: Number of hash buckets for the id columns.

    Returns:
        A tuple (inputs, transformed) of dicts keyed by column name:
        inputs maps to keras Input layers, transformed maps to the
        corresponding preprocessed (one-hot encoded) tensors.
    """
    inputs = {}

    # Create Input layers with appropriate datatype for each column
    for col in INT_COLS:
        inputs[col] = Input(name=col, shape=(1,), dtype=tf.int32)
    for col in CAT_COLS:
        inputs[col] = Input(name=col, shape=(1,), dtype=tf.string)
    for col in FLOAT_COLS:
        inputs[col] = Input(name=col, shape=(1,), dtype=tf.float32)

    # Drop labels; adapt() only needs the feature dict.
    feature_ds = ds.map(lambda x, y: x)

    # Helper function to extract a single column from a feature dict
    def _get_col(ds, col):
        return ds[col]

    transformed = {}

    # One-hot encode integer valued columns
    layer = IntegerLookup(output_mode='one_hot')
    logging.warning("Adapting 'step' IntegerLookup layer.")
    layer.adapt(feature_ds.map(partial(_get_col, col='step')), steps=100)
    transformed['step'] = layer(inputs['step'])

    # One-hot encode categorical columns
    layer = StringLookup(output_mode='one_hot')
    logging.warning("Adapting 'action' StringLookup layer.")
    layer.adapt(feature_ds.map(partial(_get_col, col='action')), steps=100)
    transformed['action'] = layer(inputs['action'])

    # Bucketize float-valued columns into num_bins buckets
    for col in FLOAT_COLS:
        layer = Discretization(num_bins=num_bins, output_mode='one_hot')
        # Lazy %-formatting so the string is only built if the record is emitted.
        logging.warning("Adapting %s Discretization layer.", col)
        layer.adapt(feature_ds.map(partial(_get_col, col=col)), steps=100)
        transformed[col] = layer(inputs[col])

    # Use hash buckets for idOrig and idDest features to minimize sparsity.
    # Hashing needs no adapt() pass — it is stateless.
    for col in ['idOrig', 'idDest']:
        layer = Hashing(hash_bkts, output_mode='one_hot')
        transformed[col] = layer(inputs[col])

    return inputs, transformed