in 10_mlops/model.py [0:0]
def create_model():
    """Assemble the feature columns and Keras inputs, then build the classifier.

    Numeric columns and embeddings of every categorical column are handed to
    ``wide_and_deep_classifier`` as ``dnn_feature_columns``; indicator
    versions of the same categorical columns are handed over as
    ``linear_feature_columns``.

    Returns:
        Whatever ``wide_and_deep_classifier`` returns (presumably a Keras
        model -- confirm against that helper).
    """
    # Plain numeric features, keyed by column name.
    numeric_names = (
        'dep_delay,taxi_out,distance,dep_hour,is_weekday,'
        'dep_airport_lat,dep_airport_lon,'
        'arr_airport_lat,arr_airport_lon'
    ).split(',')
    deep_columns = {}
    for name in numeric_names:
        deep_columns[name] = tf.feature_column.numeric_column(name)

    # Raw categorical features: a fixed vocabulary for carriers, hashing for
    # the airport codes.
    carrier_vocabulary = 'AS,VX,F9,UA,US,WN,HA,EV,MQ,DL,OO,B6,NK,AA'.split(',')
    categorical = {
        'carrier': tf.feature_column.categorical_column_with_vocabulary_list(
            'carrier', vocabulary_list=carrier_vocabulary),
        'origin': tf.feature_column.categorical_column_with_hash_bucket(
            'origin', hash_bucket_size=1000),
        'dest': tf.feature_column.categorical_column_with_hash_bucket(
            'dest', hash_bucket_size=1000),
    }

    # One scalar Keras input per raw feature. This has to happen before the
    # crossed columns are added below, because crosses are derived features
    # and must not get an input layer of their own.
    input_layers = {}
    for name in deep_columns:
        input_layers[name] = tf.keras.layers.Input(name=name, shape=(), dtype='float32')
    for name in categorical:
        input_layers[name] = tf.keras.layers.Input(name=name, shape=(), dtype='string')

    # Bucket boundaries for latitude / longitude (USA).
    lat_boundaries = np.linspace(20.0, 50.0, NUM_BUCKETS).tolist()
    lon_boundaries = np.linspace(-120.0, -70.0, NUM_BUCKETS).tolist()

    # Discretize the airport coordinates: latitudes first, then longitudes.
    bucketized = {}
    for key in ['dep_airport_lat', 'arr_airport_lat']:
        bucketized['d_{}'.format(key)] = tf.feature_column.bucketized_column(
            deep_columns[key], lat_boundaries)
    for key in ['dep_airport_lon', 'arr_airport_lon']:
        bucketized['d_{}'.format(key)] = tf.feature_column.bucketized_column(
            deep_columns[key], lon_boundaries)

    # Cross lat x lon into a location cell per airport, then cross the two
    # locations into a departure/arrival pair feature.
    departure_cell = tf.feature_column.crossed_column(
        [bucketized['d_dep_airport_lat'], bucketized['d_dep_airport_lon']],
        NUM_BUCKETS * NUM_BUCKETS)
    arrival_cell = tf.feature_column.crossed_column(
        [bucketized['d_arr_airport_lat'], bucketized['d_arr_airport_lon']],
        NUM_BUCKETS * NUM_BUCKETS)
    categorical['dep_loc'] = departure_cell
    categorical['arr_loc'] = arrival_cell
    categorical['dep_arr'] = tf.feature_column.crossed_column(
        [departure_cell, arrival_cell], NUM_BUCKETS ** 4)

    # Deep side: add an embedding of every categorical column (raw and
    # crossed) next to the numeric columns.
    for name, column in categorical.items():
        deep_columns['embed_{}'.format(name)] = tf.feature_column.embedding_column(
            column, NUM_EMBEDS)

    # Wide side: indicator (one-hot style) encoding of the same columns.
    wide_columns = {}
    for name, column in categorical.items():
        wide_columns[name] = tf.feature_column.indicator_column(column)

    return wide_and_deep_classifier(
        input_layers,
        linear_feature_columns=wide_columns.values(),
        dnn_feature_columns=deep_columns.values(),
        dnn_hidden_units=DNN_HIDDEN_UNITS)