in gce/burst-training/census-analysis.py [0:0]
def main(mode, census_data_path, model_output_path, cv_iterations=1):
    """Train or evaluate a gradient boosting classifier on the census data.

    The test set is read from ``adult.test`` under ``census_data_path`` in
    both modes; the training set (``adult.data``) is read only in ``'train'``
    mode. Labels are booleans marking rows whose ``income-level`` column
    matches the ``>50K`` class.

    In ``'train'`` mode the categorical columns are label-encoded, a
    ``RandomizedSearchCV`` over ``GradientBoostingClassifier``
    hyperparameters is fitted, and a dict with the keys ``'preprocessor'``
    (the per-column encoders) and ``'classifier'`` (the fitted search) is
    dumped with joblib to ``model_output_path``. In ``'evaluate'`` mode that
    dict is loaded back from ``model_output_path`` instead.

    Args:
        mode: Either ``'train'`` or ``'evaluate'``.
        census_data_path: Directory (joined with ``os.path.join``) holding
            ``adult.data`` and ``adult.test``.
        model_output_path: Path the model is written to (train) or read
            from (evaluate); opened through ``tf.gfile.Open``.
        cv_iterations: Passed as ``n_iter`` to ``RandomizedSearchCV``.

    Returns:
        A tuple ``(final_score, model_output_path, best_params)`` where
        ``final_score`` is ``classifier.score`` on the test set and
        ``best_params`` is the search's ``best_params_``.

    Raises:
        ValueError: If ``mode`` is neither ``'train'`` nor ``'evaluate'``.
    """
    # Fail fast: reject a bad mode before any file is opened or parsed.
    if mode not in ('train', 'evaluate'):
        raise ValueError('Invalid mode: {}'.format(mode))

    TRAIN_DATA = os.path.join(census_data_path, 'adult.data')
    TEST_DATA = os.path.join(census_data_path, 'adult.test')

    COLUMNS = (
        'age',
        'workclass',
        'fnlwgt',
        'education',
        'education-num',
        'marital-status',
        'occupation',
        'relationship',
        'race',
        'sex',
        'capital-gain',
        'capital-loss',
        'hours-per-week',
        'native-country',
        'income-level',
    )

    CATEGORICAL_COLUMNS = (
        'workclass',
        'education',
        'marital-status',
        'occupation',
        'relationship',
        'race',
        'sex',
        'native-country',
    )

    # The test set is needed in both modes. It is loaded before any training
    # so that a missing or unreadable test file is reported up front.
    # skiprows=1 drops the first line of adult.test (presumably a non-data
    # header line -- confirm against the dataset).
    with tf.gfile.Open(TEST_DATA, 'r') as test_data:
        test_raw_df = pd.read_csv(test_data, names=COLUMNS, skiprows=1)

    test_features_df = test_raw_df.drop('income-level', axis=1)
    # Test labels are matched with a trailing period (' >50K.'), unlike the
    # training labels below (' >50K'); both keep the leading space.
    test_labels_df = (test_raw_df['income-level'] == ' >50K.')

    if mode == 'train':
        # Only training needs adult.data, so evaluation does not read it.
        with tf.gfile.Open(TRAIN_DATA, 'r') as train_data:
            train_raw_df = pd.read_csv(train_data, header=None, names=COLUMNS)

        train_features_df = train_raw_df.drop('income-level', axis=1)
        train_labels_df = (train_raw_df['income-level'] == ' >50K')

        # One LabelEncoder per categorical column, fitted on the training
        # data and saved with the model so evaluation reuses the mapping.
        encoders = {col: sklearn.preprocessing.LabelEncoder()
                    for col in CATEGORICAL_COLUMNS}

        for col in CATEGORICAL_COLUMNS:
            train_features_df[col] = encoders[col].fit_transform(
                train_features_df[col])

        base_classifier = GradientBoostingClassifier()

        classifier = RandomizedSearchCV(
            base_classifier,
            param_distributions={
                'learning_rate': list(np.arange(0.01, 0.2, 0.01)),
                'max_depth': [3, 4, 5, 6, 7],
                'min_samples_split': [2, 3, 4, 5, 6],
                'min_samples_leaf': [1, 2, 3],
                'n_estimators': range(80, 201, 10),
            },
            n_iter=cv_iterations,
            n_jobs=-1,
            verbose=10,
        )

        classifier.fit(train_features_df, train_labels_df)

        model_export = {
            'preprocessor': encoders,
            'classifier': classifier,
        }

        with tf.gfile.Open(model_output_path, 'wb') as model_file:
            joblib.dump(model_export, model_file, protocol=1)
    else:
        # mode == 'evaluate' (validated at the top of the function).
        # SECURITY NOTE: joblib.load unpickles arbitrary objects; only point
        # model_output_path at model files from a trusted source.
        with tf.gfile.Open(model_output_path, 'rb') as model_file:
            saved_model = joblib.load(model_file)

        encoders = saved_model['preprocessor']
        classifier = saved_model['classifier']

    # Apply the training-time encodings to the test features before scoring.
    for col in CATEGORICAL_COLUMNS:
        test_features_df[col] = encoders[col].transform(test_features_df[col])

    final_score = classifier.score(test_features_df, test_labels_df)

    return (final_score, model_output_path, classifier.best_params_)