tutorials/tensorflow/mlflow_gcp/trainer/utils.py (127 lines of code) (raw):
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utilities to download and preprocess the Census data."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from builtins import list
from builtins import zip
from builtins import map
from builtins import str
from six.moves import urllib
import tempfile
import os
import string
import random
import numpy as np
import pandas as pd
import tensorflow as tf
# Storage directory
DATA_DIR = os.path.join(tempfile.gettempdir(), 'census_data')
# Download options.
DATA_URL = ('https://storage.googleapis.com/cloud-samples-data/ml-engine/census'
'/data')
TRAINING_FILE = 'adult.data.csv'
EVAL_FILE = 'adult.test.csv'
TRAINING_URL = '%s/%s' % (DATA_URL, TRAINING_FILE)
EVAL_URL = '%s/%s' % (DATA_URL, EVAL_FILE)
# These are the features in the dataset.
# Dataset information: https://archive.ics.uci.edu/ml/datasets/census+income
_CSV_COLUMNS = [
'age', 'workclass', 'fnlwgt', 'education', 'education_num',
'marital_status', 'occupation', 'relationship', 'race', 'gender',
'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
'income_bracket'
]
# This is the label (target) we want to predict.
_LABEL_COLUMN = 'income_bracket'
# These are columns we will not use as features for training. There are many
# reasons not to use certain attributes of data for training. Perhaps their
# values are noisy or inconsistent, or perhaps they encode bias that we do not
# want our model to learn. For a deep dive into the features of this Census
# dataset and the challenges they pose, see the Introduction to ML Fairness
# Notebook: https://colab.research.google.com/github/google/eng-edu/blob
# /master/ml/cc/exercises/intro_to_fairness.ipynb
UNUSED_COLUMNS = ['fnlwgt', 'education', 'gender']
_CATEGORICAL_TYPES = {
'workclass': pd.api.types.CategoricalDtype(categories=[
'Federal-gov', 'Local-gov', 'Never-worked', 'Private', 'Self-emp-inc',
'Self-emp-not-inc', 'State-gov', 'Without-pay'
]),
'marital_status': pd.api.types.CategoricalDtype(categories=[
'Divorced', 'Married-AF-spouse', 'Married-civ-spouse',
'Married-spouse-absent', 'Never-married', 'Separated', 'Widowed'
]),
'occupation': pd.api.types.CategoricalDtype([
'Adm-clerical', 'Armed-Forces', 'Craft-repair', 'Exec-managerial',
'Farming-fishing', 'Handlers-cleaners', 'Machine-op-inspct',
'Other-service', 'Priv-house-serv', 'Prof-specialty', 'Protective-serv',
'Sales', 'Tech-support', 'Transport-moving'
]),
'relationship': pd.api.types.CategoricalDtype(categories=[
'Husband', 'Not-in-family', 'Other-relative', 'Own-child', 'Unmarried',
'Wife'
]),
'race': pd.api.types.CategoricalDtype(categories=[
'Amer-Indian-Eskimo', 'Asian-Pac-Islander', 'Black', 'Other', 'White'
]),
'native_country': pd.api.types.CategoricalDtype(categories=[
'Cambodia', 'Canada', 'China', 'Columbia', 'Cuba', 'Dominican-Republic',
'Ecuador', 'El-Salvador', 'England', 'France', 'Germany', 'Greece',
'Guatemala', 'Haiti', 'Holand-Netherlands', 'Honduras', 'Hong',
'Hungary',
'India', 'Iran', 'Ireland', 'Italy', 'Jamaica', 'Japan', 'Laos',
'Mexico',
'Nicaragua', 'Outlying-US(Guam-USVI-etc)', 'Peru', 'Philippines',
'Poland',
'Portugal', 'Puerto-Rico', 'Scotland', 'South', 'Taiwan', 'Thailand',
'Trinadad&Tobago', 'United-States', 'Vietnam', 'Yugoslavia'
]),
'income_bracket': pd.api.types.CategoricalDtype(categories=[
'<=50K', '>50K'
])
}
def _download_and_clean_file(filename, url):
"""Downloads data from url, and makes changes to match the CSV format.
The CSVs may use spaces after the comma delimters (non-standard) or include
rows which do not represent well-formed examples. This function strips out
some of these problems.
Args:
filename: filename to save url to
url: URL of resource to download
"""
temp_file, _ = urllib.request.urlretrieve(url)
with tf.io.gfile.GFile(temp_file, 'r') as temp_file_object:
with tf.io.gfile.GFile(filename, 'w') as file_object:
for line in temp_file_object:
line = line.strip()
line = line.replace(', ', ',')
if not line or ',' not in line:
continue
if line[-1] == '.':
line = line[:-1]
line += '\n'
file_object.write(line)
tf.io.gfile.remove(temp_file)
def download(data_dir):
"""Downloads census data if it is not already present.
Args:
data_dir: directory where we will access/save the census data
"""
tf.io.gfile.makedirs(data_dir)
training_file_path = os.path.join(data_dir, TRAINING_FILE)
if not tf.io.gfile.exists(training_file_path):
_download_and_clean_file(training_file_path, TRAINING_URL)
eval_file_path = os.path.join(data_dir, EVAL_FILE)
if not tf.io.gfile.exists(eval_file_path):
_download_and_clean_file(eval_file_path, EVAL_URL)
return training_file_path, eval_file_path
def preprocess(dataframe):
"""Converts categorical features to numeric. Removes unused columns.
Args:
dataframe: Pandas dataframe with raw data
Returns:
Dataframe with preprocessed data
"""
dataframe = dataframe.drop(columns=UNUSED_COLUMNS)
# Convert integer valued (numeric) columns to floating point
numeric_columns = dataframe.select_dtypes(['int64']).columns
dataframe[numeric_columns] = dataframe[numeric_columns].astype('float32')
# Convert categorical columns to numeric
cat_columns = dataframe.select_dtypes(['object']).columns
dataframe[cat_columns] = dataframe[cat_columns].apply(lambda x: x.astype(
_CATEGORICAL_TYPES[x.name]))
dataframe[cat_columns] = dataframe[cat_columns].apply(lambda x: x.cat.codes)
return dataframe
def standardize(dataframe):
"""Scales numerical columns using their means and standard deviation to get
z-scores: the mean of each numerical column becomes 0, and the standard
deviation becomes 1. This can help the model converge during training.
Args:
dataframe: Pandas dataframe
Returns:
Input dataframe with the numerical columns scaled to z-scores
"""
dtypes = list(zip(dataframe.dtypes.index, map(str, dataframe.dtypes)))
# Normalize numeric columns.
for column, dtype in dtypes:
if dtype == 'float32':
dataframe[column] -= dataframe[column].mean()
dataframe[column] /= dataframe[column].std()
return dataframe
def load_data(training_file_path, eval_file_path, *args, **kwargs):
"""Loads data into preprocessed (train_x, train_y, eval_y, eval_y)
dataframes.
Args:
training_file_path: GCS file location for training files
eval_file_path: GCS file location for eval files
Returns:
A tuple (train_x, train_y, eval_x, eval_y), where train_x and eval_x are
Pandas dataframes with features for training and train_y and eval_y are
numpy arrays with the corresponding labels.
"""
# TODO Download and clean custom files.
print('Location train file: {}, eval file {}'.format(training_file_path,
eval_file_path))
training_file_path, eval_file_path = download(DATA_DIR)
# This census data uses the value '?' for missing entries. We use
# na_values to
# find ? and set it to NaN.
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv
# .html
train_df = pd.read_csv(training_file_path, names=_CSV_COLUMNS,
na_values='?')
eval_df = pd.read_csv(eval_file_path, names=_CSV_COLUMNS, na_values='?')
train_df = preprocess(train_df)
eval_df = preprocess(eval_df)
# Split train and eval data with labels. The pop method copies and removes
# the label column from the dataframe.
train_x, train_y = train_df, train_df.pop(_LABEL_COLUMN)
eval_x, eval_y = eval_df, eval_df.pop(_LABEL_COLUMN)
# Join train_x and eval_x to normalize on overall means and standard
# deviations. Then separate them again.
all_x = pd.concat([train_x, eval_x], keys=['train', 'eval'])
all_x = standardize(all_x)
train_x, eval_x = all_x.xs('train'), all_x.xs('eval')
# Reshape label columns for use with tf.data.Dataset
train_y = np.asarray(train_y).astype('float32').reshape((-1, 1))
eval_y = np.asarray(eval_y).astype('float32').reshape((-1, 1))
return train_x, train_y, eval_x, eval_y
def get_run_id(size):
chars = string.ascii_uppercase + string.ascii_lowercase
return ''.join(random.choice(chars) for _ in range(size))