in tutorials/tensorflow/mlflow_gcp/trainer/utils.py [0:0]
def load_data(training_file_path, eval_file_path, *args, **kwargs):
    """Build the preprocessed train/eval feature frames and label arrays.

    NOTE: the two path arguments are currently only echoed to stdout. The
    files that are actually read are the ones returned by
    ``download(DATA_DIR)`` (see the TODO in the body). Any extra positional
    or keyword arguments are accepted and ignored.

    Args:
      training_file_path: GCS file location for training files
      eval_file_path: GCS file location for eval files

    Returns:
      A tuple (train_x, train_y, eval_x, eval_y). train_x and eval_x are the
      feature tables, split back out of the jointly standardized frame;
      train_y and eval_y are float32 numpy arrays of shape (n, 1) holding
      the corresponding labels.
    """

    def _as_label_column(labels):
        # tf.data.Dataset wants the labels as a float32 column vector.
        return np.asarray(labels).astype('float32').reshape((-1, 1))

    # TODO Download and clean custom files.
    print('Location train file: {}, eval file {}'.format(training_file_path,
                                                         eval_file_path))
    # The caller-supplied locations are not used for reading yet; the data
    # comes from whatever download() places under DATA_DIR.
    local_train_path, local_eval_path = download(DATA_DIR)

    # Missing entries in this census data are written as '?'; na_values
    # makes pandas turn them into NaN while parsing. See
    # https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html
    raw_train, raw_eval = (
        pd.read_csv(csv_path, names=_CSV_COLUMNS, na_values='?')
        for csv_path in (local_train_path, local_eval_path)
    )
    train_frame = preprocess(raw_train)
    eval_frame = preprocess(raw_eval)

    # pop() removes the label column in place and hands it back, so after
    # these two calls each frame contains features only.
    train_labels = train_frame.pop(_LABEL_COLUMN)
    eval_labels = eval_frame.pop(_LABEL_COLUMN)

    # Standardize both splits in a single pass over their concatenation so
    # they are scaled together, then select each split back out through the
    # outer index key it was tagged with.
    combined = standardize(
        pd.concat([train_frame, eval_frame], keys=['train', 'eval']))
    train_features = combined.xs('train')
    eval_features = combined.xs('eval')

    return (
        train_features,
        _as_label_column(train_labels),
        eval_features,
        _as_label_column(eval_labels),
    )