examples/kfold_cv/prepare_classification_data_set.py
#!/usr/bin/env python
# coding: utf-8
# Download and prepare training data set
# Create Ludwig model definition file
#
# Based on the [UCI Breast Cancer Wisconsin (Diagnostic) data set](https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic))
#
import os.path
import shutil
import pandas as pd
import requests
import yaml
from sklearn.model_selection import train_test_split
# Constants
DATA_SET_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data'
DATA_SET = 'wdbc.data'
DATA_DIR = './data'
RESULTS_DIR = 'results'
# Clean out previous results
print("Cleaning out old results")
if os.path.isfile(DATA_SET):
    os.remove(DATA_SET)
if os.path.isfile('model_definition.yaml'):
    os.remove('model_definition.yaml')
shutil.rmtree(RESULTS_DIR, ignore_errors=True)
shutil.rmtree(DATA_DIR, ignore_errors=True)
# Retrieve data from UCI Machine Learning Repository
# Download required data
print("Downloading data set")
r = requests.get(DATA_SET_URL)
if r.status_code == 200:
    with open(DATA_SET, 'w') as f:
        f.write(r.content.decode("utf-8"))
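else:
    # Illustrative guard (assumption: any non-200 response means the data set
    # could not be retrieved, so there is nothing useful to prepare).
    raise RuntimeError(
        'Failed to download {}: HTTP status {}'.format(DATA_SET_URL, r.status_code)
    )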
# create pandas dataframe from downloaded data
print("Preparing data for training")
raw_df = pd.read_csv(DATA_SET,
                     header=None,
                     sep=",", skipinitialspace=True)
raw_df.columns = ['ID', 'diagnosis'] + ['X' + str(i) for i in range(1, 31)]
# convert diagnosis attribute to binary format
raw_df['diagnosis'] = raw_df['diagnosis'].map({'M': 1, 'B': 0})
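# Quick sanity check (illustrative): after mapping, the label column should
# contain only 1 (malignant) and 0 (benign) values.
print(raw_df['diagnosis'].value_counts())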
# Create train/test split
print("Saving training and test data sets")
train_df, test_df = train_test_split(raw_df, train_size=0.8, random_state=17)
os.mkdir(DATA_DIR)
train_df.to_csv(os.path.join(DATA_DIR, 'train.csv'), index=False)
test_df.to_csv(os.path.join(DATA_DIR, 'test.csv'), index=False)
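# Report the split sizes; with an 80/20 split of the 569 WDBC rows this is
# roughly 455 training and 114 test examples.
print('train rows: {}, test rows: {}'.format(len(train_df), len(test_df)))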
print("Preparing Ludwig model definition")
# Create ludwig input_features
num_features = ['X' + str(i) for i in range(1, 31)]
input_features = []
# setup input features for numerical variables
for p in num_features:
    a_feature = {'name': p, 'type': 'numerical',
                 'preprocessing': {'missing_value_strategy': 'fill_with_mean',
                                   'normalization': 'zscore'}}
    input_features.append(a_feature)
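# Each entry produced above has the form (for example, for X1):
#   {'name': 'X1', 'type': 'numerical',
#    'preprocessing': {'missing_value_strategy': 'fill_with_mean',
#                      'normalization': 'zscore'}}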
# Create ludwig output features
output_features = [
    {
        'name': 'diagnosis',
        'type': 'binary',
        'num_fc_layers': 2,
        'fc_size': 64
    }
]
# setup ludwig model definition
model_definition = {
    'input_features': input_features,
    'output_features': output_features,
    'training': {
        'epochs': 20,
        'batch_size': 32
    }
}
with open('model_definition.yaml', 'w') as f:
    yaml.dump(model_definition, f)
print("Completed data preparation")