in targeted-marketing-python/build_model.py [0:0]
def create_data_sources(ml, data_s3_url, schema_fn, train_percent, name):
    """Create a training and a testing data source from one S3 data set.

    The first (train_percent)% of the data becomes the training data source.
    The remainder, commonly called the "test set", becomes a second data
    source that is used to evaluate the quality of the ML Model.

    Args:
        ml: Client object exposing ``create_data_source_from_s3`` (presumably
            a boto3 "machinelearning" client -- confirm against the caller).
        data_s3_url: Passed through as the ``DataLocationS3`` of both specs.
        schema_fn: Path of the schema file; its text is sent as ``DataSchema``.
        train_percent: Split point. Training covers 0..train_percent and
            testing covers train_percent..100.
        name: Prefix for the human-readable data source names.

    Returns:
        A ``(train_ds_id, test_ds_id)`` tuple of the generated ids.
    """
    # Read the schema once, and close the file instead of leaking the handle.
    with open(schema_fn) as schema_file:
        schema = schema_file.read()

    def create_split(percent_begin, percent_end, label):
        # b32encode returns bytes on Python 3, so decode before joining it to
        # the str prefix (the decoded form is also valid on Python 2).
        ds_id = 'ds-' + base64.b32encode(os.urandom(10)).decode('ascii')
        # Build a fresh spec for every request rather than mutating a dict
        # that has already been handed to the client.
        spec = {
            "DataLocationS3": data_s3_url,
            "DataRearrangement": json.dumps({
                "splitting": {
                    "percentBegin": percent_begin,
                    "percentEnd": percent_end
                }
            }),
            "DataSchema": schema,
        }
        ml.create_data_source_from_s3(
            DataSourceId=ds_id,
            DataSpec=spec,
            DataSourceName=name + " - " + label + " split",
            ComputeStatistics=True
        )
        return ds_id

    train_ds_id = create_split(0, train_percent, "training")
    print("Created training data set %s" % train_ds_id)

    test_ds_id = create_split(train_percent, 100, "testing")
    print("Created test data set %s" % test_ds_id)

    return (train_ds_id, test_ds_id)