scripts/inferpreprocessing.py
import logging
import os

import awswrangler as wr
import boto3
import joblib
import pandas as pd
from denseclus import DenseClus

logger = logging.getLogger(__name__)

# NOTE: `columns` and `col_type` are assumed to be module-level constants
# (the expected column subset and dtype mapping) defined elsewhere in this
# script; they are referenced but not shown in this excerpt.


def main(args):
    """
    Runs preprocessing for the example data set:
    1. Pulls data from the Athena database
    2. Transforms features using the saved preprocessor
    3. Writes preprocessed data to the processing output path, which
       SageMaker Processing uploads to S3

    Args:
        database (str, required): Athena database to query data from
        table (str, required): Athena table name to query data from
        region (str, required): AWS Region for queries
        coxph (bool): Flag indicating a Cox proportional hazards model,
            default False
        cluster (bool): Flag to fit DenseClus segments before preprocessing,
            default False
    """
logger.info(f"Received arguments {args}")
DATABASE, TABLE, region = args.database, args.table, args.region
boto3.setup_default_session(region_name=f"{region}")
df = wr.athena.read_sql_query(
f'SELECT * FROM "{TABLE}"', database=DATABASE, ctas_approach=False
)
df = df[columns]
df = df.astype(col_type)
logger.info(df.dtypes)
df = df.drop(["area code", "phone"], 1)
df = df.dropna()
if args.coxph:
del df["account length"]
    # No fit_predict method is currently supported for DenseClus
    # See: https://github.com/awslabs/amazon-denseclus/issues/4
    if args.cluster:
        logger.info("Clustering data")
        clf = DenseClus()
        clf.fit(df)
        logger.info("Clusters fit")
        # score() returns the fitted cluster labels, kept as a string-typed
        # categorical feature
        df["segments"] = clf.score()
        df["segments"] = df["segments"].astype(str)
logger.info("Load Preprocessing Model")
preprocess = joblib.load("/opt/ml/processing/transformer/preprocessor.joblib")
logger.info("Running feature engineering transformations")
test_features = preprocess.transform(df)
logger.info(f"Infer data shape after preprocessing: {test_features.shape}")
test_features_output_path = os.path.join(
"/opt/ml/processing/infer", "infer_features.csv"
)
if isinstance(test_features, pd.DataFrame):
test_features.to_csv(test_features_output_path, header=False, index=False)
else:
pd.DataFrame(test_features).to_csv(
test_features_output_path, header=False, index=False
)
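

# main() reads its inputs from an `args` namespace. Below is a minimal
# entrypoint sketch, assuming the script is launched (e.g., as a SageMaker
# Processing job) with command-line flags; the flag names mirror the
# attributes read in main() above, but the exact CLI surface is an
# assumption, not the repository's confirmed interface.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--database", type=str, required=True)
    parser.add_argument("--table", type=str, required=True)
    parser.add_argument("--region", type=str, required=True)
    parser.add_argument("--coxph", action="store_true", default=False)
    parser.add_argument("--cluster", action="store_true", default=False)
    main(parser.parse_args())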