scripts/inferpreprocessing.py
import logging
import os

import awswrangler as wr
import boto3
import joblib
import pandas as pd
from denseclus import DenseClus

logger = logging.getLogger(__name__)

# NOTE: `columns` and `col_type` are assumed to be module-level constants
# (the expected column subset and dtype mapping) defined elsewhere in this
# script; they are referenced but not shown in this excerpt.


def main(args):
    """
    Runs preprocessing for the example data set:
    1. Pulls data from the Athena database
    2. Transforms features using the saved preprocessor
    3. Writes preprocessed data to the processing output path, which
       SageMaker Processing uploads to S3

    Args:
        database (str, required): Athena database to query data from
        table (str, required): Athena table name to query data from
        region (str, required): AWS Region for queries
        coxph (bool): Flag indicating a Cox proportional hazards model,
            default False
        cluster (bool): Flag to fit DenseClus segments before preprocessing,
            default False
    """
logger.info(f"Received arguments {args}")
DATABASE, TABLE, region = args.database, args.table, args.region
boto3.setup_default_session(region_name=f"{region}")
df = wr.athena.read_sql_query(
f'SELECT * FROM "{TABLE}"', database=DATABASE, ctas_approach=False
)
df = df[columns]
df = df.astype(col_type)
logger.info(df.dtypes)
df = df.drop(["area code", "phone"], 1)
df = df.dropna()
if args.coxph:
del df["account length"]
    # No fit_predict method is currently supported for DenseClus
    # See: https://github.com/awslabs/amazon-denseclus/issues/4
    if args.cluster:
        logger.info("Clustering data")
        clf = DenseClus()
        clf.fit(df)
        logger.info("Clusters fit")
        # score() returns the fitted cluster labels, kept as a string-typed
        # categorical feature
        df["segments"] = clf.score()
        df["segments"] = df["segments"].astype(str)
logger.info("Load Preprocessing Model")
preprocess = joblib.load("/opt/ml/processing/transformer/preprocessor.joblib")
logger.info("Running feature engineering transformations")
test_features = preprocess.transform(df)
logger.info(f"Infer data shape after preprocessing: {test_features.shape}")
test_features_output_path = os.path.join(
"/opt/ml/processing/infer", "infer_features.csv"
)
if isinstance(test_features, pd.DataFrame):
test_features.to_csv(test_features_output_path, header=False, index=False)
else:
pd.DataFrame(test_features).to_csv(
test_features_output_path, header=False, index=False
)
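

# main() reads its inputs from an `args` namespace. Below is a minimal
# entrypoint sketch, assuming the script is launched (e.g., as a SageMaker
# Processing job) with command-line flags; the flag names mirror the
# attributes read in main() above, but the exact CLI surface is an
# assumption, not the repository's confirmed interface.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--database", type=str, required=True)
    parser.add_argument("--table", type=str, required=True)
    parser.add_argument("--region", type=str, required=True)
    parser.add_argument("--coxph", action="store_true", default=False)
    parser.add_argument("--cluster", action="store_true", default=False)
    main(parser.parse_args())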