def main()

in scripts/preprocessing.py [0:0]


def main(args):
    """
    Runs preprocessing for the example data set
        1. Pulls data from the Athena database
        2. Splits data into training and testing
        3. Preprocesses categorical and numerical features
        4. Writes preprocessed data to S3

    Args:
        database (str, required): Athena database to query data from
        table (str, required): Athena table name to query data from
        region (str, required): AWS Region for queries
        train-test-split-ratio (float): Percentage used to split the data
            into training and testing, default is 25%
        random-state (float): Random seed used for train and test split,
            default is 123


    """
    logger.debug(f"Received arguments {args}")
    DATABASE, TABLE, region = args.database, args.table, args.region

    boto3.setup_default_session(region_name=f"{region}")
    df = wr.athena.read_sql_query(
        f'SELECT * FROM "{TABLE}"', database=DATABASE, ctas_approach=False
    )
    df = df[columns]
    df = df.astype(col_type)
    logger.info(df.dtypes)

    df = df.drop(["area code", "phone"], 1)
    df = df.dropna()
    df = df.drop_duplicates()

    # fix class labels to binary
    df[target_col] = df[target_col].replace(class_labels, [1, 0])

    negative_examples, positive_examples = df[target_col].value_counts().values
    logger.info(