def main()

in scripts/coxph_preprocessing.py [0:0]


def main(args):
    """
    Runs preprocessing for the example data set
        1. Pulls data from the Athena database
        2. Splits data into training and testing
        3. Preprocesses categorical and numerical features
        4. Writes preprocessed data to S3

    Args:
        database (str, required): Athena database to query data from
        table (str, required): Athena table name to query data from
        region (str, required): AWS Region for queries
        train-test-split-ratio (float): Ratio used to split the data into
            training and testing sets; default is 25%
        random-state (float): Random seed used for the train and test split;
            default is 123
    """
    logger.debug(f"Received arguments {args}")
    DATABASE, TABLE, REGION = args.database, args.table, args.region

    logger.info("Querying Athena...")
    boto3.setup_default_session(region_name=f"{REGION}")
    df = wr.athena.read_sql_query(
        f'SELECT * FROM "{TABLE}"', database=DATABASE, ctas_approach=False
    )
    df = df[columns]
    df = df.astype(col_type)
    logger.info(df.dtypes)

    df["event"] = np.where(df["churn?"] == "False.", 0, 1)
    del df["churn?"]
    df = df.rename(columns={"account length": "duration"})

    df = df.drop(["area code", "phone"], 1)
    df = df.dropna()
    df = df.drop_duplicates()

    negative_examples, positive_examples = df["event"].value_counts().values
    print(