in scripts/coxph_preprocessing.py [0:0]
def main(args):
"""
Runs preprocessing for the example data set
1. Pulls data from the Athena database
2. Splits data into training and testing
3. Preprocesses categorical and numerical features
4. Writes preprocessed data to S3
Args:
database (str, required): Athena database to query data from
table (str, required): Athena table name to query data from
region (str, required): AWS Region for queries
train-test-split-ratio (float): Ratio used to split the data into
training and testing sets, default is 25%
random-state (float): Random seed used for the train and test split,
default is 123
"""
logger.debug(f"Received arguments {args}")
DATABASE, TABLE, REGION = args.database, args.table, args.region
logger.info("Querying Athena...")
boto3.setup_default_session(region_name=f"{REGION}")
df = wr.athena.read_sql_query(
f'SELECT * FROM "{TABLE}"', database=DATABASE, ctas_approach=False
)
df = df[columns]
df = df.astype(col_type)
logger.info(df.dtypes)
df["event"] = np.where(df["churn?"] == "False.", 0, 1)
del df["churn?"]
df = df.rename(columns={"account length": "duration"})
df = df.drop(["area code", "phone"], 1)
df = df.dropna()
df = df.drop_duplicates()
negative_examples, positive_examples = df["event"].value_counts().values
print(