in scripts/preprocessing.py [0:0]
def main(args):
"""
Runs preprocessing for the example data set
1. Pulls data from the Athena database
2. Splits data into training and testing
3. Preprocesses categorical and numerical features
4. Writes preprocessed data to S3
Args:
database (str, required): Athena database to query data from
table (str, required): Athena table name to query data from
region (str, required): AWS Region for queries
train-test-split-ratio (float): Percentage of the data used for the train/test split, default is 25%
random-state (float): Random seed used for the train and test split, default is 123
"""
logger.debug(f"Received arguments {args}")
DATABASE, TABLE, region = args.database, args.table, args.region
boto3.setup_default_session(region_name=f"{region}")
df = wr.athena.read_sql_query(
f'SELECT * FROM "{TABLE}"', database=DATABASE, ctas_approach=False
)
df = df[columns]
df = df.astype(col_type)
logger.info(df.dtypes)
df = df.drop(["area code", "phone"], 1)
df = df.dropna()
df = df.drop_duplicates()
# fix class labels to binary
df[target_col] = df[target_col].replace(class_labels, [1, 0])
negative_examples, positive_examples = df[target_col].value_counts().values
logger.info(