in 1. Amazon SageMaker Processing/preprocess.py [0:0]
def main(base_dir: str, args: argparse.Namespace):
    """Run the preprocessing pipeline and save its outputs.

    Finds the input CSV files and the taxi zones archive under ``base_dir``,
    reads the current host from the resource config file, then extracts and
    loads the zones, loads, enriches and cleans the data, and passes the
    results to ``save_files``.

    Args:
        base_dir: Root directory expected to contain ``input/data`` (CSV
            files) and ``input/zones`` (``taxi_zones.zip``).
        args: Parsed command-line arguments. The attributes
            ``ingest_featuregroup_name``, ``region`` and ``bucket`` are read.

    Returns:
        The value returned by ``save_files``.

    Raises:
        Exception: If no CSV file is found in the data directory, or if the
            zones archive does not exist.
    """
    # Locate the CSV inputs.
    data_dir = os.path.join(base_dir, "input/data")
    csv_paths = glob.glob(f"{data_dir}/*.csv")
    logger.info(f"Input file list: {csv_paths}")

    # Read the resource config to learn which host this is. This happens
    # before the input validation below, so the config is always logged.
    resource_config = _read_json("/opt/ml/config/resourceconfig.json")
    logger.info(resource_config)
    current_host = resource_config["current_host"]
    logger.info(current_host)

    if not csv_paths:
        raise Exception(f"No input files found in {data_dir}")

    # Locate the zones archive.
    zones_dir = os.path.join(base_dir, "input/zones")
    zones_archive = os.path.join(zones_dir, "taxi_zones.zip")
    if not os.path.exists(zones_archive):
        raise Exception(f"Zones file {zones_archive} does not exist")

    # Unpack the archive in place, then load the zones dataframe from it.
    extract_zones(zones_archive, zones_dir)
    zones = load_zones(zones_dir)

    # Load -> enrich -> clean. The same name is rebound at each step so that
    # intermediate frames are not kept alive longer than necessary.
    frame = load_data(csv_paths)
    frame = enrich_data(frame, zones)
    frame, feature_group_frame = clean_data(frame)

    feature_group_name = args.ingest_featuregroup_name
    session = get_session(args.region, args.bucket)

    return save_files(
        base_dir,
        frame,
        feature_group_frame,
        feature_group_name,
        current_host=current_host,
        sagemaker_session=session,
    )