# def main()
#
# in src/data_utils.py [0:0]


def main():
    """
    Build the per-region satellite datasets, then split them into train/test.

    For every region listed in ``zones.csv`` the region's (lon, lat) point,
    the date range covering the selected year, the source collection and the
    band selector are passed to ``get_data_by_zone_year``; the result is
    handed to ``save_regional_data`` together with the metadata dict and the
    target bucket. Once all regions are processed, ``split_dataset`` is
    called with the list of held-out areas, the bucket and the dataset folder.
    """
    # bucket that receives the dataset
    bucket_name = "sagemaker-gis"

    # imagery source, acquisition year and band selector
    collection_id = "LANDSAT/LC08/C01/T1_SR"
    target_year = 2015
    band_pattern = "B[1-7]"

    # January 1st through December 31st of the selected year
    period = [f"{target_year}-01-01", f"{target_year}-12-31"]
    metadata = {
        "src_dataset": collection_id.replace("/", "_"),
        "year": target_year,
    }

    # one representative coordinate per region, keyed by region name
    zones = pd.read_csv("zones.csv")
    zones = zones.set_index("region")

    # build and store the data region by region
    for region in zones.index:
        print(f"processing data for {region}...")
        lon_lat = zones.loc[region, ["lon", "lat"]].tolist()
        regional_data = get_data_by_zone_year(
            lon_lat, period, collection_id, band_pattern
        )
        # the same metadata dict is reused; only "poi" changes per region
        metadata["poi"] = region.replace(" ", "_")
        save_regional_data(regional_data, metadata, bucket_name)

    # areas held out for the test set; everything else is left for training
    held_out_areas = ["Vietnam2", "Myanmar3", "Cuba2", "India"]
    dataset_folder = "{}/Year{}".format(metadata["src_dataset"], metadata["year"])
    split_dataset(held_out_areas, bucket_name, dataset_folder)