in 1. Amazon SageMaker Processing/preprocess.py [0:0]
def enrich_data(trip_df: pd.DataFrame, zone_df: pd.DataFrame):
    """Enrich trip records with zone data, date parts and record metadata.

    Args:
        trip_df: Trip records. Must contain ``PULocationID``,
            ``DOLocationID``, ``lpep_pickup_datetime`` and
            ``lpep_dropoff_datetime`` columns.
        zone_df: Zone lookup whose index holds the location IDs (the join
            matches the trip ID columns against this index). The code below
            relies on it providing ``centroid``, ``latitude`` and
            ``longitude`` columns.

    Returns:
        A new GeoDataFrame with the joined zone columns (suffixed ``_PU`` /
        ``_DO``), ``geo_distance``, ``hour``, ``weekday``, ``month``,
        ``duration_minutes``, renamed pickup/dropoff coordinate columns, and
        the ``FS_ID`` / ``FS_time`` columns.
    """
    # Join trip DF to zones for both pickup and drop off locations. The first
    # join adds the zone columns unsuffixed; the second join then suffixes the
    # overlapping columns with "_PU" (already present) and "_DO" (new).
    trip_df = gpd.GeoDataFrame(
        trip_df.join(zone_df, on="PULocationID").join(
            zone_df, on="DOLocationID", rsuffix="_DO", lsuffix="_PU"
        )
    )
    # NOTE(review): dividing by 1000 presumably converts metres to km, which
    # assumes the centroids are in a metre-based projected CRS -- confirm.
    trip_df["geo_distance"] = (
        trip_df["centroid_PU"].distance(trip_df["centroid_DO"]) / 1000
    )

    # Add date parts
    trip_df["lpep_pickup_datetime"] = pd.to_datetime(trip_df["lpep_pickup_datetime"])
    pickup_dt = trip_df["lpep_pickup_datetime"].dt
    trip_df["hour"] = pickup_dt.hour
    trip_df["weekday"] = pickup_dt.weekday
    trip_df["month"] = pickup_dt.month

    # Get calculated duration in minutes. Use total_seconds(): `.dt.seconds`
    # is only the seconds component (0..86399), so it drops whole days and
    # wraps negative durations into large positive values.
    trip_df["lpep_dropoff_datetime"] = pd.to_datetime(trip_df["lpep_dropoff_datetime"])
    trip_df["duration_minutes"] = (
        trip_df["lpep_dropoff_datetime"] - trip_df["lpep_pickup_datetime"]
    ).dt.total_seconds() / 60

    # Rename the zone coordinate columns
    trip_df = trip_df.rename(
        columns={
            "latitude_PU": "pickup_latitude",
            "longitude_PU": "pickup_longitude",
            "latitude_DO": "dropoff_latitude",
            "longitude_DO": "dropoff_longitude",
        }
    )

    # Record identifier derived from the frame's index, offset by 1000.
    trip_df["FS_ID"] = trip_df.index + 1000

    # Timestamp (whole seconds since the epoch, stored as float64) shared by
    # every row. Build the Series on trip_df's own index: column assignment
    # aligns on index labels, so a default 0..n-1 index would leave NaN in any
    # row whose label is not in that range.
    current_time_sec = int(round(time.time()))
    trip_df["FS_time"] = pd.Series(
        current_time_sec, index=trip_df.index, dtype="float64"
    )
    return trip_df