in 1. Amazon SageMaker Processing/preprocess.py [0:0]
def clean_data(
    trip_df: pd.DataFrame,
    *,
    max_fare=200,
    max_duration_minutes=120,
    max_distance=121,
):
    """Drop outlier and incomplete trips, then return two column projections.

    A row is kept only when all of the following hold (every bound is
    exclusive):

    * ``0 < fare_amount < max_fare``
    * ``passenger_count > 0``
    * ``0 < duration_minutes < max_duration_minutes``
    * ``0 < geo_distance < max_distance``

    After that filter, any row that still has a missing value in *any*
    column of ``trip_df`` is dropped. ``dropna()`` runs before the column
    projection, so columns that are not returned still take part in it.

    Args:
        trip_df: Trip records. Must provide the filter columns above plus
            every column named in the returned frames.
        max_fare: Exclusive upper bound for ``fare_amount``.
        max_duration_minutes: Exclusive upper bound for ``duration_minutes``.
        max_distance: Exclusive upper bound for ``geo_distance``.

    Returns:
        A ``(features, features_fg)`` tuple of DataFrames over the same
        surviving rows. ``features`` holds the ten feature columns;
        ``features_fg`` holds those same columns followed by ``FS_ID`` and
        ``FS_time``.
    """
    # Remove outliers: build the row mask once, then drop incomplete rows.
    keep = (
        (trip_df.fare_amount > 0)
        & (trip_df.fare_amount < max_fare)
        & (trip_df.passenger_count > 0)
        & (trip_df.duration_minutes > 0)
        & (trip_df.duration_minutes < max_duration_minutes)
        & (trip_df.geo_distance > 0)
        & (trip_df.geo_distance < max_distance)
    )
    cleaned = trip_df[keep].dropna()

    # Filter columns.
    feature_cols = [
        "fare_amount",
        "passenger_count",
        "pickup_latitude",
        "pickup_longitude",
        "dropoff_latitude",
        "dropoff_longitude",
        "geo_distance",
        "hour",
        "weekday",
        "month",
    ]
    # Derived from feature_cols (not re-typed) so the two projections can
    # never drift apart. NOTE(review): FS_ID / FS_time look like feature-store
    # record id and event time -- confirm against the caller.
    feature_group_cols = feature_cols + ["FS_ID", "FS_time"]

    return cleaned[feature_cols], cleaned[feature_group_cols]