def process()

in data/download_and_preprocess_citibike.py [0:0]


def process(root="", year=2019):
    """Build subsampled per-day Citi Bike event sequences and save them to disk.

    Reads the twelve monthly trip CSVs for ``year`` from
    ``<root>/citibike/raw/{year}{month:02d}-citibike-tripdata.csv``, shifts
    every start time back by five hours, splits the trips by calendar day of
    the shifted start time, and for each retained day draws 20 independent
    random subsamples (each trip kept with probability 1/500). Every
    subsample is passed through ``add_spatial_noise`` and stored under the
    key ``"{YYYYMMDD}_{i:03d}"``.

    Before noise is added, each sequence is an array of shape ``(n, 3)`` with
    columns ``[t, x, y]``: time of day in hours, start-station longitude and
    start-station latitude.

    Args:
        root: Directory that contains the ``citibike/raw`` folder holding the
            monthly CSV files.
        year: Year of the trip data to load.

    Side effects:
        Seeds NumPy's global RNG with 0, prints each day's name and the size
        of every subsample, and writes ``citibike/citibike.npz`` relative to
        the current working directory.
    """
    np.random.seed(0)

    lon_col = "start station longitude"
    lat_col = "start station latitude"
    n_subsamples = 20
    keep_prob = 1 / 500

    monthly_frames = [
        pd.read_csv(
            os.path.join(root, "citibike", "raw", f"{year}{month:02d}-citibike-tripdata.csv")
        )
        for month in tqdm(range(1, 13))
    ]
    trips = pd.concat(monthly_frames)

    # Only the two coordinate columns are needed. Taking the std of the whole
    # frame raises TypeError on pandas >= 2.0 (numeric_only defaults to False
    # and the table has string columns) and wastes work on older versions.
    coord_stds = trips[[lon_col, lat_col]].std(0)
    std_x = coord_stds[lon_col]
    std_y = coord_stds[lat_col]

    # NOTE(review): the fixed five-hour shift moves the day boundary; the
    # reason is not stated in the code -- confirm the intent before changing.
    trips["starttime"] = pd.to_datetime(trips["starttime"]) - pd.Timedelta(hours=5)

    # Group by a positional array of dates rather than a Series: the
    # concatenated frame has a repeated index, so index alignment is unsafe.
    day_keys = pd.DatetimeIndex(trips["starttime"]).date
    daily_frames = [day_df for _, day_df in trips.groupby(day_keys)]

    sequences = {}
    # The first three day-groups and the last one are discarded.
    # NOTE(review): presumably these are partial/outlier days -- confirm.
    for day_df in daily_frames[3:-1]:
        stamps = pd.DatetimeIndex(day_df["starttime"])
        first = stamps[0]
        seq_name = f"{first.year}{first.month:02d}" + f"{first.day:02d}"

        # Time of day in seconds (with sub-second precision), then in hours.
        seconds = (
            stamps.hour * 60 * 60
            + stamps.minute * 60
            + stamps.second
            + stamps.microsecond * 1e-6
        )
        t = np.array(seconds) / 60 / 60

        x = np.array(day_df[lon_col])
        y = np.array(day_df[lat_col])

        seq = np.stack([t, x, y], axis=1)

        print(seq_name)

        n_events = seq.shape[0]
        for i in range(n_subsamples):
            # Bernoulli thinning; redraw until at least one event survives so
            # no stored sequence is empty.
            keep_mask = np.random.rand(n_events) < keep_prob
            while not keep_mask.any():
                keep_mask = np.random.rand(n_events) < keep_prob

            # No noise on the time column; spatial noise is scaled to 2% of
            # the dataset-wide coordinate standard deviations.
            sequences[seq_name + f"_{i:03d}"] = add_spatial_noise(
                seq[keep_mask], std=[0., std_x * 0.02, std_y * 0.02]
            )

            print(np.sum(keep_mask))

    # NOTE(review): the output path ignores ``root`` and requires a
    # ``citibike`` directory in the current working directory. Left unchanged
    # to keep the existing output location for callers.
    np.savez("citibike/citibike.npz", **sequences)