def main()

in covid19_spread/data/usa/fb/process_mobility.py [0:0]
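
# Assumed module-level imports for this excerpt (the [0:0] range omits the
# file header). SCRIPT_DIR and get_index() are provided by the surrounding
# covid19_spread.data.usa module; the Configuration import path varies by
# hdx-python-api version (older releases expose hdx.hdx_configuration).
import os
import re
import shutil
from glob import glob

import numpy as np
import pandas as pd
from hdx.api.configuration import Configuration
from hdx.data.dataset import Dataset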


def main():
    Configuration.create(
        hdx_site="prod", user_agent="A_Quick_Example", hdx_read_only=True
    )
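    # Look up the Facebook "Movement Range Maps" dataset on the Humanitarian
    # Data Exchange (HDX) and enumerate its downloadable resources.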
    dataset = Dataset.read_from_hdx("movement-range-maps")
    resources = dataset.get_resources()
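    # Keep only the dated archive, i.e. a URL ending in
    # movement-range-data-YYYY-MM-DD.zip; exactly one match is expected.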
    resource = [
        x
        for x in resources
        if re.match(".*/movement-range-data-\d{4}-\d{2}-\d{2}\.zip", x["url"])
    ]
    assert len(resource) == 1
    resource = resource[0]
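    # Download the archive and unpack it into a fresh fb_mobility directory,
    # discarding any leftovers from a previous run.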
    url, path = resource.download()
    if os.path.exists(f"{SCRIPT_DIR}/fb_mobility"):
        shutil.rmtree(f"{SCRIPT_DIR}/fb_mobility")
    shutil.unpack_archive(path, f"{SCRIPT_DIR}/fb_mobility", "zip")

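    # Map county FIPS codes to human-readable "County, State" labels.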
    fips_map = get_index()
    fips_map["location"] = fips_map["name"] + ", " + fips_map["subregion1_name"]

    cols = [
        "date",
        "region",
        "all_day_bing_tiles_visited_relative_change",
        "all_day_ratio_single_tile_users",
    ]

    def get_county_mobility_fb(fin):
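        # Load the global movement-range TSV; parse "ds" as dates and keep
        # polygon_id as a string so FIPS codes retain leading zeros.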
        df_mobility_global = pd.read_csv(
            fin, parse_dates=["ds"], delimiter="\t", dtype={"polygon_id": str}
        )
        df_mobility_usa = df_mobility_global.query("country == 'USA'")
        return df_mobility_usa

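    # The archive should contain exactly one movement-range*.txt data file.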
    txt_files = glob(f"{SCRIPT_DIR}/fb_mobility/movement-range*.txt")
    assert len(txt_files) == 1
    fin = txt_files[0]
    df = get_county_mobility_fb(fin)
    df = df.rename(columns={"ds": "date", "polygon_id": "region"})

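    # Attach "County, State" labels by joining FIPS codes, then replace the
    # raw FIPS region column with those labels.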
    df = df.merge(fips_map, left_on="region", right_on="fips")[
        list(df.columns) + ["location"]
    ]
    df = df.drop(columns="region").rename(columns={"location": "region"})

    def zscore(df):
        # Standardize each column to z-scores. Note: this helper is defined
        # but not called anywhere below.
        return (df - df.mean(skipna=True)) / df.std(skipna=True)

    def process_df(df, cols):
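        # Pivot each region's daily series into wide form: one row per
        # (region, feature), one column per date.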
        df = df[cols].copy()
        regions = []
        for (name, _df) in df.groupby("region"):
            _df = _df.sort_values(by="date")
            _df = _df.drop_duplicates(subset="date")
            dates = _df["date"].to_list()
            assert len(dates) == len(np.unique(dates)), _df
            _df = _df.loc[:, ~_df.columns.duplicated()]
            _df = _df.drop(columns=["region", "date"]).transpose()
            # 7-day trailing average along the date axis; rolling over the
            # transpose avoids the rolling(..., axis=1) keyword, which newer
            # pandas releases have removed.
            _df = _df.T.rolling(7, min_periods=1).mean().T
            # convert relative change into absolute numbers
            _df.loc["all_day_bing_tiles_visited_relative_change"] += 1
            _df["region"] = [name] * len(_df)
            _df.columns = list(map(lambda x: x.strftime("%Y-%m-%d"), dates)) + [
                "region"
            ]
            regions.append(_df.reset_index())

        df = pd.concat(regions, axis=0, ignore_index=True)
        cols = ["region"] + [x for x in df.columns if x != "region"]
        df = df[cols]

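        # reset_index() above surfaced the feature name as an "index" column.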
        df = df.rename(columns={"index": "type"})
        return df

    county = process_df(df, cols)
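    # State-level view: average the counties within each state; region labels
    # are "County, State", so the last comma-separated token is the state.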
    state = df.copy()
    state["region"] = state["region"].apply(lambda x: x.split(", ")[-1])
    state = state.groupby(["region", "date"]).mean(numeric_only=True).reset_index()
    state = process_df(state, cols)
    county = county.fillna(0)
    state = state.fillna(0)
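    # Write 4-decimal CSVs next to this script for downstream feature loading.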
    county.round(4).to_csv(f"{SCRIPT_DIR}/mobility_features_county_fb.csv", index=False)
    state.round(4).to_csv(f"{SCRIPT_DIR}/mobility_features_state_fb.csv", index=False)
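

# Hypothetical entry point (an assumption for standalone use; the [0:0]
# excerpt does not show how the original module invokes main()):
if __name__ == "__main__":
    main()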