in covid19_spread/data/usa/google/process_open_data.py [0:0]
def main():
    """Entry point of the open-data processing script."""
    # Full location index published alongside the open-data tables.
    all_locations = pandas.read_csv(
        "https://storage.googleapis.com/covid19-open-data/v2/index.csv"
    )
    # Rows whose key is exactly "US_" followed by capital letters; keys that
    # are missing (NaN match result) are treated as non-matching.
    is_state_key = all_locations["key"].str.match("^US_[A-Z]+$").fillna(False)
    state_index = all_locations[is_state_key]
    # Index used for every non-state resolution.
    # NOTE(review): presumably county-level rows -- confirm against get_index().
    index = get_index()
def zscore(piv):
    """Standardize ``piv`` column by column and fill the remaining gaps.

    Each column has its mean subtracted and is divided by its standard
    deviation; missing values are ignored when computing both statistics.
    Gaps are then filled from the previous row and, for leading gaps, from
    the next row.

    Args:
        piv: numeric DataFrame (rows are forward/backward filled in order).

    Returns:
        DataFrame of the same shape holding the z-scores.  A column with no
        data, or with zero standard deviation, is left as NaN.
    """
    # z-score
    piv = (piv - piv.mean(skipna=True)) / piv.std(skipna=True)
    # ``fillna(method=...)`` is deprecated in recent pandas releases; the
    # dedicated ffill()/bfill() methods perform the same fill.
    return piv.ffill().bfill()
def zero_one(df):
    """Scale every column of ``df`` by that column's maximum.

    Missing values are treated as 0 before scaling.  Entries that come out
    as NaN after the division (e.g. 0 / 0 for an all-zero column) are set
    to 0 as well.
    """
    filled = df.fillna(0)
    column_max = filled.max(axis=0)
    return (filled / column_max).fillna(0)
def process_df(df, columns, resolution, func_normalize):
    """Reshape a long-format table into one row per (region, feature).

    ``df`` is joined on ``key`` with ``state_index`` (when ``resolution`` is
    "state") or ``index`` (otherwise); both are read from the enclosing
    scope.  The result is pivoted to dates x regions for each entry of
    ``columns``, optionally normalized, and then transposed.

    Args:
        df: table with at least ``key``, ``date`` and the requested columns.
        columns: list of value columns to turn into features.
        resolution: "state" selects the state index; anything else selects
            the other index.
        func_normalize: callable applied to the pivoted frame, or None to
            skip normalization.

    Returns:
        DataFrame indexed by region, with a ``type`` column naming the
        feature followed by one column per date (formatted via
        ``str(date)``).  Remaining NaNs are replaced with 0.
    """
    by_state = resolution == "state"
    merged = df.merge(state_index if by_state else index, on="key")
    if by_state:
        # Keys dropped from the state-level output.
        exclude = {"US_MP", "US_AS", "US_GU", "US_VI", "US_PR"}
        merged = merged[~merged["key"].isin(exclude)]
        merged["region"] = merged["subregion1_name"]
    else:
        merged["region"] = merged["name"] + ", " + merged["subregion1_name"]

    piv = merged.pivot(index="date", columns="region", values=columns)
    if func_normalize is not None:
        piv = func_normalize(piv)

    # The pivot's first column level is the feature name; emit one
    # regions x dates frame per feature, tagged with that name.
    frames = []
    for feature in piv.columns.get_level_values(0).unique():
        frame = piv[feature].transpose()
        frame["type"] = feature
        frames.append(frame)
    stacked = pandas.concat(frames)

    # Put "type" first, keep only the date columns after it, and label the
    # date columns with their date string.
    date_columns = [c for c in stacked.columns if isinstance(c, datetime)]
    stacked = stacked[["type"] + date_columns]
    stacked.columns = ["type"] + [str(c.date()) for c in date_columns]
    return stacked.fillna(0)  # in case all values are NaN
def get_df(url):
    """Read the CSV at ``url`` and keep the rows whose ``key`` starts with "US".

    The ``date`` column is parsed as dates.  Rows with a missing ``key`` are
    dropped.  When ``url`` contains "weather" the file is read in chunks and
    each chunk is filtered before the pieces are concatenated.
    """

    def us_rows(frame):
        # Non-null keys that begin with "US".
        keys = frame["key"]
        return frame[~keys.isnull() & keys.str.startswith("US")]

    if "weather" not in url:
        return us_rows(pandas.read_csv(url, parse_dates=["date"]))

    # This dataset is quite large. Iterate in chunks, and filter out non-US rows
    reader = pandas.read_csv(url, parse_dates=["date"], chunksize=200000)
    kept = [us_rows(chunk) for chunk in reader]
    return us_rows(pandas.concat(kept))
def do_feature(url, columns, resolution, func_normalize, outfile):
    """Fetch one dataset, convert it to feature rows and write them as CSV.

    Args:
        url: location of the source CSV (passed to ``get_df``).
        columns: value columns to export as features.
        resolution: forwarded to ``process_df``.
        func_normalize: forwarded to ``process_df``.
        outfile: destination path of the CSV, indexed by region and type.
    """
    print(f"Fetching {url}")
    features = process_df(
        get_df(url),
        columns=columns,
        resolution=resolution,
        func_normalize=func_normalize,
    )
    index_columns = ["region", "type"]
    features = features.reset_index().set_index(index_columns)
    features.to_csv(outfile, index_label=index_columns)
# Column set shared by the two weather exports.
weather_columns = [
    "average_temperature",
    "minimum_temperature",
    "maximum_temperature",
    "rainfall",
    "relative_humidity",
    "dew_point",
]


def nonnegative_zero_one(frame):
    """Clip negative values to 0, then scale with ``zero_one``."""
    return zero_one(frame.clip(0, None))


# One entry per output file, processed in order:
# (url, columns, resolution, normalizer, output file name).
feature_jobs = [
    # --- Vaccination data ---
    (
        "https://storage.googleapis.com/covid19-open-data/v2/vaccinations.csv",
        ["new_persons_vaccinated", "total_persons_vaccinated"],
        "state",
        zero_one,
        "vaccination_state.csv",
    ),
    # --- Hospitalizations ---
    (
        "https://storage.googleapis.com/covid19-open-data/v2/hospitalizations.csv",
        ["current_hospitalized", "current_intensive_care", "current_ventilator"],
        "state",
        nonnegative_zero_one,
        "hosp_features_state.csv",
    ),
    # --- Weather features ---
    (
        "https://storage.googleapis.com/covid19-open-data/v2/weather.csv",
        weather_columns,
        "state",
        zscore,
        "weather_features_state.csv",
    ),
    (
        "https://storage.googleapis.com/covid19-open-data/v2/weather.csv",
        weather_columns,
        "county",
        zscore,
        "weather_features_county.csv",
    ),
    # --- Epi features ---
    (
        "https://storage.googleapis.com/covid19-open-data/v2/epidemiology.csv",
        ["new_confirmed"],
        "state",
        nonnegative_zero_one,
        "epi_features_state.csv",
    ),
    (
        "https://storage.googleapis.com/covid19-open-data/v2/epidemiology.csv",
        ["new_confirmed"],
        "county",
        nonnegative_zero_one,
        "epi_features_county.csv",
    ),
]
for job_url, job_columns, job_resolution, job_normalizer, job_file in feature_jobs:
    do_feature(
        url=job_url,
        columns=job_columns,
        resolution=job_resolution,
        func_normalize=job_normalizer,
        outfile=os.path.join(SCRIPT_DIR, job_file),
    )
# ---- Testing -----
# Two state-level files are produced from the epidemiology table: the
# normalized number of new tests, and the confirmed/tested ratio.
print("Getting Google testing data...")
df = get_df("https://storage.googleapis.com/covid19-open-data/v2/epidemiology.csv")
testing = process_df(
    df,
    columns=["new_tested"],
    resolution="state",
    func_normalize=lambda x: zero_one(x.clip(0, None)),
)
testing.round(3).to_csv(os.path.join(SCRIPT_DIR, "tested_total_state.csv"))

ratio = df["new_confirmed"] / df["new_tested"]
# Dividing by zero tests yields +/-inf, which process_df's fillna(0) would not
# touch and which would end up in the CSV.  Treat those entries as missing so
# they become 0, the same way 0/0 (NaN) already does.
ratio = ratio.replace([float("inf"), float("-inf")], float("nan"))
# assign() returns a new frame instead of writing a column into the filtered
# frame returned by get_df.
df = df.assign(ratio=ratio)
testing = process_df(
    df, columns=["ratio"], resolution="state", func_normalize=None,
)
testing.round(3).to_csv(os.path.join(SCRIPT_DIR, "tested_ratio_state.csv"))