in covid19_spread/data/usa/process_cases.py [0:0]
def get_nyt(metric="cases"):
    """Build a date-by-county table of a NYT COVID-19 metric.

    Reads ``us-counties.csv`` from the NYTimes covid-19-data repository,
    labels each county as ``"<subregion1_name>_<name>"`` using the index
    returned by ``get_index()``, and pivots to one column per location with
    a datetime index.

    For ``metric == "deaths"`` the NYT table is returned unchanged.
    Otherwise the columns whose label starts with ``"New York"`` are replaced
    by the New York State DOH per-county series, and both sources are
    truncated to the latest date they have in common.

    Args:
        metric: Column of the NYT CSV to pivot. NOTE(review): the DOH frame
            only has its cumulative-positives column renamed to ``"cases"``,
            so values other than ``"cases"``/``"deaths"`` presumably fail at
            the DOH pivot -- confirm against callers.

    Returns:
        pandas.DataFrame indexed by date with one column per location and
        missing values filled with 0.

    Raises:
        AssertionError: if, after truncation, the two sources do not end on
            the same date.
    """
    print("NYT")
    data_repo = update_repo("https://github.com/nytimes/covid-19-data.git")
    nyt = pandas.read_csv(
        os.path.join(data_repo, "us-counties.csv"), dtype={"fips": str}
    )
    index = get_index()
    nyt = nyt.merge(index[["fips", "subregion1_name", "name"]], on="fips")
    nyt["loc"] = nyt["subregion1_name"] + "_" + nyt["name"]
    pivot = nyt.pivot_table(values=metric, columns=["loc"], index="date")
    pivot = pivot.fillna(0)
    pivot.index = pandas.to_datetime(pivot.index)
    if metric == "deaths":
        return pivot

    # Swap out NYTimes NY state data with the NY DOH data.
    NYSTATE_URL = (
        "https://health.data.ny.gov/api/views/xdss-u53e/rows.csv?accessType=DOWNLOAD"
    )
    doh = pandas.read_csv(NYSTATE_URL).rename(
        columns={"Test Date": "date", "Cumulative Number of Positives": "cases"}
    )
    doh["loc"] = "New York_" + doh["County"]
    doh = doh.pivot_table(values=metric, columns=["loc"], index="date")
    # Append " County" to each DOH label (e.g. "New York_Albany County").
    doh.columns = [x + " County" for x in doh.columns]
    # The NYT labels each date as the date the report comes out, not the date
    # the data corresponds to. Add 1 day to the NYS DOH data to get it to align.
    doh.index = pandas.to_datetime(doh.index) + datetime.timedelta(days=1)

    without_nystate = pivot[[c for c in pivot.columns if not c.startswith("New York")]]

    # Only keep dates covered by both sources, i.e. NYT data up to the date
    # for which we have NY state data and vice versa.
    last_date = min(without_nystate.index.max(), doh.index.max())
    doh = doh[doh.index <= last_date]
    without_nystate = without_nystate[without_nystate.index <= last_date]
    assert (
        doh.index.max() == without_nystate.index.max()
    ), "NYT and DOH data don't matchup yet!"

    # An outer join keeps any date present in only one source; the resulting
    # gaps are filled with 0.
    return without_nystate.merge(
        doh, left_index=True, right_index=True, how="outer"
    ).fillna(0)