jobs/desktop-mobile-mau-2020/desktop_mau/desktop_mau_dau.py (481 lines of code) (raw):

from datetime import timedelta from pathlib import Path import click import matplotlib import numpy as np import pandas as pd from google.cloud import bigquery, bigquery_storage_v1beta1, storage from matplotlib import rcParams from matplotlib import dates as mdates from matplotlib import pyplot as plt pd.options.mode.chained_assignment = None rcParams.update( { "figure.autolayout": True, "figure.dpi": 100, } ) GCS_PREFIX = "desktop-mau-dau-tier1-2020" ROOT_DIR = Path(__file__).parent STATIC_DIR = ROOT_DIR / "static" IMG_DIR = STATIC_DIR / "img" DESKTOP_QUERY = (ROOT_DIR / "mau_dau.sql").read_text() DESKTOP_USER_STATE_QUERY = """ SELECT * FROM `moz-fx-data-derived-datasets.analysis.xluo_kpi_plot_country_new_or_rescurrected_dau` """ def plot_year_over_year(full_dat, country): """Save a plot for MAU and DAU (7d MA) at country level.""" dat = full_dat.loc[full_dat.country == country] plt.style.use("seaborn-white") fig, axs = plt.subplots(nrows=1, ncols=4, figsize=(20, 4), sharex="col") metric_list = ["MAU", "mau_pcnt_Jan01", "DAU_MA7d", "dau_pcnt_Jan01"] for num, metric in enumerate(metric_list): axs[num].set(ylim=(dat[metric].min() * 0.95, dat[metric].max() * 1.05)) for year in range(2017, 2021): axs[num].plot( dat.loc[dat["year"] == year, "fakedate"], dat.loc[dat["year"] == year, metric], label=str(year), linestyle="solid", ) axs[num].xaxis.set_major_formatter(mdates.DateFormatter("%b %d")) if num % 2 == 0: axs[num].yaxis.set_major_formatter( matplotlib.ticker.FuncFormatter( lambda x, pos: "{:,.0f}".format(x / 1000000) + "MM" ) ) else: axs[num].yaxis.set_major_formatter( matplotlib.ticker.FuncFormatter( lambda x, pos: "{:,.0f}".format(x * 100) + "%" ) ) axs[num].legend( ("2017", "2018", "2019", "2020"), title=country + " " + metric, bbox_to_anchor=(0.4, 0.8, 0.6, 0.2), loc="upper left", ncol=4, mode="expand", fontsize=9, ) fig.tight_layout() axs[0].set_title( "Year over Year MAU & DAU(7d MA) in {}".format(country), loc="left", fontsize=18 ) filename = f"desktop_{country}_mau_dau.jpeg" plt.savefig(IMG_DIR / filename) plt.close(fig) return filename def plot_dau_mau_contribution_individual_country(full_dat, country): """ Save a plot with subplots showing the contribution of DAU/MAU w.r.t. global """ new_dat = ( pd.merge( full_dat[full_dat["country"] == country], full_dat[full_dat["country"] == "Global"], on=["date"], ) .rename( columns={ "MAU_x": "MAU", "DAU_x": "DAU", "MAU_y": "MAU_global", "DAU_y": "DAU_global", } ) .sort_values(by="date") ) new_dat["DAU_global_7dMA"] = new_dat["DAU_global"].rolling(window=7).mean() new_dat["DAU_7dMA"] = new_dat["DAU"].rolling(window=7).mean() new_dat["pcnt_MAU"] = new_dat["MAU"] / new_dat["MAU_global"] new_dat["pcnt_DAU"] = new_dat["DAU_7dMA"] / new_dat["DAU_global_7dMA"] plt.style.use("seaborn-white") fig = plt.figure(figsize=(12, 6)) ax = plt.axes(label=f"desktop_{country}_mau_dau_ratio") ax.set(xlim=(pd.to_datetime("20190101"), new_dat["date"].max())) ax.plot( new_dat["date"], new_dat["pcnt_MAU"], color="#0000ff", linestyle="solid", label="MAU", ) ax.plot( new_dat["date"], new_dat["pcnt_DAU"], color="gray", linestyle="dashdot", label="DAU_7dMA", ) plt.title( label="Contribution of MAU/DAU from {} as of {}".format( country, new_dat["date"].max().strftime("%Y-%m-%d") ), loc="left", fontdict={"fontsize": 20, "color": "black"}, ) plt.xlabel("Date", fontsize=16) ax.xaxis.set_major_locator(matplotlib.dates.YearLocator()) ax.xaxis.set_minor_locator( matplotlib.dates.MonthLocator((1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)) ) ax.xaxis.set_major_formatter(matplotlib.dates.DateFormatter("\n%Y")) ax.xaxis.set_minor_formatter(matplotlib.dates.DateFormatter("%b %d")) ax.yaxis.set_major_formatter( matplotlib.ticker.FuncFormatter(lambda x, pos: "{:,.0f}".format(x * 100) + "%") ) plt.setp(ax.get_xticklabels(), rotation=0, ha="center", fontsize=14) plt.setp(ax.get_yticklabels(), rotation=0, ha="right", fontsize=12) ax.legend( bbox_to_anchor=(0.9, 0.9), loc=3, ncol=2, mode="expand", borderaxespad=0.0, fontsize=10, ) ax.grid(which="major", linestyle="-", linewidth="0.5", color="lightgray") filename = f"desktop_{country}_mau_dau_ratio.jpeg" plt.savefig(IMG_DIR / filename) plt.close(fig) return filename def plot_group_contribution(full_data, country_list, cat, metric): full_data = full_data[["date", "country", metric]] if metric == "MAU": dat = pd.merge( full_data[full_data["country"].isin(country_list)], full_data[full_data["country"] == "Global"], on=["date"], ).sort_values(by=["country_x", "date"]) dat["pcnt"] = dat["MAU_x"] / dat["MAU_y"] if metric == "DAU": dat = pd.merge( full_data[full_data["country"].isin(country_list)], full_data[full_data["country"] == "Global"], on=["date"], ).sort_values(by=["country_x", "date"]) dat["DAU_global_7dMA"] = dat["DAU_y"].rolling(window=7).mean() dat["DAU_7dMA"] = dat["DAU_x"].rolling(window=7).mean() dat["pcnt"] = dat["DAU_7dMA"] / dat["DAU_global_7dMA"] plt.style.use("seaborn-white") fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(20, 5), sharex="col") dat = dat[dat["date"] >= pd.to_datetime("20190101")] # individual for country in country_list: axs[0].plot( dat[dat["country_x"] == country]["date"], dat[dat["country_x"] == country]["pcnt"], label=str(country), linestyle="solid", ) axs[0].set(ylim=(0, 0.2)) axs[0].xaxis.set_major_formatter(mdates.DateFormatter("%Y-%b")) axs[0].yaxis.set_major_formatter( matplotlib.ticker.FuncFormatter(lambda x, pos: "{:,.0f}".format(x * 100) + "%") ) axs[0].legend( country_list, bbox_to_anchor=(0.9, 0.8, 0.2, 0.2), loc="upper left", ncol=1, mode="expand", fontsize=10, ) # stacked dat2 = dat[dat["country_x"].isin(country_list)][ ["date", "country_x", "pcnt"] ].pivot(index="date", columns="country_x", values="pcnt") dat2 = pd.DataFrame(data=dat2) y = np.vstack([dat2[x] for x in country_list]) axs[1].stackplot(dat2.index, y, labels=country_list, alpha=0.5) # axs[1].set(ylim=(0, 0.4)) axs[1].xaxis.set_major_formatter(mdates.DateFormatter("%Y-%b")) axs[1].yaxis.set_major_formatter( matplotlib.ticker.FuncFormatter(lambda x, pos: "{:,.0f}".format(x * 100) + "%") ) axs[1].legend( country_list, bbox_to_anchor=(0.9, 0.8, 0.2, 0.2), loc="upper left", ncol=1, mode="expand", fontsize=10, ) axs[1].grid(which="major", linestyle="-", linewidth="0.5", color="lightgray") axs[0].set_title( "Contribution of {} from Tier1 as of {} (non-stacked/stacked)".format( metric, dat["date"].max().strftime("%Y-%m-%d") ), loc="left", fontdict={"fontsize": 18, "color": "black"}, ) filename = f"desktop_{metric}_{cat}_contribution.jpeg" plt.savefig(IMG_DIR / filename) plt.close(fig) return filename def plot_dau_user_state_individual_country(full_dat, country): """ Save a plot with subplots showing the contribution of each user state to DAU Data is for this specific country only """ full_dat["dau"] = ( full_dat["dau_regular"] + full_dat["dau_new_or_resurrected"] + full_dat["dau_other"] ) dat = ( full_dat[["date", "dau", "dau_regular", "dau_new_or_resurrected", "dau_other"]] .melt(id_vars=["date", "dau"]) .rename(columns={"variable": "user_state"}) .sort_values("date") ) dat["user_state"] = dat["user_state"].str.replace("dau_", "") dat["pcnt"] = dat["value"] / dat["dau"] plt.style.use("seaborn-white") fig, axs = plt.subplots(nrows=1, ncols=4, figsize=(18, 4), sharex="col") dat = dat[dat["date"] >= dat["date"].max() + timedelta(-182)] cat_list = ["new_or_resurrected", "regular", "other"] # individual for cat in cat_list: axs[0].plot( dat[dat["user_state"] == cat]["date"], dat[dat["user_state"] == cat]["pcnt"], label=str(cat), linestyle="solid", ) axs[0].set(ylim=(0, 1)) axs[0].xaxis.set_major_formatter(mdates.DateFormatter("%y-%b")) axs[0].yaxis.set_major_formatter( matplotlib.ticker.FuncFormatter(lambda x, pos: "{:,.0f}".format(x * 100) + "%") ) axs[0].legend( cat_list, bbox_to_anchor=(0.5, 0.8, 0.2, 0.2), loc="upper left", ncol=1, mode="expand", fontsize=10, ) # stacked dat2 = dat[dat["user_state"].isin(cat_list)][["date", "user_state", "pcnt"]].pivot( index="date", columns="user_state", values="pcnt" ) dat2 = pd.DataFrame(data=dat2) y = np.vstack([dat2[x] for x in cat_list]) axs[1].stackplot(dat2.index, y, labels=cat_list, alpha=0.5) # axs[1].set(ylim=(0, 0.4)) axs[1].xaxis.set_major_formatter(mdates.DateFormatter("%y-%b")) axs[1].yaxis.set_major_formatter( matplotlib.ticker.FuncFormatter(lambda x, pos: "{:,.0f}".format(x * 100) + "%") ) # New only, as it"s small cat = "new_or_resurrected" axs[2].plot( dat[dat["user_state"] == cat]["date"], dat[dat["user_state"] == cat]["pcnt"], label=str(cat), linestyle="solid", ) axs[2].xaxis.set_major_formatter(mdates.DateFormatter("%y-%b")) axs[2].yaxis.set_major_formatter( matplotlib.ticker.FuncFormatter(lambda x, pos: "{:,.0f}".format(x * 100) + "%") ) axs[2].legend( bbox_to_anchor=(0.5, 0.8, 0.2, 0.2), loc="upper left", ncol=1, mode="expand", fontsize=10, ) # Regular only cat = "regular" axs[3].plot( dat[dat["user_state"] == cat]["date"], dat[dat["user_state"] == cat]["pcnt"], label=str(cat), linestyle="solid", color="orange", ) axs[3].xaxis.set_major_formatter(mdates.DateFormatter("%y-%b")) axs[3].yaxis.set_major_formatter( matplotlib.ticker.FuncFormatter(lambda x, pos: "{:,.0f}".format(x * 100) + "%") ) axs[3].legend( bbox_to_anchor=(0.6, 0.1, 0.2, 0.2), loc="upper left", ncol=1, mode="expand", fontsize=10, ) axs[0].set_title( "DAU per User State (individual/stacked) from {}".format(country), loc="left", fontdict={"fontsize": 18, "color": "black"}, ) filename = f"desktop_{country}_user_state_dau_contribution.jpeg" plt.savefig(IMG_DIR / filename) plt.close(fig) return filename def plot_dau_mau_ratio(desktop_data): ind_dat = pd.merge( desktop_data, desktop_data[desktop_data["country"] == "Global"], on=["date"] ).sort_values(by=["country_x", "date"]) ind_dat["DAU_global_7dMA"] = ind_dat["DAU_y"].rolling(window=7).mean() ind_dat["DAU_7dMA"] = ind_dat["DAU_x"].rolling(window=7).mean() ind_dat["pcnt_MAU"] = ind_dat["MAU_x"] / ind_dat["MAU_y"] ind_dat["pcnt_DAU"] = ind_dat["DAU_7dMA"] / ind_dat["DAU_global_7dMA"] ind_dat = ind_dat[ind_dat["date"] >= pd.to_datetime("20190101")] plt.style.use("seaborn-white") fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(20, 5), sharex="col") country_list = ["US", "CA", "DE", "FR", "GB"] metric_list = ["MAU", "DAU"] for i, metric in enumerate(metric_list): for country in country_list: axs[i].plot( ind_dat[ind_dat["country_x"] == country]["date"], ind_dat[ind_dat["country_x"] == country]["pcnt_" + metric], label=str(metric), linestyle="solid", ) axs[i].set(ylim=(0, 0.20)) axs[i].xaxis.set_major_formatter(mdates.DateFormatter("%Y-%b")) axs[i].yaxis.set_major_formatter( matplotlib.ticker.FuncFormatter( lambda x, pos: "{:,.0f}".format(x * 100) + "%" ) ) axs[i].legend( country_list, bbox_to_anchor=(0.9, 0.8, 0.2, 0.2), loc="upper left", ncol=1, mode="expand", fontsize=10, ) axs[i].grid(which="major", linestyle="-", linewidth="0.5", color="lightgray") axs[0].set_title( "Contribution of MAU/DAU per Tier1 country as of " + ind_dat["date"].max().strftime("%Y-%m-%d"), loc="left", fontdict={"fontsize": 18, "color": "black"}, ) filename_1 = "desktop_mau_dau_ratio.jpeg" plt.savefig(IMG_DIR / filename_1) plt.close(fig) ind_dat = pd.merge( desktop_data[desktop_data["country"].isin(["US", "CA", "DE", "FR", "GB"])], desktop_data[desktop_data["country"] == "Global"], on=["date"], ) ind_dat["pcnt_MAU"] = ind_dat["MAU_x"] / ind_dat["MAU_y"] ind_dat = ind_dat[ind_dat["date"] >= pd.to_datetime("20190101")] plt.style.use("seaborn-white") fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(20, 5), sharex="col") metric = "MAU" country_list = ["US", "CA", "DE", "FR", "GB"] for country in country_list: axs[0].plot( ind_dat[ind_dat["country_x"] == country]["date"], ind_dat[ind_dat["country_x"] == country]["pcnt_" + metric], label=str(metric), linestyle="solid", ) axs[0].set(ylim=(0, 0.20)) axs[0].xaxis.set_major_formatter(mdates.DateFormatter("%Y-%b")) axs[0].yaxis.set_major_formatter( matplotlib.ticker.FuncFormatter(lambda x, pos: "{:,.0f}".format(x * 100) + "%") ) axs[0].legend( country_list, bbox_to_anchor=(0.9, 0.8, 0.2, 0.2), loc="upper left", ncol=1, mode="expand", fontsize=10, ) axs[0].grid(which="major", linestyle="-", linewidth="0.5", color="lightgray") axs[0].set_title( "Contribution of MAU/DAU per Tier1 country as of " + ind_dat["date"].max().strftime("%Y-%m-%d"), loc="left", fontdict={"fontsize": 18, "color": "black"}, ) # axs[1].plot.area([ind_dat["country_x"]\ # .isin(country_list)]["date", "country_x", "pcnt"], subplots=True) dat2 = ind_dat[ind_dat["country_x"].isin(country_list)][ ["date", "country_x", "pcnt_MAU"] ].pivot(index="date", columns="country_x", values="pcnt_MAU") dat2 = pd.DataFrame(data=dat2) y = np.vstack([dat2[x] for x in country_list]) axs[1].stackplot(dat2.index, y, labels=country_list, alpha=0.5) # axs[1].set(ylim=(0, 0.4)) axs[1].xaxis.set_major_formatter(mdates.DateFormatter("%Y-%b")) axs[1].yaxis.set_major_formatter( matplotlib.ticker.FuncFormatter(lambda x, pos: "{:,.0f}".format(x * 100) + "%") ) axs[1].legend( country_list, bbox_to_anchor=(0.9, 0.8, 0.2, 0.2), loc="upper left", ncol=1, mode="expand", fontsize=10, ) axs[1].grid(which="major", linestyle="-", linewidth="0.5", color="lightgray") filename_2 = "desktop_mau_dau_ratio_2.jpeg" plt.savefig(IMG_DIR / filename_2) plt.close(fig) return filename_1, filename_2 def fetch_data(project): bq_client = bigquery.Client(project=project) bq_storage_client = bigquery_storage_v1beta1.BigQueryStorageClient() IMG_DIR.mkdir(exist_ok=True) desktop_data = ( bq_client.query(DESKTOP_QUERY) .result() .to_dataframe(bqstorage_client=bq_storage_client) ) desktop_user_state_data = ( bq_client.query(DESKTOP_USER_STATE_QUERY) .result() .to_dataframe(bqstorage_client=bq_storage_client) ) desktop_data = pd.merge( desktop_data, desktop_user_state_data, on=["date", "country"] ) desktop_data["year"] = pd.DatetimeIndex(desktop_data["date"]).year desktop_data["doy"] = pd.DatetimeIndex(desktop_data["date"]).dayofyear desktop_data["fakedate"] = [ pd.to_datetime("20170101") + timedelta(days=x) for x in desktop_data["doy"] ] desktop_data["DAU_MA7d"] = desktop_data.groupby("country")["DAU"].transform( lambda x: x.rolling(window=7).mean() ) desktop_data["MAU_base"] = desktop_data.groupby(["country", "year"]).MAU.transform( "first" ) desktop_data["DAU_MA7d_base"] = desktop_data.groupby( ["country", "year"] ).DAU_MA7d.transform("first") desktop_data["dau_pcnt_Jan01"] = ( desktop_data["DAU_MA7d"] / desktop_data["DAU_MA7d_base"] ) desktop_data["mau_pcnt_Jan01"] = desktop_data["MAU"] / desktop_data["MAU_base"] return desktop_data def generate_plots(project): desktop_data = fetch_data(project) tier1 = ["US", "CA", "DE", "FR", "GB"] top11 = ["US", "CA", "DE", "FR", "GB", "CN", "IN", "ID", "BR", "RU", "PL"] country_groups = ["Global", "Tier1", "RoW"] for country in top11 + country_groups: # desktop_{country}_mau_dau.jpeg filename = plot_year_over_year(desktop_data, country) print(f"Created {filename}") # desktop_{country}_mau_dau_ratio.jpeg filename = plot_dau_mau_contribution_individual_country(desktop_data, country) print(f"Created {filename}") # desktop_MAU_tier1_contribution.jpeg plot_group_contribution(desktop_data, tier1, "Tier1", "MAU") # desktop_DAU_tier1_contribution.jpeg plot_group_contribution(desktop_data, tier1, "Tier1", "DAU") # desktop_MAU_top11_contribution.jpeg plot_group_contribution(desktop_data, top11, "Top11", "MAU") # desktop_DAU_top11_contribution.jpeg plot_group_contribution(desktop_data, top11, "Top11", "DAU") for country in top11: # desktop_{country}_user_state_dau_contribution.jpeg plot_dau_user_state_individual_country( desktop_data[desktop_data["country"] == country], country ) # desktop_mau_dau_ratio.jpeg # desktop_mau_dau_ratio_2.jpeg plot_dau_mau_ratio(desktop_data) def upload_files(project, bucket_name): storage_client = storage.Client(project=project) bucket = storage_client.bucket(bucket_name=bucket_name) for pathname in STATIC_DIR.rglob("*"): if pathname.is_file(): blob = bucket.blob(str(Path(GCS_PREFIX) / pathname.relative_to(STATIC_DIR))) blob.upload_from_filename(pathname) @click.command() @click.option("--project", help="GCP project id", required=True) @click.option("--bucket-name", help="GCP bucket name") def main(project, bucket_name): generate_plots(project) if bucket_name is not None: upload_files(project, bucket_name) if __name__ == "__main__": main()