def main()

in bigquery_etl/shredder/cost.py [0:0]


def main():
    """Report estimated cost to run shredder."""
    args = parser.parse_args()
    flat_rate_slots = args.slots
    runs_per_year = 365 / args.days

    # translate days per run to seconds per run
    seconds_per_run = args.days * seconds_per_day

    # determine how fast shredder is running
    warnings.filterwarnings("ignore", module="google.auth._default")
    client = bigquery.Client()
    jobs = list(client.query(JOBS_QUERY).result())

    table_ids = {sql_table_id(target) for target in DELETE_TARGETS}
    tables = [
        table
        for table in client.query(TABLES_QUERY).result()
        if sql_table_id(table) in table_ids
    ]
    flat_rate_tables = [table for table in tables if table.table_id != "main_v4"]
    on_demand_tables = [table for table in tables if table.table_id == "main_v4"]

    # estimate how much it costs to process flat-rate tables
    flat_rate_speeds_and_bytes = [
        (get_bytes_per_second(jobs, flat_rate_slots, table), table.num_bytes)
        for table in flat_rate_tables
    ]
    # mean weighted by table.num_bytes for tables with jobs
    mean_bytes_per_second = mean(flat_rate_speeds_and_bytes)
    # calculate using mean_bytes_per_second for tables with no jobs
    slots_needed = sum(
        num_bytes
        / seconds_per_run
        / (table_bytes_per_second or mean_bytes_per_second)
        * flat_rate_slots
        for table_bytes_per_second, num_bytes in flat_rate_speeds_and_bytes
    )
    # slots are reserved in increments of 500
    slots_reserved = ceil(slots_needed / 500) * 500
    flat_rate_cost = slots_reserved * flat_rate_dollars_per_year_per_slot
    flat_rate_num_bytes = sum(table.num_bytes for table in flat_rate_tables)

    # estimate how much it costs to process on-demand queries
    on_demand_num_bytes = sum(table.num_bytes for table in on_demand_tables)
    on_demand_cost = on_demand_num_bytes * runs_per_year / on_demand_bytes_per_dollar

    query_cost = flat_rate_cost + on_demand_cost

    # estimate how much it increases costs to store data
    total_num_bytes = on_demand_num_bytes + flat_rate_num_bytes
    added_storage_cost_per_gigabyte_per_year = 0.01 * 12  # $0.01 per GiB
    bytes_per_gigabyte = 2**30
    storage_cost = (
        total_num_bytes * added_storage_cost_per_gigabyte_per_year / bytes_per_gigabyte
    )

    # estimate how much cost will grow due to retention policies
    # days since the start of stable tables
    today = datetime.utcnow().date()
    yesterday = today - timedelta(days=1)
    start_of_stable = datetime(2018, 10, 30).date()
    days_in_tables = (today - start_of_stable).days
    # normalize 25 month retention to 2 years 1 month with 28 day "months"
    days_in_retention = 28 * (13 * 2 + 1)
    # estimate how much data will grow before retention kicks in
    data_growth = 0
    if days_in_tables < days_in_retention:
        data_growth = days_in_retention / days_in_tables - 1
    # translate data_growth to cost
    on_demand_cost_growth = on_demand_cost * data_growth
    storage_cost_growth = storage_cost * data_growth
    total_cost_growth = on_demand_cost_growth + storage_cost_growth
    slots_reserved_growth = (
        ceil(slots_needed * (1 + data_growth) / 500) * 500 - slots_reserved
    )
    flat_rate_growth_msg = (
        "$0 is is flat-rate query cost because existing "
        "rounded up query capacity should suffice."
    )
    if slots_reserved_growth > 0:
        flat_rate_cost_growth = (
            slots_reserved_growth * flat_rate_dollars_per_year_per_slot
        )
        flat_rate_growth_msg = (
            f"${flat_rate_cost_growth:,.0f} is flat-rate query cost for "
            f"{slots_reserved_growth} additional reserved slots"
        )
        total_cost_growth += flat_rate_cost_growth

    total_cost = query_cost + storage_cost + total_cost_growth

    print(
        dedent(
            f"""
            Cost
            ===

            As of {today}, the cost of this project is estimated to be
            ${total_cost:,.0f}/year. Of this, ${query_cost:,.0f} is direct cost of
            scanning and deleting data, ${storage_cost:,.0f} is increased storage cost
            due to active (vs. long-term) storage pricing, and
            ${total_cost_growth:,.0f} is predicted cost increases as more data is
            accumulated and stored.

            On-demand queries
            ---

            telemetry_stable.main_v4 contains {on_demand_num_bytes/2**50:.3f} PiB of
            data, which is scanned by queries using on-demand pricing at $5 per TiB¹
            every {args.days} days, costing ${on_demand_cost:,.0f}/year.

            Flat-rate queries
            ---

            The rest of the tables impacted contain {flat_rate_num_bytes/2**50:.3f} PiB
            of data, which is scanned by queries using flat-rate pricing at $8,500 per
            500 reserved slots¹, and requires approximately {slots_needed:.0f} slots in
            order to complete every {args.days} days, rounded up to {slots_reserved}
            slots, costing ${flat_rate_cost:,.0f}/year.

            Active storage
            ---

            Combined all of the tables contain {total_num_bytes/2**50:.3f} PiB of data,
            which is stored using active storage pricing at $0.02 per GiB per month¹
            and would otherwise be stored using long-term storage pricing at $0.01 per
            GiB per month¹, thus increasing storage cost by ${storage_cost:,.0f}/year.

            Data volume growth
            ---

            The above costs total ${query_cost+storage_cost:,.0f}/year, but that does
            not account for table sizes increasing over time. If all tables had a
            retention policy of 25 months, as of {today} held data from
            {start_of_stable} through {yesterday}, and had a stable daily data volume,
            then total data volume would increase by {100*data_growth:.0f}% before any
            data were dropped due to retention policies.

            Assuming a linear impact on queries, this would increase cost by
            ${total_cost_growth:,.0f}/year. Of this, ${storage_cost_growth:,.0f} is
            storage cost, ${on_demand_cost_growth:,.0f} is on-demand query cost, and
            {flat_rate_growth_msg}.


            ¹ https://cloud.google.com/bigquery/pricing#pricing_summary (this doc
            currently uses the annual $8500 per 500 slots flat-rate query pricing
            rather than the monthly $10000 per 500 slots)
            """
        )
    )