in bigquery_etl/shredder/cost.py [0:0]
def main():
"""Report estimated cost to run shredder."""
args = parser.parse_args()
flat_rate_slots = args.slots
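    # shredder processes every target table once per run of args.days days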
runs_per_year = 365 / args.days
# translate days per run to seconds per run
seconds_per_run = args.days * seconds_per_day
# determine how fast shredder is running
warnings.filterwarnings("ignore", module="google.auth._default")
client = bigquery.Client()
jobs = list(client.query(JOBS_QUERY).result())
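    # limit the estimate to tables that shredder targets for deletion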
table_ids = {sql_table_id(target) for target in DELETE_TARGETS}
tables = [
table
for table in client.query(TABLES_QUERY).result()
if sql_table_id(table) in table_ids
]
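    # main_v4 is billed with on-demand pricing;
    # the remaining targets are scanned using the flat-rate slot reservation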
flat_rate_tables = [table for table in tables if table.table_id != "main_v4"]
on_demand_tables = [table for table in tables if table.table_id == "main_v4"]
# estimate how much it costs to process flat-rate tables
flat_rate_speeds_and_bytes = [
(get_bytes_per_second(jobs, flat_rate_slots, table), table.num_bytes)
for table in flat_rate_tables
]
    # mean weighted by table.num_bytes for tables with jobs
    with_jobs = [(speed, size) for speed, size in flat_rate_speeds_and_bytes if speed]
    weighted_sum = sum(speed * size for speed, size in with_jobs)
    mean_bytes_per_second = weighted_sum / sum(size for speed, size in with_jobs)
# calculate using mean_bytes_per_second for tables with no jobs
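    # required throughput (bytes / seconds per run) divided by the observed
    # throughput at flat_rate_slots, scaled back up to a slot count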
slots_needed = sum(
num_bytes
/ seconds_per_run
/ (table_bytes_per_second or mean_bytes_per_second)
* flat_rate_slots
for table_bytes_per_second, num_bytes in flat_rate_speeds_and_bytes
)
# slots are reserved in increments of 500
slots_reserved = ceil(slots_needed / 500) * 500
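    # flat-rate reservations are billed per slot per year, regardless of bytes scanned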
flat_rate_cost = slots_reserved * flat_rate_dollars_per_year_per_slot
flat_rate_num_bytes = sum(table.num_bytes for table in flat_rate_tables)
# estimate how much it costs to process on-demand queries
on_demand_num_bytes = sum(table.num_bytes for table in on_demand_tables)
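    # on-demand pricing is billed per byte scanned, and every run scans the full table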
on_demand_cost = on_demand_num_bytes * runs_per_year / on_demand_bytes_per_dollar
query_cost = flat_rate_cost + on_demand_cost
# estimate how much it increases costs to store data
total_num_bytes = on_demand_num_bytes + flat_rate_num_bytes
    # active ($0.02) minus long-term ($0.01) storage pricing, per GiB per month
    added_storage_cost_per_gigabyte_per_year = 0.01 * 12
bytes_per_gigabyte = 2**30
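    # shredder's deletes keep these tables at the active storage rate rather than
    # the cheaper long-term rate; this estimates that premium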
storage_cost = (
total_num_bytes * added_storage_cost_per_gigabyte_per_year / bytes_per_gigabyte
)
# estimate how much cost will grow due to retention policies
# days since the start of stable tables
today = datetime.utcnow().date()
yesterday = today - timedelta(days=1)
start_of_stable = datetime(2018, 10, 30).date()
days_in_tables = (today - start_of_stable).days
    # normalize 25-month retention to 2 years + 1 month, using 28-day "months"
    # (13 such months per year)
days_in_retention = 28 * (13 * 2 + 1)
# estimate how much data will grow before retention kicks in
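    # e.g. 600 days of accumulated data with a 756-day retention window would
    # still grow by 756 / 600 - 1 = 26% before retention starts dropping data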
data_growth = 0
if days_in_tables < days_in_retention:
data_growth = days_in_retention / days_in_tables - 1
# translate data_growth to cost
on_demand_cost_growth = on_demand_cost * data_growth
storage_cost_growth = storage_cost * data_growth
total_cost_growth = on_demand_cost_growth + storage_cost_growth
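    # additional 500-slot increments needed once data volume has grown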
slots_reserved_growth = (
ceil(slots_needed * (1 + data_growth) / 500) * 500 - slots_reserved
)
    flat_rate_growth_msg = (
        "$0 is flat-rate query cost because the existing "
        "rounded-up query capacity should suffice"
    )
if slots_reserved_growth > 0:
flat_rate_cost_growth = (
slots_reserved_growth * flat_rate_dollars_per_year_per_slot
)
flat_rate_growth_msg = (
f"${flat_rate_cost_growth:,.0f} is flat-rate query cost for "
f"{slots_reserved_growth} additional reserved slots"
)
total_cost_growth += flat_rate_cost_growth
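    # total: direct query cost + added storage cost + predicted growth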
total_cost = query_cost + storage_cost + total_cost_growth
print(
dedent(
f"""
Cost
===
As of {today}, the cost of this project is estimated to be
${total_cost:,.0f}/year. Of this, ${query_cost:,.0f} is direct cost of
scanning and deleting data, ${storage_cost:,.0f} is increased storage cost
due to active (vs. long-term) storage pricing, and
            ${total_cost_growth:,.0f} is the predicted cost increase as more data is
accumulated and stored.
On-demand queries
---
telemetry_stable.main_v4 contains {on_demand_num_bytes/2**50:.3f} PiB of
data, which is scanned by queries using on-demand pricing at $5 per TiB¹
every {args.days} days, costing ${on_demand_cost:,.0f}/year.
Flat-rate queries
---
The rest of the tables impacted contain {flat_rate_num_bytes/2**50:.3f} PiB
of data, which is scanned by queries using flat-rate pricing at $8,500 per
500 reserved slots¹, and requires approximately {slots_needed:.0f} slots in
order to complete every {args.days} days, rounded up to {slots_reserved}
slots, costing ${flat_rate_cost:,.0f}/year.
Active storage
---
            Combined, all of the tables contain {total_num_bytes/2**50:.3f} PiB of data,
which is stored using active storage pricing at $0.02 per GiB per month¹
and would otherwise be stored using long-term storage pricing at $0.01 per
GiB per month¹, thus increasing storage cost by ${storage_cost:,.0f}/year.
Data volume growth
---
The above costs total ${query_cost+storage_cost:,.0f}/year, but that does
not account for table sizes increasing over time. If all tables had a
retention policy of 25 months, as of {today} held data from
{start_of_stable} through {yesterday}, and had a stable daily data volume,
then total data volume would increase by {100*data_growth:.0f}% before any
data were dropped due to retention policies.
Assuming a linear impact on queries, this would increase cost by
${total_cost_growth:,.0f}/year. Of this, ${storage_cost_growth:,.0f} is
storage cost, ${on_demand_cost_growth:,.0f} is on-demand query cost, and
{flat_rate_growth_msg}.
¹ https://cloud.google.com/bigquery/pricing#pricing_summary (this doc
currently uses the annual $8500 per 500 slots flat-rate query pricing
rather than the monthly $10000 per 500 slots)
"""
)
)