contrib/scrape-ec2-prices.py

#!/usr/bin/env python # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. import os import re import copy import json import time import atexit from collections import OrderedDict, defaultdict import tqdm # pylint: disable=import-error import ijson # pylint: disable=import-error import requests # Buffer size for ijson.parse() function. Larger buffer size results in increased memory # consumption, but faster parsing. 
IJSON_BUF_SIZE = 30 * 65536

# same URL as the one used by scrape-ec2-sizes.py, now it has official data on pricing
URL = "https://pricing.us-east-1.amazonaws.com/offers/v1.0/aws/AmazonEC2/current/index.json"

RE_NUMERIC_OTHER = re.compile(r"(?:([0-9]+)|([-A-Z_a-z]+)|([^-0-9A-Z_a-z]+))")

BASE_PATH = os.path.dirname(os.path.abspath(__file__))
PRICING_FILE_PATH = os.path.join(BASE_PATH, "../libcloud/data/pricing.json")
PRICING_FILE_PATH = os.path.abspath(PRICING_FILE_PATH)

# Location of the locally cached offer file, overridable via the TMP_JSON environment variable
FILEPATH = os.environ.get("TMP_JSON", "/tmp/ec.json")

INSTANCE_SIZES = [
    "micro",
    "small",
    "medium",
    "large",
    "xlarge",
    "x-large",
    "extra-large",
]


def _print_cached_file_notice():
    """
    Print which cached file is being used together with its modification time (UTC).
    """
    # %M is minutes; the previous "%H:%I:%S" format printed the 12-hour hour in the minutes slot
    mtime_str = time.strftime("%Y-%m-%d %H:%M:%S UTC", time.gmtime(os.path.getmtime(FILEPATH)))
    print("Using data from existing cached file {} (mtime={})".format(FILEPATH, mtime_str))


def download_json():
    """
    Make sure the offer file exists at ``FILEPATH``, downloading it from ``URL`` if needed.

    :return: Tuple ``(path, from_cache)`` where ``path`` is ``FILEPATH`` and ``from_cache``
             is ``True`` when an already existing file was used and ``False`` when the file
             has just been downloaded.
    """
    if os.path.isfile(FILEPATH):
        _print_cached_file_notice()
        # Return the path (not an open handle) so both branches return the same type and
        # no file descriptor is leaked.
        return FILEPATH, True

    def remove_partial_cached_file():
        if os.path.isfile(FILEPATH):
            os.remove(FILEPATH)

    # File not cached locally, download data and cache it
    with requests.get(URL, stream=True) as response:
        # Fail before anything is written so an HTTP error body is never cached as pricing data
        response.raise_for_status()

        # If the process exits before the download completes, drop the incomplete file so
        # it is not mistaken for a valid cache on the next run.
        atexit.register(remove_partial_cached_file)

        total_size_in_bytes = int(response.headers.get("content-length", 0))
        progress_bar = tqdm.tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)
        chunk_size = 10 * 1024 * 1024

        try:
            with open(FILEPATH, "wb") as fp:
                # NOTE: The data is streamed with response.iter_content using a large chunk
                # size so that the progress bar can be advanced after every chunk.
                for chunk_data in response.iter_content(chunk_size):
                    progress_bar.update(len(chunk_data))
                    fp.write(chunk_data)
        finally:
            progress_bar.close()

        # Download finished, the file is complete and must be kept
        atexit.unregister(remove_partial_cached_file)

    return FILEPATH, False


def get_json():
    """
    Return the path to the local offer file, downloading it first when it is not cached.

    :return: Tuple ``(path, from_cache)``, see :func:`download_json`.
    """
    # download_json() already short-circuits (and reports the mtime) when the file exists
    return download_json()


# Prices and sizes are in different dicts and categorized by sku
def get_all_prices():
    """
    Stream the "terms.OnDemand" section of the offer file and collect the price of each sku.

    :return: Dictionary ``{sku: {"price": str, "unit": str}}``. A sku listed under
             "terms.OnDemand" for which no matching price dimension was seen maps to an
             empty dictionary.
    """
    # return variable
    # prices = {sku : {price: string, unit: string}}
    prices = {}
    current_sku = ""
    current_rate_code = ""
    amazonEC2_offer_code = "JRTCKXETXF"

    def build_prefixes(sku, rate_code):
        # ijson joins nested keys with "."; the offer term key itself is "<sku>.<offer code>",
        # which is why the sku shows up twice in the prefix.
        dimensions = f"terms.OnDemand.{sku}.{sku}.{amazonEC2_offer_code}.priceDimensions"
        return (
            dimensions,
            f"{dimensions}.{rate_code}.unit",
            f"{dimensions}.{rate_code}.pricePerUnit.USD",
        )

    # The prefixes only change when the sku or the rate code changes, so they are rebuilt on
    # those events instead of on every single parser event.
    dimensions_prefix, unit_prefix, price_prefix = build_prefixes(current_sku, current_rate_code)

    json_file, _ = get_json()
    # ijson expects a file object opened in binary mode
    with open(json_file, "rb") as f:
        print("Starting to parse pricing data, this could take up to 15 minutes...")
        parser = ijson.parse(f, buf_size=IJSON_BUF_SIZE)
        # use parser because file is very large
        for prefix, event, value in tqdm.tqdm(parser):
            if "products" in prefix:
                continue

            if event == "map_key":
                if prefix == "terms.OnDemand":
                    current_sku = value
                    prices[current_sku] = {}
                elif prefix == dimensions_prefix:
                    current_rate_code = value
                else:
                    continue
                dimensions_prefix, unit_prefix, price_prefix = build_prefixes(
                    current_sku, current_rate_code
                )
            elif event == "string":
                if prefix == unit_prefix:
                    prices[current_sku]["unit"] = value
                elif prefix == price_prefix:
                    prices[current_sku]["price"] = value
    return prices


# For each combination of location - size - os the file has a different sku.
# For each sku we have a price

# Maps the key path below "products.<sku>" in the offer file to the field name stored per sku
_PRODUCT_FIELDS = {
    "productFamily": "family",
    "attributes.location": "locationName",
    "attributes.locationType": "locationType",
    "attributes.instanceType": "size",
    "attributes.operatingSystem": "os",
    "attributes.usagetype": "usage_type",
    "attributes.preInstalledSw": "preInstalledSw",
    "attributes.regionCode": "location",
}

# only get prices of compute instances atm
_COMPUTE_FAMILIES = ("Compute Instance", "Dedicated Host")


def _collect_compute_skus(events):
    """
    Build ``{sku: {field: value}}`` from ijson ``(prefix, event, value)`` triples, keeping
    only products whose family contains one of ``_COMPUTE_FAMILIES``.
    """
    skus = {}
    current_sku = ""
    product_prefix = None
    field_by_prefix = {}

    for prefix, event, value in events:
        # The "terms" section holds no product data, so stop once it is reached
        if "terms" in prefix:
            break

        if (prefix, event) == ("products", "map_key"):
            current_sku = value
            skus[current_sku] = {"sku": value}
            # Prefixes only depend on the sku, so build them once per product instead of
            # once per parser event.
            product_prefix = f"products.{current_sku}"
            field_by_prefix = {
                f"{product_prefix}.{suffix}": field for suffix, field in _PRODUCT_FIELDS.items()
            }
        elif event == "string":
            field = field_by_prefix.get(prefix)
            if field is not None:
                skus[current_sku][field] = value
        elif event == "end_map" and prefix == product_prefix:
            # A product without a family cannot be a compute product, drop it as well
            family = skus[current_sku].get("family", "")
            if not any(name in family for name in _COMPUTE_FAMILIES):
                del skus[current_sku]

    return skus


def _build_price_tables(skus, prices):
    """
    Combine per-sku product attributes and per-sku prices into ``{size: {location: price}}``
    tables, one per operating system.
    """
    ec2_linux = defaultdict(OrderedDict)
    ec2_windows = defaultdict(OrderedDict)
    ec2_rhel = defaultdict(OrderedDict)
    ec2_rhel_ha = defaultdict(OrderedDict)
    ec2_suse = defaultdict(OrderedDict)

    os_map = {
        "Linux": ec2_linux,
        "Windows": ec2_windows,
        "RHEL": ec2_rhel,
        "SUSE": ec2_suse,
        "Red Hat Enterprise Linux with HA": ec2_rhel_ha,
    }

    for sku, product in skus.items():
        if product["locationType"] != "AWS Region":
            continue
        # skip any SQL
        if product["preInstalledSw"] != "NA":
            continue

        # Named os_name so that the "os" module is not shadowed
        os_name = product["os"]
        if os_name == "NA":
            continue

        os_dict = os_map.get(os_name)
        # new OS, until it is documented skip it
        if os_dict is None:
            print(f"Unexpected OS {os_name}")
            continue

        size = product["size"]
        location = product["location"]

        # size is first seen
        if not os_dict.get(size):
            os_dict[size] = {}

        # if price already exists pick the BoxUsage usage type which means on demand
        if os_dict[size].get(location) and "BoxUsage" not in product["usage_type"]:
            continue

        try:
            os_dict[size][location] = float(prices[sku]["price"])
        except ValueError:
            # if price is not a number then label it as not available
            os_dict[size][location] = "n/a"
        except KeyError:
            # This sku has no on demand price (size is available only reserved). Only drop
            # the size when nothing has been recorded for it; deleting it unconditionally
            # would throw away prices already collected for other locations.
            if not os_dict[size]:
                del os_dict[size]

    return {
        "ec2_linux": ec2_linux,
        "ec2_windows": ec2_windows,
        "ec2_rhel": ec2_rhel,
        "ec2_suse": ec2_suse,
        "ec2_rhel_ha": ec2_rhel_ha,
    }


def scrape_ec2_pricing():
    """
    Parse the offer file and return the on demand EC2 prices grouped by operating system.

    For each combination of location - size - os the file has a different sku, and for
    each sku there is a price.

    :return: Dictionary with the keys "ec2_linux", "ec2_windows", "ec2_rhel", "ec2_suse"
             and "ec2_rhel_ha", each mapping ``size -> {location: price}`` where price is
             a float, or "n/a" when the listed price is not a number.
    """
    prices = get_all_prices()
    json_file, _ = get_json()

    # ijson expects a file object opened in binary mode
    with open(json_file, "rb") as f:
        print("Starting to parse pricing data, this could take up to 15 minutes...")
        # use parser because file is very large
        parser = ijson.parse(f, buf_size=IJSON_BUF_SIZE)
        skus = _collect_compute_skus(tqdm.tqdm(parser))

    return _build_price_tables(skus, prices)


def update_pricing_file(pricing_file_path, pricing_data):
    """
    Merge ``pricing_data`` into the "compute" section of the JSON pricing file.

    The file is rewritten (sorted, 4 space indent, refreshed "updated" timestamp) only when
    the merge actually changes the data.

    :param pricing_file_path: Path to the JSON pricing file to update in place.
    :param pricing_data: Dictionary merged into ``data["compute"]`` via ``dict.update``.
    """
    with open(pricing_file_path, encoding="utf-8") as fp:
        data = json.load(fp)

    original_data = copy.deepcopy(data)

    data["compute"].update(pricing_data)

    if data == original_data:
        # Nothing has changed, bail out early and don't update "updated" attribute
        print("Nothing has changed, skipping update.")
        return

    data["updated"] = int(time.time())

    # Always sort the pricing info
    data = sort_nested_dict(data)

    content = json.dumps(data, indent=4)
    content = "\n".join(line.rstrip() for line in content.splitlines())

    with open(pricing_file_path, "w", encoding="utf-8") as fp:
        fp.write(content)


def sort_nested_dict(value):
    """
    Recursively sort a nested dict.

    :param value: Dictionary to sort; nested dictionaries are sorted as well.
    :return: New ``OrderedDict`` with keys ordered by :func:`sort_key_by_numeric_other`.
    """
    result = OrderedDict()

    # The loop variable is deliberately not called "value" so the parameter is not shadowed
    for key, item in sorted(value.items(), key=sort_key_by_numeric_other):
        # OrderedDict is a dict subclass, so a single isinstance check covers both
        result[key] = sort_nested_dict(item) if isinstance(item, dict) else item

    return result


def sort_key_by_numeric_other(key_value):
    """
    Split key into numeric, alpha and other part and sort accordingly.

    :param key_value: ``(key, value)`` pair; only the key (a string) is inspected.
    :return: Tuple of ``(numeric, alpha, other)`` tuples, one per regex match in the key.
    """
    result = []

    for numeric, alpha, other in RE_NUMERIC_OTHER.findall(key_value[0]):
        # Parts without digits get -1 so they sort before any numeric part
        numeric = int(numeric) if numeric else -1
        # Known instance sizes are ordered by their position in INSTANCE_SIZES; the index is
        # stringified so it stays comparable with plain alphabetic parts.
        if alpha in INSTANCE_SIZES:
            alpha = str(INSTANCE_SIZES.index(alpha))
        result.append((numeric, alpha, other))

    return tuple(result)


def main():
    """
    Scrape the EC2 pricing data and write it to the pricing file at ``PRICING_FILE_PATH``.
    """
    print(
        "Scraping EC2 pricing data (if this runs for the first time "
        "it has to download a 3GB file, depending on your bandwith "
        "it might take a while)...."
    )

    pricing_data = scrape_ec2_pricing()
    update_pricing_file(pricing_file_path=PRICING_FILE_PATH, pricing_data=pricing_data)

    print("Pricing data updated")


if __name__ == "__main__":
    main()

contrib/scrape-ec2-prices.py (231 lines of code) (raw):