scripts/health_check.py (156 lines of code) (raw):

#!/usr/bin/env python3
"""Query an environment's version and monitor endpoints and report failures.

The script requests ``__version__`` and ``services/monitor.json`` from the
base URL of the selected environment, optionally writes the collected JSON to
a file, and raises ``ValueError`` when any monitor reports ``state`` False.
"""

import argparse
import http.client
import json
import time
import urllib.error
import urllib.request
import urllib.response
from enum import Enum
from typing import Optional, Union


ENV_ENUM = Enum(
    'ENV',
    [
        ('dev', 'https://addons-dev.allizom.org'),
        ('stage', 'https://addons.allizom.org'),
        ('prod', 'https://addons.mozilla.org'),
        # For local environments hit the nginx container as set in
        # docker-compose.yml
        ('container', 'http://nginx'),
        ('host', 'http://127.0.0.1:80'),
    ],
)


class Fetcher:
    """Fetch JSON documents from one environment, retrying transient errors."""

    def __init__(
        self,
        env: Union[str, ENV_ENUM],
        verbose: bool = False,
        retries: int = 5,
        backoff_factor: float = 0.1,
        status_forcelist: Optional[list[int]] = None,
    ):
        """Configure the fetcher.

        Args:
            env: Name of an ``ENV_ENUM`` member, or the member itself.
            verbose: When true, ``log`` prints its arguments.
            retries: Total number of attempts ``_fetch`` makes per request.
            backoff_factor: Multiplier for the exponential wait between
                attempts (``backoff_factor * 2**attempt`` seconds).
            status_forcelist: HTTP status codes that are retried when the
                error body cannot be parsed as JSON. ``None`` selects
                ``[502, 503, 504]``; an explicit empty list disables
                status-based retries.

        Raises:
            KeyError: If ``env`` is a string that names no ``ENV_ENUM`` member.
        """
        # Accept either the member name (what the CLI passes) or the member
        # itself, which is what the annotation has always advertised.
        self.environment = env if isinstance(env, ENV_ENUM) else ENV_ENUM[env]
        self.verbose = verbose
        self.retries = retries
        self.backoff_factor = backoff_factor
        # Compare against None so that an explicit [] is respected instead of
        # being silently replaced by the defaults.
        self.status_forcelist = (
            status_forcelist if status_forcelist is not None else [502, 503, 504]
        )
        self.timeout = 10

    def log(self, *args):
        """Print ``args`` when the fetcher is verbose; otherwise do nothing."""
        if self.verbose:
            print(*args)

    def _response(self, response):
        """Decode a response body as JSON.

        ``response`` must provide ``read()``, ``info()`` and ``url``; both the
        object returned by ``urlopen`` and ``urllib.error.HTTPError`` are
        passed here by ``_fetch``.

        Returns:
            A dict with the response ``url`` and the parsed ``data``.

        Raises:
            json.JSONDecodeError: If the body is not valid JSON.
            UnicodeDecodeError: If the body cannot be decoded with the
                declared charset (UTF-8 when none is declared).
        """
        raw_data = response.read()
        encoding = response.info().get_content_charset('utf-8')
        decoded_data = raw_data.decode(encoding)
        data = json.loads(decoded_data)
        self.log(json.dumps(data, indent=2))
        return {'url': response.url, 'data': data}

    def _fetch(self, path: str):
        """Request ``path`` from the environment and return the parsed JSON.

        Up to ``self.retries`` attempts are made. An HTTP error whose body is
        valid JSON is returned like a successful response. Otherwise HTTP
        errors are retried only for codes in ``self.status_forcelist``, while
        URL errors, dropped connections and timeouts are always retried while
        attempts remain.

        Raises:
            urllib.error.HTTPError, urllib.error.URLError,
            http.client.RemoteDisconnected, TimeoutError: When the request
                fails and is not (or can no longer be) retried.
            RuntimeError: If ``self.retries`` allows no attempt at all.
            Exception: Any other error is logged and re-raised immediately.
        """
        url = f'{self.environment.value}/{path}'

        for attempt in range(1, self.retries + 1):
            self.log(
                f'Attempt {attempt}/{self.retries}: '
                f'Requesting {url} for {self.environment.name}'
            )
            try:
                req = urllib.request.Request(url)
                with urllib.request.urlopen(req, timeout=self.timeout) as response:
                    return self._response(response)
            except (
                urllib.error.HTTPError,
                urllib.error.URLError,
                http.client.RemoteDisconnected,
                TimeoutError,
            ) as e:
                should_retry = False
                log_reason = ''

                if isinstance(e, urllib.error.HTTPError):
                    log_reason = f'status {e.code}'
                    try:
                        self.log(
                            f'Request failed with {log_reason}, '
                            'attempting to parse error response body.'
                        )
                        # A JSON error body is handed back to the caller
                        # instead of being retried.
                        return self._response(e)
                    except (
                        json.decoder.JSONDecodeError,
                        UnicodeDecodeError,
                    ) as parse_error:
                        self.log(f'Failed to parse error response body: {parse_error}')

                    if e.code in self.status_forcelist and attempt < self.retries:
                        should_retry = True
                else:
                    log_reason = str(e)
                    if attempt < self.retries:
                        should_retry = True

                if should_retry:
                    wait_time = self.backoff_factor * (2**attempt)
                    self.log(
                        f'Request failed with {log_reason}. '
                        f'Retrying in {wait_time:.2f} seconds...'
                    )
                    time.sleep(wait_time)
                    continue

                self.log(
                    f'Request failed with {log_reason}. '
                    'No more retries or not retryable.'
                )
                raise
            except Exception as e:
                self.log(f'An unexpected error occurred: {e}. No more retries.')
                raise

        # Every loop iteration returns, continues or raises, so this point is
        # reached only when the loop body never ran (retries <= 0).
        raise RuntimeError(
            f'Failed to fetch {url} after {max(self.retries, 0)} attempts'
        )

    def version(self):
        """Fetch the ``__version__`` document."""
        return self._fetch('__version__')

    def monitors(self):
        """Fetch the ``services/monitor.json`` document."""
        return self._fetch('services/monitor.json')


def main(
    env: Union[str, ENV_ENUM],
    verbose: bool = False,
    retries: int = 0,
    attempt: int = 0,
):
    """Run the health check, repeating it while monitors report failures.

    Args:
        env: Name of an ``ENV_ENUM`` member, or the member itself.
        verbose: Print progress information.
        retries: How many times the whole check may be repeated when a
            monitor reports a failure.
        attempt: Zero-based index of the first attempt; it also determines
            the wait (``2**attempt`` seconds) before the next one.

    Returns:
        A ``(results, has_failures)`` tuple. ``results`` holds the environment
        name plus the version and monitor documents from the last attempt.
    """
    # Store the plain name so the results stay JSON-serialisable even when an
    # enum member was passed in.
    env_name = env.name if isinstance(env, ENV_ENUM) else env
    fetcher = Fetcher(env, verbose)

    while True:
        version_data = fetcher.version()
        monitors_data = fetcher.monitors()
        has_failures = any(
            monitor['state'] is False
            for monitor in monitors_data.get('data', {}).values()
        )

        if not has_failures or attempt >= retries:
            break

        wait_for = 2**attempt
        if verbose:
            print(
                f'Monitors reported failures. Waiting {wait_for} seconds before '
                f'retrying check (attempt {attempt + 1}/{retries})...'
            )
        time.sleep(wait_for)
        attempt += 1

    results = {
        'environment': env_name,
        'version': version_data,
        'monitors': monitors_data,
    }
    return results, has_failures


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--env', type=str, choices=list(ENV_ENUM.__members__.keys()), required=True
    )
    parser.add_argument('--output', type=str)
    parser.add_argument('--verbose', action='store_true')
    parser.add_argument(
        '--retries',
        type=int,
        default=3,
        help=(
            'Number of times to retry the *entire* health check if monitors report '
            'failures. Default is 3.'
        ),
    )
    args = parser.parse_args()

    data, has_failures = main(args.env, args.verbose, args.retries)

    if args.output:
        with open(args.output, 'w', encoding='utf-8') as f:
            json_data = json.dumps(data, indent=2)
            f.write(json_data)
        if args.verbose:
            print(f'Health check data saved to {args.output}')

    if has_failures:
        raise ValueError(f'Health check failed: {data}')