fbnet/command_runner/command_handler.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import asyncio
import functools
import inspect
import random
import re
import sys
import typing
from dataclasses import dataclass
from functools import wraps
from itertools import islice
from uuid import uuid4

from fb303_asyncio.FacebookBase import FacebookBase
from fbnet.command_runner_asyncio.CommandRunner import constants, ttypes
from fbnet.command_runner_asyncio.CommandRunner.Command import Iface as FcrIface

from .command_session import CommandSession, CapturedTimeMS
from .counters import Counters
from .exceptions import ensure_thrift_exception, convert_to_fcr_exception
from .global_namespace import GlobalNamespace
from .options import Option
from .utils import input_fields_validator


@dataclass(frozen=True)
class DeviceResult:
    """
    Class for passing information received after sending commands to a device.

    Includes CommandResult(s), captured communication/processing time, and any
    raised exception (remains None for exceptions that are returned instead).
    """

    device_response: typing.Union[
        typing.List[typing.Optional[ttypes.CommandResult]],
        typing.Optional[ttypes.CommandResult],
    ] = None
    captured_time_ms: CapturedTimeMS = CapturedTimeMS()
    exception: typing.Optional[Exception] = None


def _append_debug_info_to_exception(fn):
    @wraps(fn)
    async def wrapper(self, *args, **kwargs):
        try:
            return await fn(self, *args, **kwargs)
        except Exception as ex:
            # Retrieve the request uuid from the global namespace
            uuid = GlobalNamespace.get_request_uuid()
            # Exceptions defined in the command runner thrift spec have the
            # attribute 'message'
            if hasattr(ex, "message"):
                ex.message = await self.add_debug_info_to_error_message(  # noqa
                    error_msg=ex.message, uuid=uuid  # noqa
                )
                raise ex
            else:
                # Don't pass ex.message to the debug info to avoid a duplicate message
                debug_info = await self.add_debug_info_to_error_message(
                    error_msg="", uuid=uuid
                )
                # Append the message as a new arg instead of constructing a new
                # exception to account for exceptions having different required args
                ex.args = ex.args + (debug_info,)
                raise ex.with_traceback(sys.exc_info()[2])

    return wrapper


def _ensure_uuid(fn):
    """Make sure the 'uuid' parameter for both input and return is non-empty"""

    @wraps(fn)
    async def wrapper(*args, **kwargs):
        uuid = ""
        callargs = inspect.getcallargs(fn, *args, **kwargs)
        if "uuid" in callargs:
            uuid = callargs["uuid"] or uuid4().hex[:8]
            callargs["uuid"] = uuid

        # Note: this won't work for functions that specify positional-only or
        # kwarg-only parameters.
        result = await fn(**callargs)

        # Set the UUID on the resulting struct -- if it supports it: a map of,
        # or a raw, CommandResult or Session
        if isinstance(result, (ttypes.CommandResult, ttypes.Session)):
            result.uuid = uuid
        elif isinstance(result, dict):
            for val in result.values():
                # If the size of result is large, we can refactor to only check
                # the first element. (time taken: ~100ns * N)
                if not isinstance(val, ttypes.CommandResult):
                    break
                val.uuid = uuid

        return result

    return wrapper


class CommandHandler(Counters, FacebookBase, FcrIface):
    """
    Command implementation for the API defined in the command runner thrift spec
    """

    _COUNTER_PREFIX = "fbnet.command_runner"
    _LB_THRESHOLD = 100

    REMOTE_CALL_OVERHEAD = Option(
        "--remote_call_overhead",
        help="Overhead for running commands remotely (for bulk calls)",
        type=int,
        default=20,
    )

    LB_THRESHOLD = Option(
        "--lb_threshold",
        help="""Load balance threshold for bulk_run calls. If the number of
        devices is greater than this threshold, the requests are broken up and
        sent to other instances using the bulk_run_local() api""",
        type=int,
        default=100,
    )

    BULK_SESSION_LIMIT = Option(
        "--bulk_session_limit",
        help="""session limit above which we reject bulk run local calls""",
        type=int,
        default=200,
    )

    BULK_RETRY_LIMIT = Option(
        "--bulk_retry_limit",
        help="""number of times to retry a bulk call on the remote instances""",
        type=int,
        default=5,
    )

    BULK_RUN_JITTER = Option(
        "--bulk_run_jitter",
        help="""A random delay added to bulk commands to stagger the calls and
        distribute the load.""",
        type=int,
        default=5,
    )

    BULK_RETRY_DELAY_MIN = Option(
        "--bulk_retry_delay_min",
        help="""minimum number of seconds to wait before retrying""",
        type=int,
        default=5,
    )

    BULK_RETRY_DELAY_MAX = Option(
        "--bulk_retry_delay_max",
        help="""maximum number of seconds to wait before retrying""",
        type=int,
        default=10,
    )

    _bulk_session_count = 0

    def __init__(self, service, name=None):
        Counters.__init__(self, service, name)
        FacebookBase.__init__(self, service.app_name)
        FcrIface.__init__(self)

        self.service.register_stats_mgr(self)

    def cleanup(self):
        pass

    @classmethod
    def register_counters(cls, stats_mgr):
        stats_mgr.register_counter("bulk_run.remote")
        stats_mgr.register_counter("bulk_run.local")
        stats_mgr.register_counter("bulk_run.local.overload_error")

    def getCounters(self):
        ret = {}
        for key, value in self.counters.items():
            if not key.startswith(self._COUNTER_PREFIX):
                key = self._COUNTER_PREFIX + "." + key
            ret[key] = value() if callable(value) else value
        return ret

    @classmethod
    def _set_bulk_session_count(cls, new_count: int) -> None:
        """Method to set the class variable _bulk_session_count"""
        cls._bulk_session_count = new_count

    async def add_debug_info_to_error_message(self, uuid, error_msg=None):
        if not error_msg:
            return f"(DebugInfo: thrift_uuid={uuid})"
        return f"{error_msg} (DebugInfo: thrift_uuid={uuid})"

    @ensure_thrift_exception
    @input_fields_validator
    @_append_debug_info_to_exception
    @_ensure_uuid
    async def run(
        self, command, device, timeout, open_timeout, client_ip, client_port, uuid
    ) -> ttypes.CommandResult:
        result = await self._run_commands(
            [command], device, timeout, open_timeout, client_ip, client_port, uuid
        )

        GlobalNamespace.set_api_captured_time_ms(
            GlobalNamespace.get_api_captured_time_ms() + result.captured_time_ms
        )

        # Raise the exception here if needed, since _run_commands does not raise
        # and exceptions are not returned
        if isinstance(result.exception, Exception):
            raise result.exception

        cmd_result = result.device_response
        if isinstance(cmd_result, list):
            cmd_result = result.device_response[0]  # pyre-ignore checked is list

        return cmd_result

    def _bulk_failure(self, device_to_commands, message):
        def command_failures(cmds):
            return [
                ttypes.CommandResult(
                    output=message, status=constants.FAILURE_STATUS, command=cmd
                )
                for cmd in cmds
            ]

        return {
            self._get_result_key(dev): command_failures(cmds)
            for dev, cmds in device_to_commands.items()
        }

    @ensure_thrift_exception
    @input_fields_validator
    @_append_debug_info_to_exception
    @_ensure_uuid
    async def bulk_run_v2(
        self, request: ttypes.BulkRunCommandRequest
    ) -> ttypes.BulkRunCommandResponse:
        device_to_commands = {}
        for device_commands in request.device_commands_list:
            device_details = device_commands.device
            # Since this is in the handler, is it okay to have a struct as a key?
            device_to_commands[device_details] = device_commands.commands

        result = await self.bulk_run(
            device_to_commands,
            request.timeout,
            request.open_timeout,
            request.client_ip,
            request.client_port,
            request.uuid,
        )

        response = ttypes.BulkRunCommandResponse()
        response.device_to_result = result
        return response

    @ensure_thrift_exception
    @input_fields_validator
    @_append_debug_info_to_exception
    @_ensure_uuid
    async def bulk_run(
        self, device_to_commands, timeout, open_timeout, client_ip, client_port, uuid
    ) -> typing.Dict[str, typing.List[ttypes.CommandResult]]:
        if (len(device_to_commands) < self.LB_THRESHOLD) and (
            self._bulk_session_count < self.BULK_SESSION_LIMIT
        ):
            # Run these commands locally.
            self.incrementCounter("bulk_run.local")
            return await self._bulk_run_local(
                device_to_commands, timeout, open_timeout, client_ip, client_port, uuid
            )

        async def _remote_task(chunk):
            # Run the chunk of commands on a remote instance
            self.incrementCounter("bulk_run.remote")
            retry_count = 0
            while True:
                try:
                    return await self._bulk_run_remote(
                        chunk, timeout, open_timeout, client_ip, client_port, uuid
                    )
                except Exception as e:
                    if isinstance(e, ttypes.InstanceOverloaded):
                        # The instance we ran the call on was overloaded. We can
                        # retry the command, hopefully on a different instance
                        self.incrementCounter("bulk_run.remote.overload_error")
                        self.logger.error(
                            "Instance Overloaded: %d: %s", retry_count, e
                        )

                    if (
                        self._remote_task_should_retry(ex=e)
                        and retry_count <= self.BULK_RETRY_LIMIT
                    ):
                        # Stagger the retries
                        delay = random.uniform(
                            self.BULK_RETRY_DELAY_MIN, self.BULK_RETRY_DELAY_MAX
                        )
                        await asyncio.sleep(delay)
                        retry_count += 1
                    else:
                        # Append the message as a new arg instead of constructing
                        # a new exception to account for exceptions having
                        # different required args
                        e.args = e.args + ("bulk_run_remote failed",)
                        return self._bulk_failure(chunk, str(e))

        # Split the request into chunks and run them on remote hosts
        tasks = [
            _remote_task(chunk)
            for chunk in self._chunked_dict(device_to_commands, self.LB_THRESHOLD)
        ]

        all_results = {}
        for task in asyncio.as_completed(tasks, loop=self.loop):
            result = await task
            all_results.update(result)

        return all_results

    @ensure_thrift_exception
    @_ensure_uuid
    async def bulk_run_local(
        self, device_to_commands, timeout, open_timeout, client_ip, client_port, uuid
    ):
        return await self._bulk_run_local(
            device_to_commands, timeout, open_timeout, client_ip, client_port, uuid
        )

    @_ensure_uuid
    async def _bulk_run_local(
        self, device_to_commands, timeout, open_timeout, client_ip, client_port, uuid
    ) -> typing.Dict[str, typing.List[ttypes.CommandResult]]:
        devices = sorted(device_to_commands.keys(), key=lambda d: d.hostname)

        session_count = self._bulk_session_count
        if session_count + len(device_to_commands) > self.BULK_SESSION_LIMIT:
            self.logger.error("Too many sessions open: %d", session_count)
            raise ttypes.InstanceOverloaded(
                message="Too many sessions open: %d" % session_count
            )

        self._set_bulk_session_count(self._bulk_session_count + len(devices))

        async def _run_one_device(device) -> DeviceResult:
            if not device_to_commands[device]:
                return DeviceResult(device_response=[])
            # Instead of running all commands at once, stagger the commands to
            # distribute the load
            delay = random.uniform(0, self.BULK_RUN_JITTER)
            await asyncio.sleep(delay)
            return await self._run_commands(
                device_to_commands[device],
                device,
                timeout,
                open_timeout,
                client_ip,
                client_port,
                uuid,
                return_exceptions=True,
            )

        captured_time_ms_list = []
        try:
            commands = []
            for device in devices:
                commands.append(_run_one_device(device))

            # Run commands in parallel
            results: typing.List[DeviceResult] = await asyncio.gather(
                *commands, loop=self.loop, return_exceptions=True
            )

            # pyre-ignore: _run_commands always returns a list of CommandResults
            # when return_exceptions is True
            cmd_results: typing.List[typing.List[ttypes.CommandResult]] = [
                result.device_response for result in results
            ]

            # Get the captured communication/processing time to save later in
            # the finally block
            captured_time_ms_list = [result.captured_time_ms for result in results]
        finally:
            self._set_bulk_session_count(self._bulk_session_count - len(devices))
            GlobalNamespace.set_api_captured_time_ms(
                functools.reduce(
                    CapturedTimeMS.__add__,
                    [GlobalNamespace.get_api_captured_time_ms()]
                    + captured_time_ms_list,
                )
            )

        return {
            self._get_result_key(dev): res for dev, res in zip(devices, cmd_results)
        }

    @ensure_thrift_exception
    @input_fields_validator
    @_append_debug_info_to_exception
    @_ensure_uuid
    async def open_session(
        self, device, open_timeout, idle_timeout, client_ip, client_port, uuid
    ):
        return await self._open_session(
            device,
            open_timeout,
            idle_timeout,
            client_ip,
            client_port,
            uuid,
            raw_session=False,
        )

    @ensure_thrift_exception
    @input_fields_validator
    @_append_debug_info_to_exception
    @_ensure_uuid
    async def run_session(
        self, session, command, timeout, client_ip, client_port, uuid
    ):
        return await self._run_session(
            session, command, timeout, client_ip, client_port, uuid
        )

    @ensure_thrift_exception
    @input_fields_validator
    @_append_debug_info_to_exception
    @_ensure_uuid
    async def close_session(self, session, client_ip, client_port, uuid):
        closed_session = None
        try:
            closed_session = CommandSession.get(session.id, client_ip, client_port)
            # Reset the captured time field so we don't include
            # the time from a previous API call
            closed_session.captured_time_ms.reset_time()
            await closed_session.close()
        except Exception as e:
            # Append the message as a new arg instead of constructing a new
            # exception to account for exceptions having different required args
            e.args = e.args + ("close_session failed",)
            raise e
        finally:
            if closed_session:
                GlobalNamespace.set_api_captured_time_ms(
                    GlobalNamespace.get_api_captured_time_ms()
                    + closed_session.captured_time_ms
                )

    @ensure_thrift_exception
    @_append_debug_info_to_exception
    @_ensure_uuid
    async def open_raw_session(
        self, device, open_timeout, idle_timeout, client_ip, client_port, uuid
    ):
        return await self._open_session(
            device,
            open_timeout,
            idle_timeout,
            client_ip,
            client_port,
            uuid,
            raw_session=True,
        )

    @ensure_thrift_exception
    @_append_debug_info_to_exception
    @_ensure_uuid
    async def run_raw_session(
        self, tsession, command, timeout, prompt_regex, client_ip, client_port, uuid
    ):
        if not prompt_regex:
            raise ttypes.SessionException(message="prompt_regex not specified")

        prompt_re = re.compile(prompt_regex.encode("utf8"), re.M)

        return await self._run_session(
            tsession, command, timeout, client_ip, client_port, uuid, prompt_re
        )

    @ensure_thrift_exception
    @_append_debug_info_to_exception
    @_ensure_uuid
    async def close_raw_session(self, tsession, client_ip, client_port, uuid):
        return await self.close_session(tsession, client_ip, client_port, uuid)

    async def _open_session(
        self,
        device,
        open_timeout,
        idle_timeout,
        client_ip,
        client_port,
        uuid,
        raw_session=False,
    ):
        options = self._get_options(
            device,
            client_ip,
            client_port,
            open_timeout,
            idle_timeout,
            raw_session=raw_session,
        )

        session = None
        try:
            devinfo = await self._lookup_device(device)
            session = await devinfo.setup_session(
                self.service, device, options, loop=self.loop
            )

            return ttypes.Session(
                id=session.id, name=session.hostname, hostname=device.hostname
            )
        except Exception as e:
            # Append the message as a new arg instead of constructing a new
            # exception to account for exceptions having different required args
            e.args = e.args + ("open_session failed",)
            raise e
        finally:
            if session:
                GlobalNamespace.set_api_captured_time_ms(
                    GlobalNamespace.get_api_captured_time_ms()
                    + session.captured_time_ms
                )

    async def _run_session(
        self, tsession, command, timeout, client_ip, client_port, uuid, prompt_re=None
    ):
        session = None
        captured_time_ms = CapturedTimeMS()
        try:
            session = CommandSession.get(tsession.id, client_ip, client_port)
            # Reset the captured time field so we don't include
            # the time from a previous API call
            session.captured_time_ms.reset_time()
            return await self._run_command(session, command, timeout, uuid, prompt_re)
        except Exception as e:
            # Append the message as a new arg instead of constructing a new
            # exception to account for exceptions having different required args
            e.args = e.args + ("run_session failed",)
            raise e
        finally:
            if session:
                captured_time_ms = session.captured_time_ms
            GlobalNamespace.set_api_captured_time_ms(
                GlobalNamespace.get_api_captured_time_ms() + captured_time_ms
            )

    def _get_result_key(self, device):
        # TODO: just returning the hostname for now. Some additional processing
        # may be required, e.g. using shortnames, adding console info, etc.
        return device.hostname

    async def _run_command(self, session, command, timeout, uuid, prompt_re=None):
        self.logger.info(f"[request_id={uuid}]: Run command with session {session.id}")
        output = await session.run_command(
            command.encode("utf8"), timeout=timeout, prompt_re=prompt_re
        )
        return session.build_result(
            output=output.decode("utf8", errors="ignore"),
            status=session.exit_status or constants.SUCCESS_STATUS,
            command=command,
        )

    async def _run_commands(
        self,
        commands,
        device,
        timeout,
        open_timeout,
        client_ip,
        client_port,
        uuid,
        return_exceptions=False,
    ) -> DeviceResult:
        options = self._get_options(
            device, client_ip, client_port, open_timeout, timeout
        )

        if device.command_prompts:
            options["command_prompts"] = {
                c.encode(): p.encode() for c, p in device.command_prompts.items()
            }

        command = commands[0]
        devinfo = None
        session = None
        results = []
        captured_time_ms = CapturedTimeMS()
        exc = None

        try:
            devinfo = await self._lookup_device(device)
            async with devinfo.create_session(
                self.service, device, options, loop=self.loop
            ) as session:
                for command in commands:
                    result = await self._run_command(session, command, timeout, uuid)
                    results.append(result)
        except Exception as e:
            await self._record_error(e, command, uuid, options, devinfo, session)
            if return_exceptions:
                if not isinstance(e, ttypes.SessionException):
                    e = convert_to_fcr_exception(e)
                    e = ttypes.SessionException(message=f"{e!s}", code=e._CODE)
                e.message = await self.add_debug_info_to_error_message(  # noqa
                    error_msg=e.message, uuid=uuid  # noqa
                )
                results = [
                    ttypes.CommandResult(output="", status="%r" % e, command=command)
                ]
            else:
                # Save the exception from the original place so we have the full
                # stacktrace when we raise it at the API level later
                exc = e

        if session:
            captured_time_ms = session.captured_time_ms

        return DeviceResult(
            device_response=results,
            captured_time_ms=captured_time_ms,
            exception=exc,
        )

    def _chunked_dict(self, data, chunk_size):
        """Split the dict into smaller dicts"""
        items = iter(data.items())  # get an iterator over the items
        for _ in range(0, len(data), chunk_size):
            yield dict(islice(items, chunk_size))

    async def _bulk_run_remote(
        self, device_to_commands, timeout, open_timeout, client_ip, client_port, uuid
    ):
        # Determine a timeout for the remote call.
        call_timeout = open_timeout + timeout
        remote_timeout = timeout - self.REMOTE_CALL_OVERHEAD

        # Make sure we have a sane timeout value
        assert remote_timeout > 10, (
            "timeout: '%d' value too low for bulk_run" % timeout
        )

        fcr_client = await self._get_fcr_client(timeout=call_timeout)
        async with fcr_client as client:
            result = await client.bulk_run_local(
                device_to_commands,
                remote_timeout,
                open_timeout,
                client_ip,
                client_port,
                uuid,
            )
            return result

    async def _lookup_device(self, device):
        return await self.service.device_db.get(device)

    async def _get_fcr_client(self, timeout):
        return await self.service.get_fcr_client(timeout=timeout)

    def _get_options(
        self,
        device,
        client_ip,
        client_port,
        open_timeout,
        idle_timeout,
        raw_session=False,
    ):
        options = {
            "username": self._get_device_username(device),
            "password": self._get_device_password(device),
            "console": device.console,
            "command_prompts": {},
            "client_ip": client_ip,
            "client_port": client_port,
            "mgmt_ip": device.mgmt_ip or False,
            "open_timeout": open_timeout,
            "idle_timeout": idle_timeout,
            "ip_address": device.ip_address,
            "session_type": device.session_type,
            "device": device,
            "raw_session": raw_session,
            "clear_command": device.clear_command,
        }

        if device.command_prompts:
            options["command_prompts"] = {
                c.encode(): p.encode() for c, p in device.command_prompts.items()
            }

        return options

    def _get_device_username(self, device):
        return device.username

    def _get_device_password(self, device):
        # If the username is specified then the password must also be specified.
        if device.username:
            return self._decrypt(device.password)

    def _decrypt(self, data):
        return data

    async def _record_error(
        self, error, command, uuid, options, devinfo, session, **kwargs
    ):
        """
        Subclasses can override this method to export error messages of interest
        to a proper data store
        """
        pass

    def _remote_task_should_retry(self, ex: Exception) -> bool:
        """
        Decides whether a remote task (fan-out request) should be retried
        """
        if not ex:
            return False

        if isinstance(ex, ttypes.InstanceOverloaded):
            return True

        return False
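For context, a small self-contained sketch (separate from the file above) of the chunking behavior that bulk_run relies on: _chunked_dict yields LB_THRESHOLD-sized slices of the device-to-commands map, which bulk_run then fans out to remote instances via _remote_task. The standalone function and the sample hostnames below are illustrative only.

from itertools import islice


def chunked_dict(data, chunk_size):
    """Standalone copy of CommandHandler._chunked_dict, for illustration."""
    items = iter(data.items())
    for _ in range(0, len(data), chunk_size):
        yield dict(islice(items, chunk_size))


# Hypothetical device-to-commands map: 5 devices split into chunks of 2.
device_to_commands = {f"rsw{i:03d}.example": ["show version"] for i in range(5)}
for chunk in chunked_dict(device_to_commands, chunk_size=2):
    print(sorted(chunk))  # each chunk holds at most 2 devices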