azurelinuxagent/common/protocol/wire.py (892 lines of code) (raw):
# Microsoft Azure Linux Agent
#
# Copyright 2018 Microsoft Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Requires Python 2.6+ and Openssl 1.0+
import json
import os
import random
import shutil
import time
import zipfile
from collections import defaultdict
from datetime import datetime, timedelta
from xml.sax import saxutils
from azurelinuxagent.common import conf
from azurelinuxagent.common import logger
from azurelinuxagent.common.utils import textutil
from azurelinuxagent.common.agent_supported_feature import get_agent_supported_features_list_for_crp, SupportedFeatureNames
from azurelinuxagent.common.datacontract import validate_param
from azurelinuxagent.common.event import add_event, WALAEventOperation, report_event, \
CollectOrReportEventDebugInfo, add_periodic
from azurelinuxagent.common.exception import ProtocolNotFoundError, \
ResourceGoneError, ExtensionDownloadError, InvalidContainerError, ProtocolError, HttpError, ExtensionErrorCodes
from azurelinuxagent.common.future import httpclient, bytebuffer, ustr
from azurelinuxagent.common.protocol.goal_state import GoalState, TRANSPORT_CERT_FILE_NAME, TRANSPORT_PRV_FILE_NAME, GoalStateProperties
from azurelinuxagent.common.protocol.hostplugin import HostPluginProtocol
from azurelinuxagent.common.protocol.restapi import DataContract, ProvisionStatus, VMInfo, VMStatus
from azurelinuxagent.common.telemetryevent import GuestAgentExtensionEventsSchema
from azurelinuxagent.common.utils import fileutil, restutil
from azurelinuxagent.common.utils.cryptutil import CryptUtil
from azurelinuxagent.common.utils.restutil import TELEMETRY_THROTTLE_DELAY_IN_SECONDS, \
TELEMETRY_FLUSH_THROTTLE_DELAY_IN_SECONDS, TELEMETRY_DATA
from azurelinuxagent.common.utils.textutil import parse_doc, findall, find, \
findtext, gettext, remove_bom, get_bytes_from_pem, parse_json
from azurelinuxagent.common.version import AGENT_NAME, CURRENT_VERSION
VERSION_INFO_URI = "http://{0}/?comp=versions"
HEALTH_REPORT_URI = "http://{0}/machine?comp=health"
ROLE_PROP_URI = "http://{0}/machine?comp=roleProperties"
TELEMETRY_URI = "http://{0}/machine?comp={1}"
PROTOCOL_VERSION = "2012-11-30"
ENDPOINT_FINE_NAME = "WireServer"
SHORT_WAITING_INTERVAL = 1 # 1 second
MAX_EVENT_BUFFER_SIZE = 2 ** 16 - 2 ** 10
_DOWNLOAD_TIMEOUT = timedelta(minutes=5)
class UploadError(HttpError):
pass
class WireProtocol(DataContract):
def __init__(self, endpoint):
if endpoint is None:
raise ProtocolError("WireProtocol endpoint is None")
self.client = WireClient(endpoint)
def detect(self, init_goal_state=True, save_to_history=False):
self.client.check_wire_protocol_version()
trans_prv_file = os.path.join(conf.get_lib_dir(),
TRANSPORT_PRV_FILE_NAME)
trans_cert_file = os.path.join(conf.get_lib_dir(),
TRANSPORT_CERT_FILE_NAME)
cryptutil = CryptUtil(conf.get_openssl_cmd())
cryptutil.gen_transport_cert(trans_prv_file, trans_cert_file)
# Initialize the goal state, including all the inner properties
if init_goal_state:
logger.info('Initializing goal state during protocol detection')
#
# TODO: Currently protocol detection retrieves the entire goal state. This is not needed; in particular, retrieving the Extensions goal state
# is not needed. However, the goal state is cached in self.client._goal_state and other components, including the Extension Handler,
# depend on this cached value. This has been a long-standing issue that causes multiple problems. Before removing the cached goal state,
# though, a careful review of these dependencies is needed. One of the problems of fetching the full goal state is that issues while
# retrieving it can block protocol detection and make the Agent go into a retry loop that can last 1 full hour.
#
self.client.reset_goal_state(save_to_history=save_to_history)
def update_host_plugin_from_goal_state(self):
self.client.update_host_plugin_from_goal_state()
def get_endpoint(self):
return self.client.get_endpoint()
def get_vminfo(self):
goal_state = self.client.get_goal_state()
hosting_env = self.client.get_hosting_env()
vminfo = VMInfo()
vminfo.subscriptionId = None
vminfo.vmName = hosting_env.vm_name
vminfo.tenantName = hosting_env.deployment_name
vminfo.roleName = hosting_env.role_name
vminfo.roleInstanceName = goal_state.role_instance_id
return vminfo
def get_certs(self):
certificates = self.client.get_certs()
return certificates.cert_list
def get_goal_state(self):
return self.client.get_goal_state()
def report_provision_status(self, provision_status):
validate_param("provision_status", provision_status, ProvisionStatus)
if provision_status.status is not None:
self.client.report_health(provision_status.status,
provision_status.subStatus,
provision_status.description)
if provision_status.properties.certificateThumbprint is not None:
thumbprint = provision_status.properties.certificateThumbprint
self.client.report_role_prop(thumbprint)
def report_vm_status(self, vm_status):
validate_param("vm_status", vm_status, VMStatus)
self.client.status_blob.set_vm_status(vm_status)
self.client.upload_status_blob()
def report_event(self, events_iterator, flush=False):
return self.client.report_event(events_iterator, flush)
def upload_logs(self, logs):
self.client.upload_logs(logs)
def get_status_blob_data(self):
return self.client.status_blob.data
def _build_role_properties(container_id, role_instance_id, thumbprint):
xml = (u"<?xml version=\"1.0\" encoding=\"utf-8\"?>"
u"<RoleProperties>"
u"<Container>"
u"<ContainerId>{0}</ContainerId>"
u"<RoleInstances>"
u"<RoleInstance>"
u"<Id>{1}</Id>"
u"<Properties>"
u"<Property name=\"CertificateThumbprint\" value=\"{2}\" />"
u"</Properties>"
u"</RoleInstance>"
u"</RoleInstances>"
u"</Container>"
u"</RoleProperties>"
u"").format(container_id, role_instance_id, thumbprint)
return xml
def _build_health_report(incarnation, container_id, role_instance_id,
status, substatus, description):
# The max description that can be sent to WireServer is 4096 bytes.
# Exceeding this max can result in a failure to report health.
# To keep this simple, we will keep a 10% buffer and trim before
# encoding the description.
if description:
max_chars_before_encoding = 3686
len_before_trim = len(description)
description = description[:max_chars_before_encoding]
trimmed_char_count = len_before_trim - len(description)
if trimmed_char_count > 0:
logger.info(
'Trimmed health report description by {0} characters'.format(
trimmed_char_count
)
)
# Escape '&', '<' and '>'
description = saxutils.escape(ustr(description))
detail = u''
if substatus is not None:
substatus = saxutils.escape(ustr(substatus))
detail = (u"<Details>"
u"<SubStatus>{0}</SubStatus>"
u"<Description>{1}</Description>"
u"</Details>").format(substatus, description)
xml = (u"<?xml version=\"1.0\" encoding=\"utf-8\"?>"
u"<Health "
u"xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\""
u" xmlns:xsd=\"http://www.w3.org/2001/XMLSchema\">"
u"<GoalStateIncarnation>{0}</GoalStateIncarnation>"
u"<Container>"
u"<ContainerId>{1}</ContainerId>"
u"<RoleInstanceList>"
u"<Role>"
u"<InstanceId>{2}</InstanceId>"
u"<Health>"
u"<State>{3}</State>"
u"{4}"
u"</Health>"
u"</Role>"
u"</RoleInstanceList>"
u"</Container>"
u"</Health>"
u"").format(incarnation,
container_id,
role_instance_id,
status,
detail)
return xml
def ga_status_to_guest_info(ga_status):
"""
Convert VMStatus object to status blob format
"""
v1_ga_guest_info = {
"computerName": ga_status.hostname,
"osName": ga_status.osname,
"osVersion": ga_status.osversion,
"version": ga_status.version,
}
return v1_ga_guest_info
def __get_formatted_msg_for_status_reporting(msg, lang="en-US"):
return {
'lang': lang,
'message': msg
}
def _get_utc_timestamp_for_status_reporting(time_format="%Y-%m-%dT%H:%M:%SZ", timestamp=None):
timestamp = time.gmtime() if timestamp is None else timestamp
return time.strftime(time_format, timestamp)
def ga_status_to_v1(ga_status):
v1_ga_status = {
"version": ga_status.version,
"status": ga_status.status,
"formattedMessage": __get_formatted_msg_for_status_reporting(ga_status.message)
}
if ga_status.update_status is not None:
v1_ga_status["updateStatus"] = get_ga_update_status_to_v1(ga_status.update_status)
return v1_ga_status
def get_ga_update_status_to_v1(update_status):
v1_ga_update_status = {
"expectedVersion": update_status.expected_version,
"status": update_status.status,
"code": update_status.code,
"formattedMessage": __get_formatted_msg_for_status_reporting(update_status.message)
}
return v1_ga_update_status
def ext_substatus_to_v1(sub_status_list):
status_list = []
for substatus in sub_status_list:
status = {
"name": substatus.name,
"status": substatus.status,
"code": substatus.code,
"formattedMessage": __get_formatted_msg_for_status_reporting(substatus.message)
}
status_list.append(status)
return status_list
def ext_status_to_v1(ext_status):
if ext_status is None:
return None
timestamp = _get_utc_timestamp_for_status_reporting()
v1_sub_status = ext_substatus_to_v1(ext_status.substatusList)
v1_ext_status = {
"status": {
"name": ext_status.name,
"configurationAppliedTime": ext_status.configurationAppliedTime,
"operation": ext_status.operation,
"status": ext_status.status,
"code": ext_status.code,
"formattedMessage": __get_formatted_msg_for_status_reporting(ext_status.message)
},
"version": 1.0,
"timestampUTC": timestamp
}
if len(v1_sub_status) != 0:
v1_ext_status['status']['substatus'] = v1_sub_status
return v1_ext_status
def ext_handler_status_to_v1(ext_handler_status):
v1_handler_status = {
'handlerVersion': ext_handler_status.version,
'handlerName': ext_handler_status.name,
'status': ext_handler_status.status,
'code': ext_handler_status.code,
'useExactVersion': True
}
if ext_handler_status.message is not None:
v1_handler_status["formattedMessage"] = __get_formatted_msg_for_status_reporting(ext_handler_status.message)
v1_ext_status = ext_status_to_v1(ext_handler_status.extension_status)
if ext_handler_status.extension_status is not None and v1_ext_status is not None:
v1_handler_status["runtimeSettingsStatus"] = {
'settingsStatus': v1_ext_status,
'sequenceNumber': ext_handler_status.extension_status.sequenceNumber
}
# Add extension name if Handler supports MultiConfig
if ext_handler_status.supports_multi_config:
v1_handler_status["runtimeSettingsStatus"]["extensionName"] = ext_handler_status.extension_status.name
return v1_handler_status
def vm_artifacts_aggregate_status_to_v1(vm_artifacts_aggregate_status):
gs_aggregate_status = vm_artifacts_aggregate_status.goal_state_aggregate_status
if gs_aggregate_status is None:
return None
v1_goal_state_aggregate_status = {
"formattedMessage": __get_formatted_msg_for_status_reporting(gs_aggregate_status.message),
"timestampUTC": _get_utc_timestamp_for_status_reporting(timestamp=gs_aggregate_status.processed_time),
"inSvdSeqNo": gs_aggregate_status.in_svd_seq_no,
"status": gs_aggregate_status.status,
"code": gs_aggregate_status.code
}
v1_artifact_aggregate_status = {
"goalStateAggregateStatus": v1_goal_state_aggregate_status
}
return v1_artifact_aggregate_status
def vm_status_to_v1(vm_status):
timestamp = _get_utc_timestamp_for_status_reporting()
v1_ga_guest_info = ga_status_to_guest_info(vm_status.vmAgent)
v1_ga_status = ga_status_to_v1(vm_status.vmAgent)
v1_vm_artifact_aggregate_status = vm_artifacts_aggregate_status_to_v1(
vm_status.vmAgent.vm_artifacts_aggregate_status)
v1_handler_status_list = []
for handler_status in vm_status.vmAgent.extensionHandlers:
v1_handler_status_list.append(ext_handler_status_to_v1(handler_status))
v1_agg_status = {
'guestAgentStatus': v1_ga_status,
'handlerAggregateStatus': v1_handler_status_list
}
if v1_vm_artifact_aggregate_status is not None:
v1_agg_status['vmArtifactsAggregateStatus'] = v1_vm_artifact_aggregate_status
v1_vm_status = {
'version': '1.1',
'timestampUTC': timestamp,
'aggregateStatus': v1_agg_status,
'guestOSInfo': v1_ga_guest_info
}
supported_features = []
for _, feature in get_agent_supported_features_list_for_crp().items():
supported_features.append(
{
"Key": feature.name,
"Value": feature.version
}
)
if vm_status.vmAgent.supports_fast_track:
supported_features.append(
{
"Key": SupportedFeatureNames.FastTrack,
"Value": "1.0" # This is a dummy version; CRP ignores it
}
)
if supported_features:
v1_vm_status["supportedFeatures"] = supported_features
return v1_vm_status
class StatusBlob(object):
def __init__(self, client):
self.vm_status = None
self.client = client
self.type = None
self.data = None
def set_vm_status(self, vm_status):
validate_param("vmAgent", vm_status, VMStatus)
self.vm_status = vm_status
def to_json(self):
report = vm_status_to_v1(self.vm_status)
return json.dumps(report)
__storage_version__ = "2014-02-14"
def prepare(self, blob_type):
logger.verbose("Prepare status blob")
self.data = self.to_json()
self.type = blob_type
def upload(self, url):
try:
if not self.type in ["BlockBlob", "PageBlob"]:
raise ProtocolError("Illegal blob type: {0}".format(self.type))
if self.type == "BlockBlob":
self.put_block_blob(url, self.data)
else:
self.put_page_blob(url, self.data)
return True
except Exception as e:
logger.verbose("Initial status upload failed: {0}", e)
return False
def get_block_blob_headers(self, blob_size):
return {
"Content-Length": ustr(blob_size),
"x-ms-blob-type": "BlockBlob",
"x-ms-date": _get_utc_timestamp_for_status_reporting(),
"x-ms-version": self.__class__.__storage_version__
}
def put_block_blob(self, url, data):
logger.verbose("Put block blob")
headers = self.get_block_blob_headers(len(data))
resp = self.client.call_storage_service(restutil.http_put, url, data, headers)
if resp.status != httpclient.CREATED:
raise UploadError(
"Failed to upload block blob: {0}".format(resp.status))
def get_page_blob_create_headers(self, blob_size):
return {
"Content-Length": "0",
"x-ms-blob-content-length": ustr(blob_size),
"x-ms-blob-type": "PageBlob",
"x-ms-date": _get_utc_timestamp_for_status_reporting(),
"x-ms-version": self.__class__.__storage_version__
}
def get_page_blob_page_headers(self, start, end):
return {
"Content-Length": ustr(end - start),
"x-ms-date": _get_utc_timestamp_for_status_reporting(),
"x-ms-range": "bytes={0}-{1}".format(start, end - 1),
"x-ms-page-write": "update",
"x-ms-version": self.__class__.__storage_version__
}
def put_page_blob(self, url, data):
logger.verbose("Put page blob")
# Convert string into bytes and align to 512 bytes
data = bytearray(data, encoding='utf-8')
page_blob_size = int((len(data) + 511) / 512) * 512
headers = self.get_page_blob_create_headers(page_blob_size)
resp = self.client.call_storage_service(restutil.http_put, url, "", headers)
if resp.status != httpclient.CREATED:
raise UploadError(
"Failed to clean up page blob: {0}".format(resp.status))
if url.count("?") <= 0:
url = "{0}?comp=page".format(url)
else:
url = "{0}&comp=page".format(url)
logger.verbose("Upload page blob")
page_max = 4 * 1024 * 1024 # Max page size: 4MB
start = 0
end = 0
while end < len(data):
end = min(len(data), start + page_max)
content_size = end - start
# Align to 512 bytes
page_end = int((end + 511) / 512) * 512
buf_size = page_end - start
buf = bytearray(buf_size)
buf[0: content_size] = data[start: end]
headers = self.get_page_blob_page_headers(start, page_end)
resp = self.client.call_storage_service(
restutil.http_put,
url,
bytebuffer(buf),
headers)
if resp is None or resp.status != httpclient.CREATED:
raise UploadError(
"Failed to upload page blob: {0}".format(resp.status))
start = end
def event_param_to_v1(param):
param_format = ustr('<Param Name="{0}" Value={1} T="{2}" />')
param_type = type(param.value)
attr_type = ""
if param_type is int:
attr_type = 'mt:uint64'
elif param_type is str:
attr_type = 'mt:wstr'
elif ustr(param_type).count("'unicode'") > 0:
attr_type = 'mt:wstr'
elif param_type is bool:
attr_type = 'mt:bool'
elif param_type is float:
attr_type = 'mt:float64'
return param_format.format(param.name,
saxutils.quoteattr(ustr(param.value)),
attr_type)
def event_to_v1_encoded(event, encoding='utf-8'):
params = ""
for param in event.parameters:
params += event_param_to_v1(param)
event_str = ustr('<Event id="{0}"><![CDATA[{1}]]></Event>').format(event.eventId, params)
return event_str.encode(encoding)
class WireClient(object):
def __init__(self, endpoint):
logger.info("Wire server endpoint:{0}", endpoint)
self._endpoint = endpoint
self._goal_state = None
self._host_plugin = None
self.status_blob = StatusBlob(self)
def get_endpoint(self):
return self._endpoint
def call_wireserver(self, http_req, *args, **kwargs):
try:
# Never use the HTTP proxy for wireserver
kwargs['use_proxy'] = False
resp = http_req(*args, **kwargs)
if restutil.request_failed(resp):
msg = "[Wireserver Failed] URI {0} ".format(args[0])
if resp is not None:
msg += " [HTTP Failed] Status Code {0}".format(resp.status)
raise ProtocolError(msg)
# If the GoalState is stale, pass along the exception to the caller
except ResourceGoneError:
raise
except Exception as e:
raise ProtocolError("[Wireserver Exception] {0}".format(ustr(e)))
return resp
def decode_config(self, data):
if data is None:
return None
data = remove_bom(data)
xml_text = ustr(data, encoding='utf-8')
return xml_text
def fetch_config(self, uri, headers):
resp = self.call_wireserver(restutil.http_get, uri, headers=headers)
return self.decode_config(resp.read())
@staticmethod
def call_storage_service(http_req, *args, **kwargs):
# Default to use the configured HTTP proxy
if not 'use_proxy' in kwargs or kwargs['use_proxy'] is None:
kwargs['use_proxy'] = True
return http_req(*args, **kwargs)
def fetch_artifacts_profile_blob(self, uri):
return self._fetch_content("artifacts profile blob", [uri], use_verify_header=False)[1] # _fetch_content returns a (uri, content) tuple
def fetch_manifest(self, manifest_type, uris, use_verify_header):
uri, content = self._fetch_content("{0} manifest".format(manifest_type), uris, use_verify_header=use_verify_header)
self.get_host_plugin().update_manifest_uri(uri)
return content
def _fetch_content(self, download_type, uris, use_verify_header):
"""
Walks the given list of 'uris' issuing HTTP GET requests; returns a tuple with the URI and the content of the first successful request.
The 'download_type' is added to any log messages produced by this method; it should describe the type of content of the given URIs
(e.g. "manifest", "extension package", etc).
"""
host_ga_plugin = self.get_host_plugin()
direct_download = lambda uri: self.fetch(uri)[0]
def hgap_download(uri):
request_uri, request_headers = host_ga_plugin.get_artifact_request(uri, use_verify_header=use_verify_header)
response, _ = self.fetch(request_uri, request_headers, use_proxy=False, retry_codes=restutil.HGAP_GET_EXTENSION_ARTIFACT_RETRY_CODES)
return response
return self._download_with_fallback_channel(download_type, uris, direct_download=direct_download, hgap_download=hgap_download)
def download_zip_package(self, package_type, uris, target_file, target_directory, use_verify_header):
"""
Downloads the ZIP package specified in 'uris' (which is a list of alternate locations for the ZIP), saving it to 'target_file' and then expanding
its contents to 'target_directory'. Deletes the target file after it has been expanded.
The 'package_type' is only used in log messages and has no other semantics. It should specify the contents of the ZIP, e.g. "extension package"
or "agent package"
The 'use_verify_header' parameter indicates whether the verify header should be added when using the extensionArtifact API of the HostGAPlugin.
"""
host_ga_plugin = self.get_host_plugin()
direct_download = lambda uri: self.stream(uri, target_file, headers=None, use_proxy=True)
def hgap_download(uri):
request_uri, request_headers = host_ga_plugin.get_artifact_request(uri, use_verify_header=use_verify_header, artifact_manifest_url=host_ga_plugin.manifest_uri)
return self.stream(request_uri, target_file, headers=request_headers, use_proxy=False)
on_downloaded = lambda: WireClient._try_expand_zip_package(package_type, target_file, target_directory)
self._download_with_fallback_channel(package_type, uris, direct_download=direct_download, hgap_download=hgap_download, on_downloaded=on_downloaded)
def _download_with_fallback_channel(self, download_type, uris, direct_download, hgap_download, on_downloaded=None):
"""
Walks the given list of 'uris' issuing HTTP GET requests, attempting to download the content of each URI. The download is done using both the default and
the fallback channels, until one of them succeeds. The 'direct_download' and 'hgap_download' functions define the logic to do direct calls to the URI or
to use the HostGAPlugin as a proxy for the download. Initially the default channel is the direct download and the fallback channel is the HostGAPlugin,
but the default can be depending on the success/failure of each channel (see _download_using_appropriate_channel() for the logic to do this).
The 'download_type' is added to any log messages produced by this method; it should describe the type of content of the given URIs
(e.g. "manifest", "extension package, "agent package", etc).
When the download is successful, _download_with_fallback_channel invokes the 'on_downloaded' function, which can be used to process the results of the download. This
function should return True on success, and False on failure (it should not raise any exceptions). If the return value is False, the download is considered
a failure and the next URI is tried.
When the download succeeds, this method returns a (uri, response) tuple where the first item is the URI of the successful download and the second item is
the response returned by the successful channel (i.e. one of direct_download and hgap_download).
This method enforces a timeout (_DOWNLOAD_TIMEOUT) on the download and raises an exception if the limit is exceeded.
"""
logger.info("Downloading {0}", download_type)
start_time = datetime.now()
uris_shuffled = uris
random.shuffle(uris_shuffled)
most_recent_error = "None"
for index, uri in enumerate(uris_shuffled):
elapsed = datetime.now() - start_time
if elapsed > _DOWNLOAD_TIMEOUT:
message = "Timeout downloading {0}. Elapsed: {1} URIs tried: {2}/{3}. Last error: {4}".format(download_type, elapsed, index, len(uris), ustr(most_recent_error))
raise ExtensionDownloadError(message, code=ExtensionErrorCodes.PluginManifestDownloadError)
try:
# Disable W0640: OK to use uri in a lambda within the loop's body
response = self._download_using_appropriate_channel(lambda: direct_download(uri), lambda: hgap_download(uri)) # pylint: disable=W0640
if on_downloaded is not None:
on_downloaded()
return uri, response
except Exception as exception:
most_recent_error = exception
raise ExtensionDownloadError("Failed to download {0} from all URIs. Last error: {1}".format(download_type, ustr(most_recent_error)), code=ExtensionErrorCodes.PluginManifestDownloadError)
@staticmethod
def _try_expand_zip_package(package_type, target_file, target_directory):
logger.info("Unzipping {0}: {1}", package_type, target_file)
try:
zipfile.ZipFile(target_file).extractall(target_directory)
except Exception as exception:
logger.error("Error while unzipping {0}: {1}", package_type, ustr(exception))
if os.path.exists(target_directory):
try:
shutil.rmtree(target_directory)
except Exception as rmtree_exception:
logger.warn("Cannot delete {0}: {1}", target_directory, ustr(rmtree_exception))
raise
finally:
try:
os.remove(target_file)
except Exception as exception:
logger.warn("Cannot delete {0}: {1}", target_file, ustr(exception))
def stream(self, uri, destination, headers=None, use_proxy=None):
"""
Downloads the content of the given 'uri' and saves it to the 'destination' file.
"""
try:
logger.verbose("Fetch [{0}] with headers [{1}] to file [{2}]", uri, headers, destination)
response = self._fetch_response(uri, headers, use_proxy)
if response is not None and not restutil.request_failed(response):
chunk_size = 1024 * 1024 # 1MB buffer
with open(destination, 'wb', chunk_size) as destination_fh:
complete = False
while not complete:
chunk = response.read(chunk_size)
destination_fh.write(chunk)
complete = len(chunk) < chunk_size
return ""
except:
if os.path.exists(destination): # delete the destination file, in case we did a partial download
try:
os.remove(destination)
except Exception as exception:
logger.warn("Can't delete {0}: {1}", destination, ustr(exception))
raise
def fetch(self, uri, headers=None, use_proxy=None, decode=True, retry_codes=None, ok_codes=None):
"""
Returns a tuple with the content and headers of the response. The headers are a list of (name, value) tuples.
"""
logger.verbose("Fetch [{0}] with headers [{1}]", uri, headers)
content = None
response_headers = None
response = self._fetch_response(uri, headers, use_proxy, retry_codes=retry_codes, ok_codes=ok_codes)
if response is not None and not restutil.request_failed(response, ok_codes=ok_codes):
response_content = response.read()
content = self.decode_config(response_content) if decode else response_content
response_headers = response.getheaders()
return content, response_headers
def _fetch_response(self, uri, headers=None, use_proxy=None, retry_codes=None, ok_codes=None):
resp = None
try:
resp = self.call_storage_service(
restutil.http_get,
uri,
headers=headers,
use_proxy=use_proxy,
retry_codes=retry_codes)
host_plugin = self.get_host_plugin()
if restutil.request_failed(resp, ok_codes=ok_codes):
error_response = restutil.read_response_error(resp)
msg = "Fetch failed from [{0}]: {1}".format(uri, error_response)
logger.warn(msg)
if host_plugin is not None:
host_plugin.report_fetch_health(uri,
is_healthy=not restutil.request_failed_at_hostplugin(resp),
source='WireClient',
response=error_response)
raise ProtocolError(msg)
else:
if host_plugin is not None:
host_plugin.report_fetch_health(uri, source='WireClient')
except (HttpError, ProtocolError, IOError) as error:
msg = "Fetch failed: {0}".format(error)
logger.warn(msg)
report_event(op=WALAEventOperation.HttpGet, is_success=False, message=msg, log_event=False)
raise
return resp
def update_host_plugin_from_goal_state(self):
"""
Fetches a new goal state and updates the Container ID and Role Config Name of the host plugin client
"""
if self._host_plugin is not None:
GoalState.update_host_plugin_headers(self)
def update_host_plugin(self, container_id, role_config_name):
if self._host_plugin is not None:
self._host_plugin.update_container_id(container_id)
self._host_plugin.update_role_config_name(role_config_name)
def update_goal_state(self, force_update=False, silent=False, save_to_history=False):
"""
Updates the goal state if the incarnation or etag changed
"""
try:
if self._goal_state is None:
self._goal_state = GoalState(self, silent=silent, save_to_history=save_to_history)
else:
self._goal_state.update(force_update=force_update, silent=silent)
except ProtocolError:
raise
except Exception as exception:
raise ProtocolError("Error fetching goal state: {0}".format(ustr(exception)))
def reset_goal_state(self, goal_state_properties=GoalStateProperties.All, silent=False, save_to_history=False):
"""
Resets the goal state
"""
try:
if not silent:
logger.info("Forcing an update of the goal state.")
self._goal_state = GoalState(self, goal_state_properties=goal_state_properties, silent=silent, save_to_history=save_to_history)
except ProtocolError:
raise
except Exception as exception:
raise ProtocolError("Error fetching goal state: {0}".format(ustr(exception)))
def get_goal_state(self):
if self._goal_state is None:
raise ProtocolError("Trying to fetch goal state before initialization!")
return self._goal_state
def get_hosting_env(self):
if self._goal_state is None:
raise ProtocolError("Trying to fetch Hosting Environment before initialization!")
return self._goal_state.hosting_env
def get_shared_conf(self):
if self._goal_state is None:
raise ProtocolError("Trying to fetch Shared Conf before initialization!")
return self._goal_state.shared_conf
def get_certs(self):
if self._goal_state is None:
raise ProtocolError("Trying to fetch Certificates before initialization!")
return self._goal_state.certs
def get_remote_access(self):
if self._goal_state is None:
raise ProtocolError("Trying to fetch Remote Access before initialization!")
return self._goal_state.remote_access
def check_wire_protocol_version(self):
uri = VERSION_INFO_URI.format(self.get_endpoint())
version_info_xml = self.fetch_config(uri, None)
version_info = VersionInfo(version_info_xml)
preferred = version_info.get_preferred()
if PROTOCOL_VERSION == preferred:
logger.info("Wire protocol version:{0}", PROTOCOL_VERSION)
elif PROTOCOL_VERSION in version_info.get_supported():
logger.info("Wire protocol version:{0}", PROTOCOL_VERSION)
logger.info("Server preferred version:{0}", preferred)
else:
error = ("Agent supported wire protocol version: {0} was not "
"advised by Fabric.").format(PROTOCOL_VERSION)
raise ProtocolNotFoundError(error)
def _call_hostplugin_with_container_check(self, host_func):
"""
Calls host_func on host channel and accounts for stale resource (ResourceGoneError or InvalidContainerError).
If stale, it refreshes the goal state and retries host_func.
"""
try:
return host_func()
except (ResourceGoneError, InvalidContainerError) as error:
host_plugin = self.get_host_plugin()
old_container_id, old_role_config_name = host_plugin.container_id, host_plugin.role_config_name
msg = "[PERIODIC] Request failed with the current host plugin configuration. " \
"ContainerId: {0}, role config file: {1}. Fetching new goal state and retrying the call." \
"Error: {2}".format(old_container_id, old_role_config_name, ustr(error))
logger.periodic_info(logger.EVERY_SIX_HOURS, msg)
self.update_host_plugin_from_goal_state()
new_container_id, new_role_config_name = host_plugin.container_id, host_plugin.role_config_name
msg = "[PERIODIC] Host plugin reconfigured with new parameters. " \
"ContainerId: {0}, role config file: {1}.".format(new_container_id, new_role_config_name)
logger.periodic_info(logger.EVERY_SIX_HOURS, msg)
try:
ret = host_func()
msg = "[PERIODIC] Request succeeded using the host plugin channel after goal state refresh. " \
"ContainerId changed from {0} to {1}, " \
"role config file changed from {2} to {3}.".format(old_container_id, new_container_id,
old_role_config_name, new_role_config_name)
add_periodic(delta=logger.EVERY_SIX_HOURS,
name=AGENT_NAME,
version=CURRENT_VERSION,
op=WALAEventOperation.HostPlugin,
is_success=True,
message=msg,
log_event=True)
return ret
except (ResourceGoneError, InvalidContainerError) as host_error:
msg = "[PERIODIC] Request failed using the host plugin channel after goal state refresh. " \
"ContainerId changed from {0} to {1}, role config file changed from {2} to {3}. " \
"Exception type: {4}.".format(old_container_id, new_container_id, old_role_config_name,
new_role_config_name, type(host_error).__name__)
add_periodic(delta=logger.EVERY_SIX_HOURS,
name=AGENT_NAME,
version=CURRENT_VERSION,
op=WALAEventOperation.HostPlugin,
is_success=False,
message=msg,
log_event=True)
raise
def _download_using_appropriate_channel(self, direct_download, hgap_download):
"""
Does a download using both the default and fallback channels. By default, the primary channel is direct, host channel is the fallback.
We call the primary channel first and return on success. If primary fails, we try the fallback. If fallback fails,
we return and *don't* switch the default channel. If fallback succeeds, we change the default channel.
"""
hgap_download_function_with_retry = lambda: self._call_hostplugin_with_container_check(hgap_download)
if HostPluginProtocol.is_default_channel:
primary_channel, secondary_channel = hgap_download_function_with_retry, direct_download
else:
primary_channel, secondary_channel = direct_download, hgap_download_function_with_retry
try:
return primary_channel()
except Exception as exception:
primary_channel_error = exception
try:
return_value = secondary_channel()
# Since the secondary channel succeeded, flip the default channel
HostPluginProtocol.is_default_channel = not HostPluginProtocol.is_default_channel
message = "Default channel changed to {0} channel.".format("HostGAPlugin" if HostPluginProtocol.is_default_channel else "Direct")
logger.info(message)
add_event(AGENT_NAME, op=WALAEventOperation.DefaultChannelChange, version=CURRENT_VERSION, is_success=True, message=message, log_event=False)
return return_value
except Exception as exception:
raise HttpError("Download failed both on the primary and fallback channels. Primary: [{0}] Fallback: [{1}]".format(ustr(primary_channel_error), ustr(exception)))
def upload_status_blob(self):
extensions_goal_state = self.get_goal_state().extensions_goal_state
if extensions_goal_state.status_upload_blob is None:
# the status upload blob is in ExtensionsConfig so force a full goal state refresh
self.reset_goal_state(silent=True, save_to_history=True)
extensions_goal_state = self.get_goal_state().extensions_goal_state
if extensions_goal_state.status_upload_blob is None:
raise ProtocolNotFoundError("Status upload uri is missing")
logger.info("Refreshed the goal state to get the status upload blob. New Goal State ID: {0}", extensions_goal_state.id)
blob_type = extensions_goal_state.status_upload_blob_type
try:
self.status_blob.prepare(blob_type)
except Exception as e:
raise ProtocolError("Exception creating status blob: {0}".format(ustr(e)))
# Swap the order of use for the HostPlugin vs. the "direct" route.
# Prefer the use of HostPlugin. If HostPlugin fails fall back to the
# direct route.
#
# The code previously preferred the "direct" route always, and only fell back
# to the HostPlugin *if* there was an error. We would like to move to
# the HostPlugin for all traffic, but this is a big change. We would like
# to see how this behaves at scale, and have a fallback should things go
# wrong. This is why we try HostPlugin then direct.
try:
host = self.get_host_plugin()
host.put_vm_status(self.status_blob, extensions_goal_state.status_upload_blob, extensions_goal_state.status_upload_blob_type)
return
except ResourceGoneError:
# refresh the host plugin client and try again on the next iteration of the main loop
self.update_host_plugin_from_goal_state()
return
except Exception as e:
# for all other errors, fall back to direct
msg = "Falling back to direct upload: {0}".format(ustr(e))
self.report_status_event(msg, is_success=True)
try:
if self.status_blob.upload(extensions_goal_state.status_upload_blob):
return
except Exception as e:
msg = "Exception uploading status blob: {0}".format(ustr(e))
self.report_status_event(msg, is_success=False)
raise ProtocolError("Failed to upload status blob via either channel")
def report_role_prop(self, thumbprint):
goal_state = self.get_goal_state()
role_prop = _build_role_properties(goal_state.container_id,
goal_state.role_instance_id,
thumbprint)
role_prop = role_prop.encode("utf-8")
role_prop_uri = ROLE_PROP_URI.format(self.get_endpoint())
headers = self.get_header_for_xml_content()
try:
resp = self.call_wireserver(restutil.http_post,
role_prop_uri,
role_prop,
headers=headers)
except HttpError as e:
raise ProtocolError((u"Failed to send role properties: "
u"{0}").format(e))
if resp.status != httpclient.ACCEPTED:
raise ProtocolError((u"Failed to send role properties: "
u",{0}: {1}").format(resp.status,
resp.read()))
def report_health(self, status, substatus, description):
goal_state = self.get_goal_state()
health_report = _build_health_report(goal_state.incarnation,
goal_state.container_id,
goal_state.role_instance_id,
status,
substatus,
description)
health_report = health_report.encode("utf-8")
health_report_uri = HEALTH_REPORT_URI.format(self.get_endpoint())
headers = self.get_header_for_xml_content()
try:
# 30 retries with 10s sleep gives ~5min for wireserver updates;
# this is retried 3 times with 15s sleep before throwing a
# ProtocolError, for a total of ~15min.
resp = self.call_wireserver(restutil.http_post,
health_report_uri,
health_report,
headers=headers,
max_retry=30,
retry_delay=15)
except HttpError as e:
raise ProtocolError((u"Failed to send provision status: "
u"{0}").format(e))
if restutil.request_failed(resp):
raise ProtocolError((u"Failed to send provision status: "
u",{0}: {1}").format(resp.status,
resp.read()))
def _send_encoded_event(self, provider_id, event_str, flush, encoding='utf8'):
uri = TELEMETRY_URI.format(self.get_endpoint(), TELEMETRY_DATA)
data_format_header = ustr('<?xml version="1.0"?><TelemetryData version="1.0"><Provider id="{0}">').format(
provider_id).encode(encoding)
data_format_footer = ustr('</Provider></TelemetryData>').encode(encoding)
# Event string should already be encoded by the time it gets here, to avoid double encoding,
# dividing it into parts.
data = data_format_header + event_str + data_format_footer
try:
header = self.get_header_for_xml_content()
# NOTE: The call to wireserver requests utf-8 encoding in the headers, but the body should not
# be encoded: some nodes in the telemetry pipeline do not support utf-8 encoding.
# if it's important event flush, we use less throttle delay(to avoid long delay to complete this operation)) on throttling errors
if flush:
resp = self.call_wireserver(restutil.http_post, uri, data, header, max_retry=3, throttle_delay=TELEMETRY_FLUSH_THROTTLE_DELAY_IN_SECONDS)
else:
resp = self.call_wireserver(restutil.http_post, uri, data, header, max_retry=3, throttle_delay=TELEMETRY_THROTTLE_DELAY_IN_SECONDS)
except HttpError as e:
raise ProtocolError("Failed to send events:{0}".format(e))
if restutil.request_failed(resp):
logger.verbose(resp.read())
raise ProtocolError(
"Failed to send events:{0}".format(resp.status))
def report_event(self, events_iterator, flush=False):
buf = {}
debug_info = CollectOrReportEventDebugInfo(operation=CollectOrReportEventDebugInfo.OP_REPORT)
events_per_provider = defaultdict(int)
def _send_event(provider_id, debug_info, flush):
try:
self._send_encoded_event(provider_id, buf[provider_id], flush)
except UnicodeError as uni_error:
debug_info.update_unicode_error(uni_error)
except Exception as error:
debug_info.update_op_error(error)
# Group events by providerId
for event in events_iterator:
try:
if event.providerId not in buf:
buf[event.providerId] = b""
event_str = event_to_v1_encoded(event)
if len(event_str) >= MAX_EVENT_BUFFER_SIZE:
# Ignore single events that are too large to send out
details_of_event = [ustr(x.name) + ":" + ustr(x.value) for x in event.parameters if x.name in
[GuestAgentExtensionEventsSchema.Name, GuestAgentExtensionEventsSchema.Version,
GuestAgentExtensionEventsSchema.Operation,
GuestAgentExtensionEventsSchema.OperationSuccess]]
logger.periodic_warn(logger.EVERY_HALF_HOUR,
"Single event too large: {0}, with the length: {1} more than the limit({2})"
.format(str(details_of_event), len(event_str), MAX_EVENT_BUFFER_SIZE))
continue
# If buffer is full, send out the events in buffer and reset buffer
if len(buf[event.providerId] + event_str) >= MAX_EVENT_BUFFER_SIZE:
logger.verbose("No of events this request = {0}".format(events_per_provider[event.providerId]))
_send_event(event.providerId, debug_info, flush)
buf[event.providerId] = b""
events_per_provider[event.providerId] = 0
# Add encoded events to the buffer
buf[event.providerId] = buf[event.providerId] + event_str
events_per_provider[event.providerId] += 1
except Exception as error:
logger.warn("Unexpected error when generating Events:{0}", textutil.format_exception(error))
# Send out all events left in buffer.
for provider_id in list(buf.keys()):
if buf[provider_id]:
logger.verbose("No of events this request = {0}".format(events_per_provider[provider_id]))
_send_event(provider_id, debug_info, flush)
debug_info.report_debug_info()
return debug_info.get_error_count() == 0
def report_status_event(self, message, is_success):
report_event(op=WALAEventOperation.ReportStatus,
is_success=is_success,
message=message,
log_event=not is_success)
def get_header(self):
return {
"x-ms-agent-name": "WALinuxAgent",
"x-ms-version": PROTOCOL_VERSION
}
def get_header_for_xml_content(self):
return {
"x-ms-agent-name": "WALinuxAgent",
"x-ms-version": PROTOCOL_VERSION,
"Content-Type": "text/xml;charset=utf-8"
}
def get_header_for_cert(self):
return self._get_header_for_encrypted_request("DES_EDE3_CBC")
def get_header_for_remote_access(self):
return self._get_header_for_encrypted_request("AES128_CBC")
def _get_header_for_encrypted_request(self, cypher):
trans_cert_file = os.path.join(conf.get_lib_dir(), TRANSPORT_CERT_FILE_NAME)
try:
content = fileutil.read_file(trans_cert_file)
except IOError as e:
raise ProtocolError("Failed to read {0}: {1}".format(trans_cert_file, e))
cert = get_bytes_from_pem(content)
return {
"x-ms-agent-name": "WALinuxAgent",
"x-ms-version": PROTOCOL_VERSION,
"x-ms-cipher-name": cypher,
"x-ms-guest-agent-public-x509-cert": cert
}
def get_host_plugin(self):
if self._host_plugin is None:
self._host_plugin = HostPluginProtocol(self.get_endpoint())
GoalState.update_host_plugin_headers(self)
return self._host_plugin
def get_on_hold(self):
return self.get_goal_state().extensions_goal_state.on_hold
def upload_logs(self, content):
host = self.get_host_plugin()
return host.put_vm_log(content)
class VersionInfo(object):
def __init__(self, xml_text):
"""
Query endpoint server for wire protocol version.
Fail if our desired protocol version is not seen.
"""
logger.verbose("Load Version.xml")
self.parse(xml_text)
def parse(self, xml_text):
xml_doc = parse_doc(xml_text)
preferred = find(xml_doc, "Preferred")
self.preferred = findtext(preferred, "Version")
logger.info("Fabric preferred wire protocol version:{0}",
self.preferred)
self.supported = []
supported = find(xml_doc, "Supported")
supported_version = findall(supported, "Version")
for node in supported_version:
version = gettext(node)
logger.verbose("Fabric supported wire protocol version:{0}",
version)
self.supported.append(version)
def get_preferred(self):
return self.preferred
def get_supported(self):
return self.supported
# Do not extend this class
class InVMArtifactsProfile(object):
"""
deserialized json string of InVMArtifactsProfile.
It is expected to contain the following fields:
* inVMArtifactsProfileBlobSeqNo
* profileId (optional)
* onHold (optional)
* certificateThumbprint (optional)
* encryptedHealthChecks (optional)
* encryptedApplicationProfile (optional)
"""
def __init__(self, artifacts_profile):
if not textutil.is_str_empty(artifacts_profile):
self.__dict__.update(parse_json(artifacts_profile))
def is_on_hold(self):
# hasattr() is not available in Python 2.6
if 'onHold' in self.__dict__:
return str(self.onHold).lower() == 'true' # pylint: disable=E1101
return False