#!/usr/bin/env python3
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This script can be used to prepare a bundle of files describing the state of a
GKE cluster.
Usage examples:
* Using default kubeconfig:
$ python3 create_snapshot.py
* Using selected config from a file:
$ python3 create_snapshot.py --kubeconfig /tmp/kubeconfig
* Specifying timeout for the kubectl calls (default is 15 seconds):
$ python3 create_snapshot.py --timeout 10
Output:
snapshot-{timestamp}.tar.gz file containing outputs of various kubectl commands
that were executed. The file is created in the current working directory.
Requirements:
* Python 3.8 (but should also work with older versions of Python 3)
* kubectl available through $PATH
* gcloud and gsutil available through $PATH when using --upload-to
"""
import argparse
import os
import pathlib
import subprocess
import tarfile
import tempfile
import time
CMD_TIMEOUT_SEC = 15
BACKOFF_LIMIT = 4
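# Cluster-scoped kubectl commands, executed once per snapshot.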
KUBECTL_GLOBAL_CMDS = [
'kubectl version {kubeconfig_arg} --request-timeout {timeout}',
'kubectl cluster-info {kubeconfig_arg} --request-timeout {timeout}',
'kubectl get clusterroles -o wide {kubeconfig_arg} --request-timeout {timeout}', # noqa: E501
'kubectl get clusterrolebindings -o wide {kubeconfig_arg} --request-timeout {timeout}', # noqa: E501
'kubectl get crd -o wide {kubeconfig_arg} --request-timeout {timeout}',
'kubectl get nodes -o wide {kubeconfig_arg} --request-timeout {timeout}',
'kubectl get clusterroles -o yaml {kubeconfig_arg} --request-timeout {timeout}', # noqa: E501
'kubectl get clusterrolebindings -o yaml {kubeconfig_arg} --request-timeout {timeout}', # noqa: E501
'kubectl get crd -o yaml {kubeconfig_arg} --request-timeout {timeout}',
'kubectl get nodes -o yaml {kubeconfig_arg} --request-timeout {timeout}',
'kubectl describe clusterroles {kubeconfig_arg} --request-timeout {timeout}', # noqa: E501
'kubectl describe clusterrolebindings {kubeconfig_arg} --request-timeout {timeout}', # noqa: E501
'kubectl describe crd {kubeconfig_arg} --request-timeout {timeout}',
'kubectl describe nodes {kubeconfig_arg} --request-timeout {timeout}',
'kubectl get validatingwebhookconfigurations -o wide {kubeconfig_arg} --request-timeout {timeout}', # noqa: E501
'kubectl get validatingwebhookconfigurations -o yaml {kubeconfig_arg} --request-timeout {timeout}', # noqa: E501
'kubectl get mutatingwebhookconfigurations -o wide {kubeconfig_arg} --request-timeout {timeout}', # noqa: E501
'kubectl get mutatingwebhookconfigurations -o yaml {kubeconfig_arg} --request-timeout {timeout}', # noqa: E501
]
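# kubectl commands executed once for every namespace in the cluster.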
KUBECTL_PER_NS_CMDS = [
'kubectl get all -o wide {kubeconfig_arg} --request-timeout {timeout} --namespace {namespace}', # noqa: E501
'kubectl get all -o yaml {kubeconfig_arg} --request-timeout {timeout} --namespace {namespace}', # noqa: E501
'kubectl describe all {kubeconfig_arg} --request-timeout {timeout} --namespace {namespace}', # noqa: E501
]
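# kubectl commands executed for every container of every pod.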
KUBECTL_PER_POD_CMDS = [
'kubectl logs {kubeconfig_arg} {pod} --container {container} --request-timeout {timeout} --namespace {namespace}', # noqa: E501
]
def parse_args():
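    """Parse command-line flags.

    When --upload-to is set, also validates the service account key and the
    target Cloud Storage bucket before returning.

    Returns a (kubeconfig, timeout, bucket) tuple.
    """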
parser = argparse.ArgumentParser(
        description='Create a snapshot of important information about an'
                    ' Anthos Kubernetes cluster to be used by GCP support.')
parser.add_argument('--kubeconfig',
dest='kubeconfig',
action='store',
default=os.getenv('KUBECONFIG', ''),
                        help='Path to kubeconfig file to be used to gather'
                             ' the snapshot')
parser.add_argument('--timeout', dest='timeout', action='store',
default=CMD_TIMEOUT_SEC, type=int,
                        help='Timeout in seconds for kubectl commands.')
parser.add_argument('--upload-to',
action="store",
dest='bucket',
default='',
                        help='Upload the snapshot to an existing or new'
                             ' Cloud Storage Bucket.')
parser.add_argument('--service-account-key-file',
dest='sa_keyfile',
                        help='Path to service account key file for the'
                             ' snapshot Cloud Storage Bucket.')
args = parser.parse_args()
if args.bucket:
if not args.sa_keyfile:
            parser.error('When uploading, --service-account-key-file'
                         ' must be provided.')
upload_preflight_return_code = upload_preflight(args.sa_keyfile,
args.bucket)
        if upload_preflight_return_code:
            parser.exit(status=upload_preflight_return_code)
return args.kubeconfig, args.timeout, args.bucket
def upload_preflight(sa_keyfile: str, bucket: str): # noqa: E999
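    """Authenticate with the service account key and ensure the target
    Cloud Storage bucket exists, creating it (with 30-day retention) if
    it does not.

    Returns 0 on success, a non-zero return code otherwise.
    """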
    auth_cmd = ('gcloud auth activate-service-account'
                ' --key-file {}'.format(sa_keyfile))
create_cmd = 'gsutil mb -c standard --retention 30d gs://{}'.format(bucket)
list_bucket_cmd = 'gsutil ls -b gs://{}'.format(bucket)
print('Authenticating as service account from key file... ', end='')
auth_return_code, auth_output = run_gsutil_cmd(auth_cmd)
if not auth_return_code:
print('[ SUCCESS ]')
        print('Checking if Cloud Storage Bucket exists... ', end='')
list_bucket_return_code, _ = run_gsutil_cmd(list_bucket_cmd)
if not list_bucket_return_code:
print('[ SUCCESS ]')
            # listing the bucket was successful
return list_bucket_return_code
else:
print('[ FAIL ]')
print(('Creating {}... '.format(bucket)), end='')
create_return_code, create_output = run_gsutil_cmd(create_cmd)
if not create_return_code:
print('[ SUCCESS ]')
# creating the bucket was successful
return create_return_code
else:
print('[ FAIL ]')
print(create_output)
return create_return_code
else:
print('[ FAIL ]')
print(auth_output)
return auth_return_code
def upload_file(bucket: str, snap_file: str): # noqa: E999
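    """Upload the snapshot archive to the given Cloud Storage bucket."""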
bucket_path = 'gs://{}'.format(bucket)
upload_cmd = 'gsutil cp {} {}'.format(snap_file, bucket_path)
print('Uploading snapshot to bucket... ', end='')
upload_return_code, upload_output = run_gsutil_cmd(upload_cmd)
if upload_return_code:
print('[ FAIL ]')
print(upload_output)
return upload_return_code
print('[ DONE ]')
print(upload_output)
def run_gsutil_cmd(command: str): # noqa: E999
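    """Run a gcloud/gsutil command, return a (returncode, output) tuple."""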
try:
process = subprocess.run(command, shell=True, capture_output=True)
if process.returncode:
return process.returncode, process.stderr.decode('utf-8')
return process.returncode, process.stdout.decode('utf-8')
except subprocess.SubprocessError as e:
return 1, e
def run_cmd(cmd: str, subfolder: str, output_dir: pathlib.Path): # noqa: E999
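    """Run a kubectl command and write its output into output_dir/subfolder.

    Failed commands are retried with exponential backoff, up to BACKOFF_LIMIT
    retries.
    """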
output_path = output_dir / subfolder / cmd.replace(' ', '_')
output_path.parent.mkdir(parents=True, exist_ok=True)
print("Executing: {}... ".format(cmd), end='')
backoff_timer = 1
backoff_count = 0
with open(output_path, mode='w') as output_file:
while True:
if backoff_count > BACKOFF_LIMIT:
print('[ FAIL ]')
return
process = subprocess.run(cmd, stdout=output_file,
stderr=output_file,
timeout=60,
shell=True)
if not process.returncode:
print("[ DONE ]")
return
print("\nCommand failed, trying again in {}s. ' \
'Error output: {}".format(backoff_timer, process.stderr))
time.sleep(backoff_timer)
backoff_timer *= 2
backoff_count += 1
def get_kubectl_list(object_type, kubeconfig, timeout, namespace=None,
object_name='', jsonpath="{.items[*].metadata.name}"):
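    """Return a list of object names (or jsonpath values) for the given type.

    Optionally scoped to a namespace; retried with exponential backoff.
    """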
cmd = 'kubectl get {obj_type} {kubeconfig_arg} ' \
'--request-timeout {timeout} ' \
'-o jsonpath="{jsonpath}" {obj_name}'. \
format(kubeconfig_arg=kubeconfig,
jsonpath=jsonpath,
timeout=timeout,
obj_type=object_type, obj_name=object_name)
if namespace:
cmd = "{} -n {}".format(cmd, namespace)
backoff_timer = 1
backoff_count = 0
print("Executing: {}... ".format(cmd), end='')
while True:
if backoff_count > BACKOFF_LIMIT:
print('[ FAIL ]')
return
process = subprocess.run(cmd, shell=True,
capture_output=True)
if not process.returncode:
print("[ DONE ]")
obj_list = process.stdout.decode().strip().split(' ')
if '' in obj_list:
obj_list.remove('')
return obj_list
print("\nCommand failed, trying again in {}s. ' \
'Error output: {}".format(backoff_timer, process.stderr))
time.sleep(backoff_timer)
backoff_timer *= 2
backoff_count += 1
def main():
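    """Collect kubectl outputs, archive them as a tar.gz snapshot and
    optionally upload the archive to a Cloud Storage bucket.
    """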
kubeconfig, timeout, bucket = parse_args()
timeout = "{}s".format(timeout)
if kubeconfig:
kubeconfig = \
'--kubeconfig {}'.format(pathlib.Path(kubeconfig).absolute())
namespaces_list = get_kubectl_list('namespaces', kubeconfig, timeout)
with tempfile.TemporaryDirectory() as tmp_dir:
        output_dir = pathlib.Path(tmp_dir)
for cmd in KUBECTL_GLOBAL_CMDS:
run_cmd(
cmd.format(kubeconfig_arg=kubeconfig, timeout=timeout),
'global', output_dir
)
for namespace in namespaces_list:
for cmd in KUBECTL_PER_NS_CMDS:
run_cmd(
cmd.format(
kubeconfig_arg=kubeconfig,
timeout=timeout,
namespace=namespace
),
'namespaces/{}'.format(namespace),
output_dir
)
for pod in get_kubectl_list('pods',
kubeconfig, timeout, namespace):
containers = get_kubectl_list('pod',
kubeconfig, timeout,
namespace=namespace,
jsonpath="{.spec.containers[*].name}", # noqa: E501
object_name=pod)
for container in containers:
for cmd in KUBECTL_PER_POD_CMDS:
run_cmd(
cmd.format(kubeconfig_arg=kubeconfig,
timeout=timeout,
namespace=namespace,
pod=pod,
container=container),
'namespaces/{}/{}'.format(namespace, pod),
output_dir
)
# Commands done
snapshot_name = 'snapshot-{}'.format(int(time.time()))
snap_file = tarfile.open('{}.tar.gz'.format(snapshot_name), 'w:gz')
snap_file.add(output_dir, snapshot_name)
snap_file.close()
print("Created snapshot: {}".format(snap_file.name))
if bucket:
upload_file(bucket, snap_file.name)
if __name__ == '__main__':
main()