cmd/get_logs.go (412 lines of code) (raw):
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
package cmd
import (
"context"
"fmt"
"net/url"
"os"
"path"
"path/filepath"
"regexp"
"strings"
"time"
"golang.org/x/text/cases"
"golang.org/x/text/language"
"github.com/Azure/aks-engine-azurestack/pkg/api"
"github.com/Azure/aks-engine-azurestack/pkg/helpers"
"github.com/Azure/aks-engine-azurestack/pkg/helpers/ssh"
"github.com/Azure/aks-engine-azurestack/pkg/i18n"
"github.com/Azure/aks-engine-azurestack/pkg/kubernetes"
"github.com/Azure/azure-storage-blob-go/azblob"
"github.com/leonelquinteros/gotext"
"github.com/pkg/errors"
log "github.com/sirupsen/logrus"
"github.com/spf13/cobra"
)
const (
	// Command name and help text for the "get-logs" subcommand.
	getLogsName             = "get-logs"
	getLogsShortDescription = "Collect logs and current cluster nodes configuration."
	getLogsLongDescription  = "Collect deployment logs, running daemons/services logs and current nodes configuration."
)

const (
	// Log collection script baked into the aks-ubuntu VHD image.
	getLogsLinuxVHDScriptPath = "/opt/azure/containers/collect-logs.sh"
	// Remote upload destination for a user-supplied Linux script (--linux-script).
	getLogsCustomLinuxScriptPath = "/tmp/collect-logs.sh"
	// Log collection script baked into the aks-windows VHD image.
	getLogsWindowsVHDScriptPath = "c:\\k\\debug\\collect-windows-logs.ps1"
	// Remote upload destination for a user-supplied Windows script (--windows-script).
	getLogsCustomWindowsScriptPath = "$env:temp\\collect-windows-logs.ps1"
	// Per-node deadline for uploading collected logs to the SAS URL.
	getLogsUploadTimeout = 300 * time.Second
)
// getLogsCmd carries the user-supplied flags of the get-logs command and the
// state computed from them while the command runs.
type getLogsCmd struct {
	// user input
	location               string   // Azure region the cluster is deployed to (--location)
	apiModelPath           string   // path to the generated apimodel.json (--api-model)
	sshHostURI             string   // FQDN/IP of an SSH listener reaching all nodes (--ssh-host)
	linuxSSHPrivateKeyPath string   // private key for the cluster's Linux nodes (--linux-ssh-private-key)
	linuxScriptPath        string   // optional custom Linux collection script (--linux-script)
	windowsScriptPath      string   // optional custom Windows collection script (--windows-script)
	outputDirectory        string   // local destination for downloaded logs (--output-directory)
	controlPlaneOnly       bool     // restrict collection to control plane VMs (--control-plane-only)
	uploadSASURL           string   // optional storage account SAS URL for uploads (--upload-sas-url)
	nodeNames              []string // explicit VM name list to collect from (--vm-names)
	// computed
	cs                  *api.ContainerService // parsed api model
	locale              *gotext.Locale        // translations for error messages
	linuxAuthConfig     *ssh.AuthConfig       // SSH auth for Linux nodes
	linuxVHDScript      *ssh.RemoteFile       // VHD-baked Linux script reference
	linuxCustomScript   *ssh.RemoteFile       // user-supplied Linux script content, if any
	windowsAuthConfig   *ssh.AuthConfig       // SSH auth for Windows nodes (nil if SSH disabled)
	windowsVHDScript    *ssh.RemoteFile       // VHD-baked Windows script reference
	windowsCustomScript *ssh.RemoteFile       // user-supplied Windows script content, if any
	jumpbox             *ssh.JumpBox          // SSH jumpbox used to reach cluster nodes
}
// newGetLogsCmd builds the cobra command for "get-logs": it wires up the
// flags, validates the arguments, loads the api model and runs log collection.
func newGetLogsCmd() *cobra.Command {
	glc := getLogsCmd{}
	command := &cobra.Command{
		Use:   getLogsName,
		Short: getLogsShortDescription,
		Long:  getLogsLongDescription,
		RunE: func(cmd *cobra.Command, args []string) error {
			if err := glc.validateArgs(); err != nil {
				return errors.Wrap(err, "validating get-logs args")
			}
			if err := glc.loadAPIModel(); err != nil {
				return errors.Wrap(err, "loading API model")
			}
			// BUG FIX: this wrap previously repeated "loading API model",
			// making it impossible to tell which step actually failed.
			if err := glc.init(); err != nil {
				return errors.Wrap(err, "initializing get-logs command")
			}
			// Arguments are valid at this point; do not print usage on runtime errors.
			cmd.SilenceUsage = true
			return glc.run()
		},
	}
	command.Flags().StringVarP(&glc.location, "location", "l", "", "Azure location where the cluster is deployed (required)")
	command.Flags().StringVarP(&glc.apiModelPath, "api-model", "m", "", "path to the generated apimodel.json file (required)")
	command.Flags().StringVar(&glc.sshHostURI, "ssh-host", "", "FQDN, or IP address, of an SSH listener that can reach all nodes in the cluster (required)")
	command.Flags().StringVar(&glc.linuxSSHPrivateKeyPath, "linux-ssh-private-key", "", "path to a valid private SSH key to access the cluster's Linux nodes (required)")
	command.Flags().StringVar(&glc.linuxScriptPath, "linux-script", "", "path to the log collection script to execute on the cluster's Linux nodes (required if distro is not aks-ubuntu-18.04)")
	command.Flags().StringVar(&glc.windowsScriptPath, "windows-script", "", "path to the log collection script to execute on the cluster's Windows nodes (required if distro is not aks-windows)")
	command.Flags().StringVarP(&glc.outputDirectory, "output-directory", "o", "", "collected logs destination directory, derived from --api-model if missing")
	command.Flags().BoolVarP(&glc.controlPlaneOnly, "control-plane-only", "", false, "get logs from control plane VMs only")
	command.Flags().StringVarP(&glc.uploadSASURL, "upload-sas-url", "", "", "Azure Storage Account SAS URL to upload the collected logs")
	command.Flags().StringSliceVar(&glc.nodeNames, "vm-names", nil, "get logs from the VM name list only (comma-separated names)")
	_ = command.MarkFlagRequired("location")
	_ = command.MarkFlagRequired("api-model")
	_ = command.MarkFlagRequired("ssh-host")
	_ = command.MarkFlagRequired("linux-ssh-private-key")
	return command
}
// validateArgs checks the command line flags, normalizes the Azure region
// name, derives the output directory from --api-model when one was not
// provided, and validates the upload SAS URL shape.
func (glc *getLogsCmd) validateArgs() (err error) {
	if glc.locale, err = i18n.LoadTranslations(); err != nil {
		return errors.Wrap(err, "loading translation files")
	}
	glc.location = helpers.NormalizeAzureRegion(glc.location)
	if glc.location == "" {
		return errors.New("--location must be specified")
	}
	if glc.sshHostURI == "" {
		return errors.New("--ssh-host must be specified")
	}
	if glc.apiModelPath == "" {
		return errors.New("--api-model must be specified")
	} else if _, err := os.Stat(glc.apiModelPath); os.IsNotExist(err) {
		return errors.Errorf("specified --api-model does not exist (%s)", glc.apiModelPath)
	}
	if glc.linuxSSHPrivateKeyPath == "" {
		return errors.New("--linux-ssh-private-key must be specified")
	} else if _, err := os.Stat(glc.linuxSSHPrivateKeyPath); os.IsNotExist(err) {
		return errors.Errorf("specified --linux-ssh-private-key does not exist (%s)", glc.linuxSSHPrivateKeyPath)
	}
	if glc.linuxScriptPath != "" {
		if _, err := os.Stat(glc.linuxScriptPath); os.IsNotExist(err) {
			return errors.Errorf("specified --linux-script does not exist (%s)", glc.linuxScriptPath)
		}
	}
	if glc.windowsScriptPath != "" {
		if _, err := os.Stat(glc.windowsScriptPath); os.IsNotExist(err) {
			return errors.Errorf("specified --windows-script does not exist (%s)", glc.windowsScriptPath)
		}
	}
	if glc.outputDirectory == "" {
		// BUG FIX: use filepath.Join (OS-aware separator) for a local
		// filesystem path; path.Join always uses '/' and is meant for URLs.
		glc.outputDirectory = filepath.Join(filepath.Dir(glc.apiModelPath), "_logs")
		if err := os.MkdirAll(glc.outputDirectory, 0755); err != nil {
			return errors.Errorf("error creating output directory (%s)", glc.outputDirectory)
		}
	}
	if glc.uploadSASURL != "" {
		// The pattern is a valid constant, so MustCompile cannot panic here;
		// the previous Compile error branch was dead code.
		exp := regexp.MustCompile(`^/\w+`)
		sasURL, err := url.ParseRequestURI(glc.uploadSASURL)
		if err != nil {
			return errors.Errorf("error parsing upload SAS URL")
		}
		// A container name must follow the blob service host.
		if !exp.MatchString(sasURL.Path) {
			return errors.New("invalid upload SAS URL format, expected 'https://{blob-service-uri}/{container-name}?{sas-token}'")
		}
	}
	if glc.nodeNames != nil && len(glc.nodeNames) == 0 {
		return errors.New("--vm-names cannot be empty")
	}
	if glc.nodeNames != nil && glc.controlPlaneOnly {
		return errors.New("--control-plane-only and --vm-names are mutually exclusive")
	}
	return nil
}
// loadAPIModel parses the api-model file into glc.cs and, for custom (Azure
// Stack) clouds, writes and applies the custom cloud profile. It also
// reconciles the --location flag against the location stored in the model.
func (glc *getLogsCmd) loadAPIModel() (err error) {
	apiloader := &api.Apiloader{
		Translator: &i18n.Translator{
			Locale: glc.locale,
		},
	}
	if glc.cs, _, err = apiloader.LoadContainerServiceFromFile(glc.apiModelPath, false, false, nil); err != nil {
		return errors.Wrap(err, "error parsing api-model")
	}
	if glc.cs.Properties.IsCustomCloudProfile() {
		if err = writeCustomCloudProfile(glc.cs); err != nil {
			return errors.Wrap(err, "error writing custom cloud profile")
		}
		// BUG FIX: this wrap previously said "error parsing the api model",
		// duplicating the message above and hiding the failing step.
		if err = glc.cs.Properties.SetCustomCloudSpec(api.AzureCustomCloudSpecParams{IsUpgrade: false, IsScale: true}); err != nil {
			return errors.Wrap(err, "error setting custom cloud spec")
		}
	}
	// The api model's location wins only when it agrees with the flag;
	// an empty model location is filled in from --location.
	if glc.cs.Location == "" {
		glc.cs.Location = glc.location
	} else if glc.cs.Location != glc.location {
		return errors.New("--location flag does not match api-model location")
	}
	return
}
// init prepares the SSH authentication configuration, the jumpbox, and the
// remote script references (VHD-baked and user-supplied) for both operating
// systems. Windows auth is only configured when the profile enables SSH.
func (glc *getLogsCmd) init() (err error) {
	glc.linuxVHDScript = &ssh.RemoteFile{Path: getLogsLinuxVHDScriptPath}
	glc.windowsVHDScript = &ssh.RemoteFile{Path: getLogsWindowsVHDScriptPath}

	if glc.linuxScriptPath != "" {
		content, readErr := os.ReadFile(glc.linuxScriptPath)
		if readErr != nil {
			return errors.Wrapf(readErr, "error reading log collection script %s", glc.linuxScriptPath)
		}
		glc.linuxCustomScript = &ssh.RemoteFile{
			Path:        getLogsCustomLinuxScriptPath,
			Permissions: "744",
			Owner:       "root:root",
			Content:     content,
		}
	}
	if glc.windowsScriptPath != "" {
		content, readErr := os.ReadFile(glc.windowsScriptPath)
		if readErr != nil {
			return errors.Wrapf(readErr, "error reading log collection script %s", glc.windowsScriptPath)
		}
		glc.windowsCustomScript = &ssh.RemoteFile{
			Path:        getLogsCustomWindowsScriptPath,
			Permissions: "",
			Owner:       "",
			Content:     content,
		}
	}

	glc.linuxAuthConfig = &ssh.AuthConfig{
		User:           glc.cs.Properties.LinuxProfile.AdminUsername,
		PrivateKeyPath: glc.linuxSSHPrivateKeyPath,
	}
	if wp := glc.cs.Properties.WindowsProfile; wp != nil {
		if wp.GetSSHEnabled() {
			glc.windowsAuthConfig = &ssh.AuthConfig{
				User:     wp.AdminUsername,
				Password: wp.AdminPassword,
			}
		} else {
			log.Warn("Skipping Windows nodes as SSH is not enabled")
		}
	}
	glc.jumpbox = &ssh.JumpBox{
		URI:             glc.sshHostURI,
		Port:            22,
		OperatingSystem: api.Linux,
		AuthConfig:      glc.linuxAuthConfig,
	}
	return
}
// run collects logs from every target node and, when an upload SAS URL was
// provided, uploads each node's archive. Upload failures are non-fatal.
func (glc *getLogsCmd) run() error {
	kubeClient, err := getKubeClient(glc.cs, 10*time.Second, 10*time.Minute)
	if err != nil {
		return errors.Wrap(err, "creating Kubernetes client")
	}
	nodes := getClusterNodes(glc, kubeClient)
	nodeScripts := getClusterNodeScripts(glc, nodes)
	if len(nodeScripts) == 0 {
		log.Info("All nodes skipped")
		return nil
	}
	for node, script := range nodeScripts {
		if err := collectLogs(glc, node, script); err != nil {
			return err
		}
	}
	log.Infof("Logs downloaded to %s", glc.outputDirectory)
	if glc.uploadSASURL != "" {
		for node := range nodeScripts {
			// Best-effort: warn and keep going so one failed upload does not
			// abort the remaining nodes.
			if err := uploadLogs(node, glc.outputDirectory, glc.uploadSASURL); err != nil {
				log.Warnf("Error uploading %s logs", node.URI)
				log.Debugf("Error: %s", err)
			}
		}
	}
	// BUG FIX: previously returned the last upload error even though upload
	// failures are handled (logged) above, making the command fail
	// nondeterministically depending on map iteration order.
	return nil
}
// getClusterNodes returns the target node list. Precedence: an explicit
// --vm-names list, then the apiserver's node list, and finally (if the
// apiserver is unreachable) the control plane VMs derived from the master
// profile.
func getClusterNodes(glc *getLogsCmd, kubeClient kubernetes.NodeLister) (nodes []*ssh.RemoteHost) {
	if glc.nodeNames != nil {
		for _, nodeName := range glc.nodeNames {
			// Names with the default orchestrator prefix (e.g. "k8s-") are
			// assumed to be Linux VMs; everything else is treated as Windows.
			if strings.HasPrefix(nodeName, api.DefaultOrchestratorName) {
				log.Infof("Treating node %s as a Linux agent node", nodeName)
				nodes = append(nodes, &ssh.RemoteHost{
					URI: nodeName, Port: 22, OperatingSystem: api.Linux, AuthConfig: glc.linuxAuthConfig, Jumpbox: glc.jumpbox})
			} else {
				log.Infof("Treating node %s as a Windows agent node", nodeName)
				if glc.windowsAuthConfig != nil {
					nodes = append(nodes, &ssh.RemoteHost{
						URI: nodeName, Port: 22, OperatingSystem: api.Windows, AuthConfig: glc.windowsAuthConfig, Jumpbox: glc.jumpbox})
				} else {
					log.Infof("Skipping node %s, WindowsProfile was not provided", nodeName)
				}
			}
		}
		return nodes
	}
	nodeList, err := kubeClient.ListNodes()
	if err != nil {
		// Fall back to the control plane VM names computed from the profile.
		log.Warnf("Error retrieving node list from apiserver: %s", err)
		log.Info("Collecting logs from control plane nodes only")
		for i := 0; i < glc.cs.Properties.MasterProfile.Count; i++ {
			name := fmt.Sprintf("%s%d", glc.cs.Properties.GetMasterVMPrefix(), i)
			nodes = append(nodes, &ssh.RemoteHost{
				URI: name, Port: 22, OperatingSystem: api.Linux, AuthConfig: glc.linuxAuthConfig, Jumpbox: glc.jumpbox})
		}
		return nodes
	}
	// PERF FIX: the title caser is loop-invariant; build it once instead of
	// once per node.
	caser := cases.Title(language.English)
	for _, node := range nodeList.Items {
		if isMasterNode(node.Name, glc.cs.Properties.GetMasterVMPrefix()) || !glc.controlPlaneOnly {
			switch api.OSType(caser.String(node.Status.NodeInfo.OperatingSystem)) {
			case api.Linux:
				nodes = append(nodes, &ssh.RemoteHost{
					URI: node.Name, Port: 22, OperatingSystem: api.Linux, AuthConfig: glc.linuxAuthConfig, Jumpbox: glc.jumpbox})
			case api.Windows:
				// Windows nodes are silently skipped when SSH is not enabled
				// (windowsAuthConfig is nil); init() already warned about it.
				if glc.windowsAuthConfig != nil {
					nodes = append(nodes, &ssh.RemoteHost{
						URI: node.Name, Port: 22, OperatingSystem: api.Windows, AuthConfig: glc.windowsAuthConfig, Jumpbox: glc.jumpbox})
				}
			default:
				log.Infof("Skipping node %s, could not determine operating system", node.Name)
			}
		}
	}
	return nodes
}
// getClusterNodeScripts maps each target node to the log collection script it
// should run. A user-supplied script always wins; otherwise VHD-baked scripts
// are used for VHD distros. Nodes with no applicable script are skipped (with
// a warning per pool / for Windows).
func getClusterNodeScripts(glc *getLogsCmd, nodes []*ssh.RemoteHost) map[*ssh.RemoteHost]*ssh.RemoteFile {
	nodeScript := make(map[*ssh.RemoteHost]*ssh.RemoteFile)
	poolHasScript := make(map[string]bool)
	isWindowsSkipped := false
	for _, node := range nodes {
		switch node.OperatingSystem {
		case api.Linux:
			if isMasterNode(node.URI, glc.cs.Properties.GetMasterVMPrefix()) && glc.cs.Properties.MasterProfile.IsVHDDistro() {
				nodeScript[node] = glc.linuxVHDScript
			} else {
				for i, pool := range glc.cs.Properties.AgentPoolProfiles {
					if pool.IsVHDDistro() && glc.cs.Properties.IsAgentPoolMember(node.URI, pool, i) {
						nodeScript[node] = glc.linuxVHDScript
					}
				}
			}
			// A custom script overrides the VHD-baked one.
			if glc.linuxCustomScript != nil {
				nodeScript[node] = glc.linuxCustomScript
			}
			_, ok := nodeScript[node]
			// BUG FIX: indexing strings.Split(...)[1] panicked when a node
			// name (e.g. from --vm-names) contained no '-'; fall back to the
			// full name as the pool key in that case.
			poolName := node.URI
			if parts := strings.Split(node.URI, "-"); len(parts) > 1 {
				poolName = parts[1]
			}
			poolHasScript[poolName] = ok
		case api.Windows:
			if glc.cs.Properties.WindowsProfile != nil && glc.cs.Properties.WindowsProfile.IsVHDDistro() {
				nodeScript[node] = glc.windowsVHDScript
			}
			if glc.windowsCustomScript != nil {
				nodeScript[node] = glc.windowsCustomScript
			}
			if _, ok := nodeScript[node]; !ok {
				isWindowsSkipped = true
			}
		}
	}
	for pool, hasScript := range poolHasScript {
		if !hasScript {
			log.Warnf("Skipping node pool '%s' as flag '--linux-script' is not set and the pool distro is not aks-ubuntu-18.04", pool)
		}
	}
	if isWindowsSkipped {
		log.Warn("Skipping Windows nodes as flag '--windows-script' is not set and the profile distro is not aks-windows")
	}
	return nodeScript
}
// collectLogs uploads the log collection script (if it has custom content),
// executes it on the node, and downloads the resulting zip archive into the
// local output directory.
func collectLogs(glc *getLogsCmd, node *ssh.RemoteHost, script *ssh.RemoteFile) error {
	// NOTE(review): a single 10s deadline covers script upload, execution and
	// archive download — confirm this is long enough on slow clusters.
	ctx, cancel := context.WithTimeout(context.Background(), time.Second*10)
	defer cancel()
	log.Infof("Processing node: %s", node.URI)
	// Only user-supplied scripts carry content; VHD-baked scripts already
	// exist on the node.
	if script.Content != nil {
		if stdout, err := ssh.CopyToRemote(ctx, node, script); err != nil {
			return errors.Wrap(err, stdout)
		}
	}
	isAzureStack := glc.cs.Properties.IsAzureStackCloud()
	stdout, err := ssh.ExecuteRemote(ctx, node, collectLogsScript(script, node.OperatingSystem, isAzureStack))
	if err != nil {
		return errors.Wrap(err, stdout)
	}
	src := fileToDownload(node.OperatingSystem, node.URI)
	// BUG FIX: use filepath.Join for the local destination path so the
	// separator is correct on all host platforms (path.Join is for URLs).
	dst := filepath.Join(glc.outputDirectory, fmt.Sprintf("%s.zip", node.URI))
	if stdout, err = ssh.CopyFromRemote(ctx, node, src, dst); err != nil {
		return errors.Wrap(err, stdout)
	}
	return nil
}
// uploadLogs uploads a node's collected log archive to the Azure storage
// container identified by uploadSASURL, as blob "<node>.zip".
func uploadLogs(node *ssh.RemoteHost, outputDirectory, uploadSASURL string) error {
	log.Infof("Uploading %s logs", node.URI)
	ctx, cancel := context.WithTimeout(context.Background(), getLogsUploadTimeout)
	defer cancel()
	// BUG FIX: use filepath.Join for the local file path (path.Join is for
	// URL/slash-separated paths).
	fp := filepath.Join(outputDirectory, fmt.Sprintf("%s.zip", node.URI))
	f, err := os.Open(fp)
	if err != nil {
		return errors.Wrapf(err, "reading file %s", fp)
	}
	// BUG FIX: the file handle was never closed, leaking one descriptor per
	// uploaded node.
	defer f.Close()
	sas, err := url.Parse(uploadSASURL)
	if err != nil {
		return errors.Wrap(err, "parsing upload SAS URL")
	}
	// Blob paths are URL paths, so path.Join (forward slashes) is correct here.
	sas.Path = path.Join(sas.Path, fmt.Sprintf("%s.zip", node.URI))
	if _, err = uploadToSASURL(ctx, f, sas); err != nil {
		return err
	}
	return nil
}
// uploadToSASURL writes the contents of file to destination as an Azure block
// blob, authenticating solely through the SAS token embedded in the URL.
func uploadToSASURL(ctx context.Context, file *os.File, destination *url.URL) (azblob.CommonResponse, error) {
	// The SAS token in the URL carries the credentials, so the pipeline itself
	// is anonymous.
	pipeline := azblob.NewPipeline(azblob.NewAnonymousCredential(), azblob.PipelineOptions{})
	blockBlobURL := azblob.NewBlobURL(*destination, pipeline).ToBlockBlobURL()
	resp, err := azblob.UploadFileToBlockBlob(ctx, file, blockBlobURL, azblob.UploadToBlockBlobOptions{})
	if err != nil {
		return nil, errors.Wrap(err, "uploading to storage account")
	}
	return resp, nil
}
// collectLogsScript returns the remote command line that runs log collection
// script f on a node with the given operating system; it returns "" for an
// unknown OS. On Azure Stack, Linux scripts run with AZURE_ENV set.
func collectLogsScript(f *ssh.RemoteFile, os api.OSType, isAzureStack bool) string {
	if os == api.Linux {
		if isAzureStack {
			return fmt.Sprintf("sudo -E bash -c \"AZURE_ENV=AzureStackCloud %s\"", f.Path)
		}
		return fmt.Sprintf("sudo -E bash -c %s", f.Path)
	}
	if os == api.Windows {
		// Run the script, then copy the newest zip it produced to a
		// well-known per-node temp location for download.
		return fmt.Sprintf("powershell -command \"iex %s; ls . | Where-Object { $_.extension -eq '.zip' } | sort LastWriteTime -Descending | Select -First 1 | Copy-Item -Destination $env:temp\\$env:computername.zip\"", f.Path)
	}
	return ""
}
// fileToDownload returns the remote location of the zip archive produced by
// the log collection script for the given OS, or nil for an unknown OS.
func fileToDownload(os api.OSType, nodeName string) *ssh.RemoteFile {
	if os == api.Linux {
		return &ssh.RemoteFile{Path: "/tmp/logs.zip"}
	}
	if os == api.Windows {
		return &ssh.RemoteFile{Path: fmt.Sprintf("%%TEMP%%\\%s.zip", nodeName)}
	}
	return nil
}
// isMasterNode reports whether vmName belongs to the control plane, judged by
// the master VM name prefix.
func isMasterNode(vmName, masterPrefix string) bool {
	return strings.Index(vmName, masterPrefix) == 0
}