gke-windows-builder/builder/main.go (322 lines of code) (raw):
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package main
import (
"context"
"errors"
"flag"
"fmt"
"log"
"strings"
"sync"
"time"
"gke-windows-builder/builder/builder"
"github.com/masterzen/winrm"
"google.golang.org/api/googleapi"
)
var (
projectID = flag.String("project", "", "The project Id to use when creating the Windows Instance (uses gcloud default if not specified)")
workspacePath = flag.String("workspace-path", "/workspace", "The directory to copy data from")
workspaceBucket = flag.String("workspace-bucket", "", "The bucket to copy the directory to. Defaults to {project-id}_builder_tmp")
workspaceBucketLocation = flag.String("workspace-bucket-location", "", "The location of the bucket. Defaults to 'us' which is the GCS API default location'")
network = flag.String("network", "default", "The VPC network to use when creating the Windows Instance (uses 'default' if not specified)")
networkProject = flag.String("network-project", "", "The project where the VPC network is located (inferred if not specified).")
subnetwork = flag.String("subnetwork", "default", "The Subnetwork name to use when creating the Windows Instance")
subnetworkProject = flag.String("subnetwork-project", "", "(deprecated) The project where the Subnetwork is located (uses --network-project instead)")
region = flag.String("region", "us-central1", "The region to create the Windows Instance in (where the Subnetwork is located)")
zone = flag.String("zone", "us-central1-f", "The zone name to use when creating the Windows Instance")
labels = flag.String("labels", "", "List of label KEY=VALUE pairs separated by comma to add when creating the Windows Instance")
machineType = flag.String("machineType", "", "The machine type to use when creating the Windows Instance")
bootDiskType = flag.String("boot-disk-type", "pd-standard", "Windows instance boot disk type. Default value is pd-standard, other values include pd-ssd and pd-balanced")
bootDiskSizeGB = flag.Int64("boot-disk-size-GB", 75, "Instance boot disk size (in GB). Must be at least 40 GB")
copyTimeout = flag.Duration("copy-timeout", 5*time.Minute, "The workspace copy timeout in minutes")
serviceAccount = flag.String("serviceAccount", "default", "The service account to use when creating the Windows Instance")
containerImageName = flag.String("container-image-name", "", "The target container image:tag name")
pickedVersions = flag.String("versions", "", "List of Windows Server versions user wants to support. If not provided, the container will be built to support all Windows versions that GKE supports")
reuseBuilderInstances = flag.Bool("reuse-builder-instances", false, "Look for existing instances by labels and instance-name-prefix and reuse them for build, create new instance only if none were found. Avoid when queuing parallel builds.")
instanceNamePrefix = flag.String("instance-name-prefix", "windows-builder-", "Prefix to use for created GCE instances. Defaults to 'windows-builder-'")
testObsoleteVersion = flag.Bool("testonly-test-obsolete-versions", false, "If true, verify the obsolete Windows versions won't fail the builder. For testing purposes only")
setupTimeout = flag.Duration("setup-timeout", 20*time.Minute, "Time out to wait for Windows instance to be ready for winrm connection and Docker setup")
useInternalIP = flag.Bool("use-internal-ip", false, "Use internal IP addresses (for shared VPCs), also implies no need for firewall rules")
ExternalIP = flag.Bool("external-ip", true, "Create external IP addresses for VMs, If false then Cloud NAT must be enabled, see README for details.")
skipFirewallCheck = flag.Bool("skip-firewall-check", false, "Skip checking that the project has a firewall rule permitting WinRM ingress")
// Windows version and GCE container image family map
// Note:
// 1. Mapping between version <-> image family name, NOT specific image name
// 2. The version name need to match with servercore container version in Dockerfile file
versionMap = map[string]string{
"2004": "windows-cloud/global/images/family/windows-2004-core",
"20H2": "windows-cloud/global/images/family/windows-20h2-core",
"ltsc2019": "windows-cloud/global/images/family/windows-2019-core",
"ltsc2022": "windows-cloud/global/images/family/windows-2022-core",
}
commandTimeout = 10 * time.Minute
)
type buildArgsArray []string
var buildArgs buildArgsArray
func (i *buildArgsArray) String() string {
return "my string representation"
}
func (i *buildArgsArray) Set(value string) error {
*i = append(*i, value)
return nil
}
// builderServerStatus contains builder server and associated error.
type builderServerStatus struct {
s *builder.Server
err error
}
func main() {
log.Print("Starting Windows multi-arch container builder")
flag.Var(&buildArgs, "build-arg", "The list of parameters to pass to the docker build command")
flag.Parse()
if *containerImageName == "" {
log.Fatalf("Error container-image-name flag is required but was not set")
}
if *networkProject != "" && *subnetworkProject != "" && *networkProject != *subnetworkProject {
log.Fatalf("When both network and subnetwork projects are set, they must be identical")
}
// subnetworkProject is deprecated. If only subnetwork was set, then copy its value to networkProject
if *subnetworkProject != "" && *networkProject == "" {
*networkProject = *subnetworkProject
}
pickedVersionMap := getPickedVersionMap(*pickedVersions)
// Add obsolete 1809 version for test
if *testObsoleteVersion {
pickedVersionMap["1809"] = "windows-cloud/global/images/family/windows-1809-core-for-containers"
}
var err error
// Fetch builder project ID from metadata or gcloud command, if it's not set in flags
if *projectID == "" {
if *projectID, err = builder.GetProject(); err != nil {
log.Fatalf("Failed to get builder project ID: %+v", err)
}
}
if *workspaceBucket == "" {
*workspaceBucket = *projectID + "_builder_tmp"
}
if err = setupProjectForBuilder(context.Background()); err != nil {
log.Fatalf("Failed to setup builder project with error: %+v", err)
}
if err = process(pickedVersionMap); err != nil {
log.Fatalf("Windows multi-arch container building process failed with error: %+v", err)
}
log.Println("Windows multi-arch container building process is completed")
}
func setupProjectForBuilder(ctx context.Context) error {
var err error
if err = builder.NewGCSBucketIfNotExists(ctx, *projectID, *workspaceBucket, *workspaceBucketLocation); err != nil {
return fmt.Errorf("Failed creating bucket: %v, with error: %+v", *workspaceBucket, err)
}
if *useInternalIP {
log.Printf("Using a VM without an external IP. Make sure your build is using a worker pool connected to the specified network.")
}
if *skipFirewallCheck {
log.Printf("skipping checks that WinRM firewall rules exist")
return nil
}
return builder.CheckProjectFirewalls(ctx, builder.NewInstanceNetworkConfig(projectID, network, networkProject, subnetwork, region))
}
// Main building process
func process(pickedVersionMap map[string]string) error {
var bss []builderServerStatus
defer func() {
shutdownBuildServers(bss)
}()
if err := buildSingleArchContainers(pickedVersionMap, &bss); err != nil {
return err
}
if err := buildMultiArchContainer(pickedVersionMap, bss); err != nil {
return err
}
return nil
}
// Bring up Windows Build Servers & build single-arch containers in parallel
func buildSingleArchContainers(pickedVersionMap map[string]string, bss *[]builderServerStatus) error {
ch := make(chan builderServerStatus, len(pickedVersionMap))
wg := sync.WaitGroup{}
for ver, imageFamily := range pickedVersionMap {
wg.Add(1)
go func(ver string, imageFamily string) {
defer wg.Done()
ctx := context.Background()
ch <- buildSingleArchContainer(ctx, ver, imageFamily)
}(ver, imageFamily)
}
// Wait until all builder server statuses returned.
wg.Wait()
chLen := len(ch)
if chLen != len(pickedVersionMap) {
return fmt.Errorf("Unexpected discrepancy happened, the number of builder server statuses in channel is not equal to length of pickedVersionMap")
}
for i := 0; i < chLen; i++ {
*bss = append(*bss, <-ch)
}
// If any fatal error happens, exit the process
for _, bs := range *bss {
if bs.err != nil {
return fmt.Errorf("Error happened when building single-arch containers: %+v", bs.err)
}
}
return nil
}
// Build multi-arch container on any available server.
// If the pickedVersionMap has obsolete image version, it's still working fine, as `docker manifest create` command is resilient for non-existing containers.
// E.g. `docker manifest create container container_1909 container_2019` works if container_1909 doesn't exist. The resulting multi-arch container will have the only manifest of container_2019.
func buildMultiArchContainer(pickedVersionMap map[string]string, bss []builderServerStatus) error {
var isManifestCreated bool
for _, bs := range bss {
if bs.s != nil && !isManifestCreated {
manifestCreateCmdArgs := constructArgsOfManifestCreateCommand(pickedVersionMap)
err := createMultiArchContainerOnRemote(&bs.s.RemoteWindowsServer, *containerImageName, manifestCreateCmdArgs, commandTimeout)
if err != nil {
log.Printf("Error executing createMultiArchContainerOnRemote on instance: %v, with error: %+v", *bs.s.RemoteWindowsServer.Hostname, err)
} else {
isManifestCreated = true
}
}
}
if !isManifestCreated {
return fmt.Errorf("Failed to create the final multi-arch manifest")
}
return nil
}
func shutdownBuildServers(bss []builderServerStatus) {
if *reuseBuilderInstances {
log.Printf("Keeping instances for reuse")
wg := sync.WaitGroup{}
for _, bsc := range bss {
if bsc.s != nil {
wg.Add(1)
go func(bsc builderServerStatus) {
defer wg.Done()
bsc.s.RemoteWindowsServer.CleanFolder()
}(bsc)
}
}
wg.Wait()
return
}
log.Printf("Deleting created instances")
wg := sync.WaitGroup{}
for _, bsc := range bss {
if bsc.s != nil {
wg.Add(1)
go func(bsc builderServerStatus) {
defer wg.Done()
bsc.s.DeleteInstance()
}(bsc)
}
}
wg.Wait()
}
// Brings up a Windows Server Instance, build single-arch container and return the buider status.
// If that status's err is nil, the server is still running.
// If err is non-nil, then the server has been stopped.
// So please be aware of cleaning up the running instances after calling this function.
func buildSingleArchContainer(ctx context.Context, ver string, imageFamily string) builderServerStatus {
var s *builder.Server
var err error
netConfig := builder.NewInstanceNetworkConfig(projectID, network, networkProject, subnetwork, region)
bsc := &builder.WindowsBuildServerConfig{
InstanceNamePrefix: instanceNamePrefix,
ImageVersion: &ver,
ImageURL: &imageFamily,
Zone: zone,
NetworkConfig: netConfig,
Labels: labels,
MachineType: machineType,
BootDiskType: bootDiskType,
BootDiskSizeGB: *bootDiskSizeGB,
ServiceAccount: serviceAccount,
UseInternalIP: *useInternalIP,
ExternalNAT: *ExternalIP,
ReuseInstance: *reuseBuilderInstances,
}
if *reuseBuilderInstances {
log.Printf("Looking for an exiting %s instance to reuse", ver)
s, err = builder.FindExistingInstance(ctx, bsc, *projectID)
}
if s == nil {
s, err = builder.NewServer(ctx, bsc, *projectID)
if err != nil {
if isImageNotFoundErr(err, imageFamily) {
log.Printf("Failed to create Windows %[1]s instance, it may be expired, so skip it to continue without stamping Windows %[1]s manifest", ver)
return builderServerStatus{nil, nil}
}
return builderServerStatus{nil, err}
}
}
r := &s.RemoteWindowsServer
log.Printf("Waiting for Windows %s instance: %s (%s) to become available", ver, *r.Hostname, s.GetInstanceName())
err = r.WaitForServerBeReady(*setupTimeout)
if err != nil {
log.Printf("Error setup Windows %s instance: %s with error: %+v", ver, *r.Hostname, err)
return builderServerStatus{s, err}
}
r.WorkspaceBucket = workspaceBucket
// Copy workspace to remote machine
log.Printf("Copying local workspace to remote machine: %v", *r.Hostname)
err = r.Copy(*workspacePath, *copyTimeout)
if err != nil {
log.Printf("Error copying workspace to %v : %+v", *r.Hostname, err)
return builderServerStatus{s, err}
}
err = buildSingleArchContainerOnRemote(r, *containerImageName, ver, commandTimeout)
if err != nil {
log.Printf("Error building single arch container on remote %v : %+v", *r.Hostname, err)
return builderServerStatus{s, err}
}
return builderServerStatus{s, nil}
}
// Get the version map for picked versions
// If picked versions are empty, get the default full version map.
func getPickedVersionMap(pickedVersions string) map[string]string {
var pickedVersionMap = map[string]string{}
// If picked versions flag is not set, use the default full version map.
if pickedVersions == "" {
return versionMap
}
vers := strings.Split(pickedVersions, ",")
for _, ver := range vers {
ver = strings.TrimSpace(ver)
if ver != "" {
if versionMap[ver] == "" {
log.Fatalf("picked-versions flag has unsupported Windows Server versions: %s", ver)
}
pickedVersionMap[ver] = versionMap[ver]
}
}
if len(pickedVersionMap) == 0 {
log.Fatalf("no supported Windows Server versions found")
}
return pickedVersionMap
}
// Check if the error is image not found error.
func isImageNotFoundErr(err error, imageFamily string) bool {
var gceAPIErr *googleapi.Error
if errors.As(err, &gceAPIErr) {
// Image not found error sample:
// googleapi: Error 404: The resource 'projects/windows-cloud/global/images/family/windows-1809-core-for-containers' was not found
if gceAPIErr.Code == 404 && strings.Contains(gceAPIErr.Message, imageFamily) {
return true
}
}
return false
}
// Construct the args of `docker manifest create` cmd
// e.g. `docker manifest create demo:cloudbuild demo:cloudbuild_ltsc2019 demo:cloudbuild_1909`
func constructArgsOfManifestCreateCommand(pickedVersionMap map[string]string) string {
args := *containerImageName
for ver := range pickedVersionMap {
args += fmt.Sprint(" ", *containerImageName, "_", ver)
}
return args
}
func buildSingleArchContainerOnRemote(
r *builder.RemoteWindowsServer,
containerImageName string,
version string,
timeout time.Duration,
) error {
registry := strings.Split(containerImageName, "/")[0]
if registry == "gcr.io" {
registry = ""
}
buildargs := ""
for _, arg := range buildArgs {
buildargs += "--build-arg " + arg + " "
}
buildSingleArchContainerScript := fmt.Sprintf(`
$env:DOCKER_CLI_EXPERIMENTAL = 'enabled'
gcloud auth --quiet configure-docker %[3]s
docker build -t %[1]s_%[2]s --build-arg WINDOWS_VERSION=%[2]s %[4]s .
docker push %[1]s_%[2]s
`, containerImageName, version, registry, buildargs)
log.Printf("Start to build single-arch container with commands: %s", buildSingleArchContainerScript)
return r.RunCommand(winrm.Powershell(buildSingleArchContainerScript), *r.WorkspaceFolder, timeout)
}
// This function assumes that the remote server has already performed gcloud docker authentication.
// https://cloud.google.com/artifact-registry/docs/docker/authentication#gcloud-helper
func createMultiArchContainerOnRemote(
r *builder.RemoteWindowsServer,
containerImageName string,
manifestCreateCmdArgs string,
timeout time.Duration,
) error {
createMultiarchContainerScript := fmt.Sprintf(`
$env:DOCKER_CLI_EXPERIMENTAL = 'enabled'
docker manifest create %s
docker manifest push %s
`, manifestCreateCmdArgs, containerImageName)
log.Printf("Start to create multi-arch container with commands: %s", createMultiarchContainerScript)
return r.RunCommand(winrm.Powershell(createMultiarchContainerScript), *r.WorkspaceFolder, timeout)
}