func()

in pkg/providers/instance/instance.go [76:151]


// Create creates an agent pool named after nodeClaim and returns the Instance
// resolved from that agent pool.
//
// The nodeClaim name is used verbatim as the agent pool name and must match
// AgentPoolNameRegex. The VM size is the first value of the
// "node.kubernetes.io/instance-type" requirement in nodeClaim.Spec.Requirements.
//
// If the create call fails because another create operation is already in
// progress for the pool, the locally built agent pool object is used instead
// and Create proceeds to wait for the node. After the agent pool is available,
// Create resolves it to an Instance; when no Instance is found yet it polls up
// to 15 more times, roughly one second apart, stopping early if ctx is done.
//
// It returns an error when the name is invalid, when no instance type
// requirement is present, when the agent pool cannot be built or created, or
// when the Instance cannot be resolved within the polling budget.
func (p *Provider) Create(ctx context.Context, nodeClaim *karpenterv1.NodeClaim) (*Instance, error) {
	klog.InfoS("Instance.Create", "nodeClaim", klog.KObj(nodeClaim))

	// We make a strong assumption here: the nodeClaim name must be a valid agent pool name without "-".
	apName := nodeClaim.Name
	if !AgentPoolNameRegex.MatchString(apName) {
		// https://learn.microsoft.com/en-us/troubleshoot/azure/azure-kubernetes/aks-common-issues-faq#what-naming-restrictions-are-enforced-for-aks-resources-and-parameters-
		return nil, fmt.Errorf("agentpool name(%s) is invalid, must match regex pattern: ^[a-z][a-z0-9]{0,11}$", apName)
	}

	var ap *armcontainerservice.AgentPool
	// The retriable predicate rejects every error, so the closure below runs
	// exactly once; the wrapper only keeps a hook for a future retry policy.
	err := retry.OnError(retry.DefaultBackoff, func(error) bool {
		return false
	}, func() error {
		instanceTypes := scheduling.NewNodeSelectorRequirementsWithMinValues(nodeClaim.Spec.Requirements...).Get("node.kubernetes.io/instance-type").Values()
		if len(instanceTypes) == 0 {
			return fmt.Errorf("nodeClaim spec has no requirement for instance type")
		}

		vmSize := instanceTypes[0]
		apObj, apErr := newAgentPoolObject(vmSize, nodeClaim)
		if apErr != nil {
			return apErr
		}

		logging.FromContext(ctx).Debugf("creating Agent pool %s (%s)", apName, vmSize)
		var createErr error
		ap, createErr = createAgentPool(ctx, p.azClient.agentPoolsClient, p.resourceGroup, apName, p.clusterName, apObj)
		if createErr != nil {
			if strings.Contains(createErr.Error(), "Operation is not allowed because there's an in progress create node pool operation") {
				// When gpu-provisioner restarts after a crash, the agent pool
				// creation may still be in progress on the service side. In that
				// case we only need to wait for the node to become ready, based
				// on the locally built apObj.
				ap = &apObj
				return nil
			}
			logging.FromContext(ctx).Errorf("failed to create agent pool for nodeclaim(%s), %v", nodeClaim.Name, createErr)
			return fmt.Errorf("agentPool.BeginCreateOrUpdate for %q failed: %w", apName, createErr)
		}

		// Log arguments are evaluated eagerly, so guard the dereference: a
		// response without an ID must not panic the provisioner.
		if ap != nil && ap.ID != nil {
			logging.FromContext(ctx).Debugf("created agent pool %s", *ap.ID)
		} else {
			logging.FromContext(ctx).Debugf("created agent pool %s", apName)
		}
		return nil
	})
	if err != nil {
		return nil, err
	}

	instance, err := p.fromRegisteredAgentPoolToInstance(ctx, ap)
	if err != nil || instance != nil {
		return instance, err
	}

	// A nil instance with a nil error means the node object has not been found
	// yet, so we wait until the node is created.
	b := wait.Backoff{
		Steps:    15,
		Duration: 1 * time.Second,
		Factor:   1.0,
		Jitter:   0.1,
	}

	err = retry.OnError(b, func(error) bool {
		// Keep retrying on any error, but give up as soon as the caller's
		// context is cancelled or past its deadline instead of burning the
		// remaining attempts.
		return ctx.Err() == nil
	}, func() error {
		var e error
		instance, e = p.fromRegisteredAgentPoolToInstance(ctx, ap)
		if e != nil {
			return e
		}
		if instance == nil {
			return fmt.Errorf("fail to find the node object")
		}
		return nil
	})
	if err != nil {
		return nil, err
	}
	return instance, nil
}