in pkg/providers/instance/instance.go [76:151]
// Create provisions an agent pool for nodeClaim and returns the Instance built
// from it once the corresponding node object can be found.
//
// The nodeClaim name is used verbatim as the agent pool name and must match
// AgentPoolNameRegex. The VM size is the first value of the
// "node.kubernetes.io/instance-type" requirement in the nodeClaim spec; an
// error is returned when that requirement has no values.
//
// If the create call fails because a create operation for the pool is already
// in progress, the locally built agent pool object is used instead and Create
// proceeds to wait for the node. Any other create failure is returned wrapped.
//
// After the pool is obtained, Create looks up the instance once and, when
// neither an instance nor an error comes back, polls up to 15 times roughly one
// second apart. Polling stops early when ctx is cancelled.
func (p *Provider) Create(ctx context.Context, nodeClaim *karpenterv1.NodeClaim) (*Instance, error) {
	klog.InfoS("Instance.Create", "nodeClaim", klog.KObj(nodeClaim))

	// We made a strong assumption here. The nodeClaim name should be a valid agent pool name without "-".
	apName := nodeClaim.Name
	if !AgentPoolNameRegex.MatchString(apName) {
		// https://learn.microsoft.com/en-us/troubleshoot/azure/azure-kubernetes/aks-common-issues-faq#what-naming-restrictions-are-enforced-for-aks-resources-and-parameters-
		return nil, fmt.Errorf("agentpool name(%s) is invalid, must match regex pattern: ^[a-z][a-z0-9]{0,11}$", apName)
	}

	instanceTypes := scheduling.NewNodeSelectorRequirementsWithMinValues(nodeClaim.Spec.Requirements...).Get("node.kubernetes.io/instance-type").Values()
	if len(instanceTypes) == 0 {
		return nil, fmt.Errorf("nodeClaim spec has no requirement for instance type")
	}
	vmSize := instanceTypes[0]

	apObj, err := newAgentPoolObject(vmSize, nodeClaim)
	if err != nil {
		return nil, err
	}

	// The create call is attempted exactly once; the previous retry wrapper
	// never retried because its predicate always returned false.
	logging.FromContext(ctx).Debugf("creating Agent pool %s (%s)", apName, vmSize)
	var ap *armcontainerservice.AgentPool
	ap, err = createAgentPool(ctx, p.azClient.agentPoolsClient, p.resourceGroup, apName, p.clusterName, apObj)
	switch {
	case err == nil:
		if ap == nil {
			// Defensive: without a pool object there is nothing to wait on.
			return nil, fmt.Errorf("agentPool.BeginCreateOrUpdate for %q returned no agent pool", apName)
		}
		// ID is a pointer field; never dereference it blindly just for a log line.
		if ap.ID != nil {
			logging.FromContext(ctx).Debugf("created agent pool %s", *ap.ID)
		} else {
			logging.FromContext(ctx).Debugf("created agent pool %s", apName)
		}
	case strings.Contains(err.Error(), "Operation is not allowed because there's an in progress create node pool operation"):
		// when gpu-provisioner restarted after crash for unknown reason, we may come across this error that agent pool creating
		// is in progress, so we just need to wait node ready based on the apObj.
		ap = &apObj
	default:
		logging.FromContext(ctx).Errorf("failed to create agent pool for nodeclaim(%s), %v", nodeClaim.Name, err)
		return nil, fmt.Errorf("agentPool.BeginCreateOrUpdate for %q failed: %w", apName, err)
	}

	instance, err := p.fromRegisteredAgentPoolToInstance(ctx, ap)
	if instance != nil || err != nil {
		return instance, err
	}

	// The node object has not been found yet; wait until the node is created.
	b := wait.Backoff{
		Steps:    15,
		Duration: 1 * time.Second,
		Factor:   1.0,
		Jitter:   0.1,
	}
	err = retry.OnError(b, func(error) bool {
		// Keep retrying on any failure, but give up as soon as the caller's
		// context is done instead of sleeping through the remaining steps.
		return ctx.Err() == nil
	}, func() error {
		var e error
		instance, e = p.fromRegisteredAgentPoolToInstance(ctx, ap)
		if e != nil {
			return e
		}
		if instance == nil {
			return fmt.Errorf("fail to find the node object")
		}
		return nil
	})
	if err != nil {
		return nil, err
	}
	return instance, nil
}