src/panels/KaitoPanel.ts (426 lines of code) (raw):
import { AuthorizationManagementClient } from "@azure/arm-authorization";
import { ContainerServiceClient, ManagedCluster } from "@azure/arm-containerservice";
import { FeatureClient } from "@azure/arm-features";
import { ManagedServiceIdentityClient } from "@azure/arm-msi";
import { GenericResource, ResourceManagementClient } from "@azure/arm-resources";
import { RestError } from "@azure/storage-blob";
import * as vscode from "vscode";
import * as k8s from "vscode-kubernetes-tools-api";
import { ReadyAzureSessionProvider } from "../auth/types";
import {
getAksClient,
getAuthorizationManagementClient,
getFeatureClient,
getManagedServiceIdentityClient,
getResourceManagementClient,
} from "../commands/utils/arm";
import { getManagedCluster } from "../commands/utils/clusters";
import { Errorable, failed, getErrorMessage } from "../commands/utils/errorable";
import { longRunning } from "../commands/utils/host";
import { invokeKubectlCommand } from "../commands/utils/kubectl";
import { createFederatedCredential, getIdentity } from "../commands/utils/managedServiceIdentity";
import { createRoleAssignment } from "../commands/utils/roleAssignments";
import { MessageHandler, MessageSink } from "../webview-contract/messaging";
import { InitialState, ToVsCodeMsgDef, ToWebViewMsgDef } from "../webview-contract/webviewDefinitions/kaito";
import { TelemetryDefinition } from "../webview-contract/webviewTypes";
import { BasePanel, PanelDataProvider } from "./BasePanel";
import { isPodReady, getKaitoPods } from "./utilities/KaitoHelpers";
const MAX_RETRY = 3;
let RETRY_COUNT = 0;
export class KaitoPanel extends BasePanel<"kaito"> {
constructor(extensionUri: vscode.Uri) {
super(extensionUri, "kaito", {
kaitoInstallProgressUpdate: null,
getWorkspaceResponse: null,
});
}
}
export class KaitoPanelDataProvider implements PanelDataProvider<"kaito"> {
private readonly featureClient: FeatureClient;
private readonly resourceManagementClient: ResourceManagementClient;
private readonly containerServiceClient: ContainerServiceClient;
private readonly authorizationClient: AuthorizationManagementClient;
private readonly managedServiceIdentityClient: ManagedServiceIdentityClient;
public constructor(
readonly clusterName: string,
readonly subscriptionId: string,
readonly resourceGroupName: string,
readonly armId: string,
readonly sessionProvider: ReadyAzureSessionProvider,
readonly filterKaitoPodNames: string[],
readonly kubectl: k8s.APIAvailable<k8s.KubectlV1>,
readonly kubeConfigFilePath: string,
readonly newtarget: unknown,
) {
this.clusterName = clusterName;
this.subscriptionId = subscriptionId;
this.resourceGroupName = resourceGroupName;
this.armId = armId;
this.featureClient = getFeatureClient(sessionProvider, this.subscriptionId);
this.resourceManagementClient = getResourceManagementClient(sessionProvider, this.subscriptionId);
this.containerServiceClient = getAksClient(sessionProvider, this.subscriptionId);
this.authorizationClient = getAuthorizationManagementClient(sessionProvider, this.subscriptionId);
this.managedServiceIdentityClient = getManagedServiceIdentityClient(sessionProvider, this.subscriptionId);
this.filterKaitoPodNames = filterKaitoPodNames;
this.newtarget = newtarget;
}
getTitle(): string {
return `Install KAITO`;
}
getInitialState(): InitialState {
return {
clusterName: this.clusterName,
subscriptionId: this.subscriptionId,
resourceGroupName: this.resourceGroupName,
};
}
getTelemetryDefinition(): TelemetryDefinition<"kaito"> {
return {
installKaitoRequest: true,
generateWorkspaceRequest: true,
};
}
getMessageHandler(webview: MessageSink<ToWebViewMsgDef>): MessageHandler<ToVsCodeMsgDef> {
return {
installKaitoRequest: () => {
this.handleKaitoInstallation(webview);
},
generateWorkspaceRequest: () => {
this.handleGenerateWorkspaceRequest();
},
};
}
private async handleGenerateWorkspaceRequest() {
vscode.commands.executeCommand("aks.aksKaitoCreateCRD", this.newtarget);
}
private async handleKaitoInstallation(webview: MessageSink<ToWebViewMsgDef>) {
// Get current json
const currentJson = await longRunning(`Get current cluster information.`, () => {
return this.resourceManagementClient.resources.getById(this.armId, "2023-08-01");
});
// Prevent KAITO installation on automatic clusters
const skuName = currentJson.sku?.name;
if (skuName === "Automatic") {
webview.postKaitoInstallProgressUpdate({
operationDescription: "Automatic Cluster Detected",
event: 3,
errorMessage:
"KAITO cannot be installed on automatic clusters. Please try installing KAITO on a standard cluster.",
});
return;
}
// Get the feature registration state
const getFeatureClientRegisterState = await longRunning(
`Getting the AIToolchainOperator registration state.`,
() => {
return this.featureClient.features.get("Microsoft.ContainerService", "AIToolchainOperatorPreview");
},
);
if (getFeatureClientRegisterState.properties?.state !== "Registered") {
// Register the feature
const featureRegistrationPoller = await longRunning(`Registering the AIToolchainOperator.`, () => {
return this.featureClient.features.register(
"Microsoft.ContainerService",
"AIToolchainOperatorPreview",
{},
);
});
if (featureRegistrationPoller.properties?.state !== "Registered") {
await longRunning(`Waiting for the AIToolchainOperator registration to complete.`, () => {
return this.registerKaitoFeature(webview);
});
}
}
// Install kaito enablement
const kaitoInstallationResult = await longRunning(
`Enabling the KAITO for cluster '${this.clusterName}'.`,
() => {
return this.handleKaitoInstallationLogic(currentJson, webview);
},
);
if (kaitoInstallationResult && failed(kaitoInstallationResult)) {
vscode.window.showErrorMessage(
`Error installing KAITO addon for ${this.clusterName}: ${kaitoInstallationResult.error}`,
);
return;
}
// install Kaito Federated Credentials and role Assignments
try {
const installKaitoFederatedCredentialsAndRoleAssignments = await longRunning(
`Installing KAITO Federated Credentials and role Assignments.`,
() => {
return this.installKaitoComponents();
},
);
if (failed(installKaitoFederatedCredentialsAndRoleAssignments)) {
//installing federated credentionals failed
const errorMessage = installKaitoFederatedCredentialsAndRoleAssignments.error;
vscode.window.showErrorMessage(
`Error installing KAITO Federated Credentials and role Assignments: ${errorMessage}`,
);
webview.postKaitoInstallProgressUpdate({
operationDescription: "Installing Federated Credentials Failed",
event: 3,
errorMessage: errorMessage,
});
return;
}
//kaito installation succeeded
webview.postKaitoInstallProgressUpdate({
operationDescription: "Installing KAITO succeeded",
event: 4,
errorMessage: undefined,
});
} catch (ex) {
vscode.window.showErrorMessage(
`Error installing KAITO Federated Credentials and role Assignments: ${getErrorMessage(ex)}`,
);
}
}
private async handleKaitoInstallationLogic(
currentJson: GenericResource,
webview: MessageSink<ToWebViewMsgDef>,
): Promise<Errorable<string> | undefined> {
// Install kaito enablement
const managedClusterSpec: ManagedCluster = {
location: currentJson.location!,
aiToolchainOperatorProfile: { enabled: true },
oidcIssuerProfile: { enabled: true },
};
try {
const poller = await longRunning("", () => {
return this.containerServiceClient.managedClusters.beginCreateOrUpdate(
this.resourceGroupName,
this.clusterName,
managedClusterSpec,
);
});
// kaito installation in progress
webview.postKaitoInstallProgressUpdate({
operationDescription: "Installing KAITO",
event: 1,
errorMessage: undefined,
});
poller.onProgress((state) => {
if (state.status === "succeeded") {
webview.postKaitoInstallProgressUpdate({
operationDescription: "KAITO Federated Credentials and role Assignments",
event: 1,
errorMessage: undefined,
});
} else if (state.status === "failed") {
webview.postKaitoInstallProgressUpdate({
operationDescription: "Installing KAITO failed",
event: 3,
errorMessage: state.error?.message,
});
}
});
await poller.pollUntilDone();
return { succeeded: true, result: "KAITO installation logic completed successfully" };
} catch (ex) {
const errorMessage = isInvalidTemplateDeploymentError(ex)
? getInvalidTemplateErrorMessage(ex)
: getErrorMessage(ex);
// Retry the operation
if (RETRY_COUNT < MAX_RETRY) {
RETRY_COUNT++;
const answer = await vscode.window.showErrorMessage(
`Error installing KAITO addon for ${this.clusterName}: ${errorMessage}`,
{ modal: true },
"Retry",
);
// Here the retry logic exist
if (answer === "Retry") {
this.handleKaitoInstallation(webview);
}
}
if (RETRY_COUNT >= MAX_RETRY) {
vscode.window.showErrorMessage(`Error installing KAITO addon for ${this.clusterName}: ${errorMessage}`);
}
webview.postKaitoInstallProgressUpdate({
operationDescription: "Installing KAITO failed",
event: 3,
errorMessage: ex instanceof Error ? ex.message : String(ex),
});
return { succeeded: false, error: ex instanceof Error ? ex.message : String(ex) };
}
}
private async installKaitoComponents(): Promise<Errorable<string>> {
const clusterInfo = await getManagedCluster(
this.sessionProvider,
this.subscriptionId,
this.resourceGroupName,
this.clusterName,
);
if (failed(clusterInfo)) {
vscode.window.showErrorMessage(`Error getting managed cluster info: ${clusterInfo.error}`);
return { succeeded: false, error: clusterInfo.error };
}
const roleAssignmentsResult = await this.installKaitoRoleAssignments(
clusterInfo.result.nodeResourceGroup!,
this.subscriptionId,
this.resourceGroupName,
this.clusterName,
);
//halt installation if role assignments creation failed
if (failed(roleAssignmentsResult)) {
return { succeeded: false, error: roleAssignmentsResult.error };
}
const aksOidcIssuerUrl = clusterInfo.result.oidcIssuerProfile?.issuerURL;
if (!aksOidcIssuerUrl) {
vscode.window.showErrorMessage(
`Error getting aks oidc issuer url, oidc issuer url is undefined/null/empty`,
);
return {
succeeded: false,
error: "Error getting aks oidc issuer url, oidc issuer url is undefined/null/empty",
};
}
const kaitoFederatedCredentialsResult = await this.installKaitoFederatedCredentials(
clusterInfo.result.nodeResourceGroup!,
this.clusterName,
aksOidcIssuerUrl,
);
//halt installation if federated credentials installation failed
if (failed(kaitoFederatedCredentialsResult)) {
return { succeeded: false, error: kaitoFederatedCredentialsResult.error };
}
//kubectl rollout restart deployment kaito-gpu-provisioner -n kube-system
const command = `rollout restart deployment kaito-gpu-provisioner -n kube-system`;
const kubectlresult = await invokeKubectlCommand(this.kubectl, this.kubeConfigFilePath, command);
if (failed(kubectlresult)) {
vscode.window.showErrorMessage(`Error restarting kaito-gpu-provisioner: ${kubectlresult.error}`);
return { succeeded: false, error: kubectlresult.error };
}
// waiting for gpu provisioner to be ready, which usually takes around 30 seconds
await new Promise((resolve) => setTimeout(resolve, 35000));
let gpuProvisionerReady = false;
const kaitoPods = await getKaitoPods(
this.sessionProvider,
this.kubectl,
this.subscriptionId,
this.resourceGroupName,
this.clusterName,
);
const gpuProvisionerPod = kaitoPods.find((pod) =>
pod.imageName.startsWith("mcr.microsoft.com/aks/kaito/gpu-provisioner"),
);
if (gpuProvisionerPod === undefined) {
vscode.window.showErrorMessage(`GPU Provisioner not found`);
return { succeeded: false, error: "GPU Provisioner not found" };
}
// If the pod is already ready, we can skip the loop
if (
await isPodReady(
gpuProvisionerPod.nameSpace,
gpuProvisionerPod.podName,
this.kubectl,
this.kubeConfigFilePath,
)
) {
gpuProvisionerReady = true;
} else {
// If the pod is not ready, we will poll readiness for the next 2 minutes
const endTime = Date.now() + 120000;
while (Date.now() < endTime) {
if (
await isPodReady(
gpuProvisionerPod.nameSpace,
gpuProvisionerPod.podName,
this.kubectl,
this.kubeConfigFilePath,
)
) {
gpuProvisionerReady = true;
break;
}
// 5 second delay between checks
await new Promise((resolve) => setTimeout(resolve, 5000));
}
}
if (!gpuProvisionerReady) {
vscode.window.showErrorMessage(`GPU Provisioner is not ready`);
return { succeeded: false, error: "GPU Provisioner is not ready" };
}
return { succeeded: true, result: "KAITO components installed successfully" };
}
private async registerKaitoFeature(webview: MessageSink<ToWebViewMsgDef>) {
// Let's start delay for 3 mins
await longRunning(`Waiting for the AIToolchainOperator registration to complete.`, async () => {
await new Promise((resolve) => setTimeout(resolve, 180000)); // 3 minutes = 180000 ms
});
// Get the feature registration state
const getFeatureClientRegisterStateAfterDelay = await longRunning(
`Getting the AIToolchainOperator registration state.`,
() => {
return this.featureClient.features.get("Microsoft.ContainerService", "AIToolchainOperatorPreview");
},
);
if (getFeatureClientRegisterStateAfterDelay.properties?.state !== "Registered") {
webview.postKaitoInstallProgressUpdate({
operationDescription: "Installing KAITO",
event: 3,
errorMessage: "Failed to register feature",
});
return;
}
}
private async installKaitoRoleAssignments(
mcResourceGroup: string,
subscriptionId: string,
resourceGroupName: string,
clusterName: string,
): Promise<Errorable<string>> {
// get principal id of managed service identity
const identityName = `ai-toolchain-operator-${clusterName}`;
const identityResult = await getIdentity(this.managedServiceIdentityClient, mcResourceGroup, identityName);
if (failed(identityResult)) {
vscode.window.showErrorMessage(`Error getting identity: ${identityResult.error}`);
return { succeeded: false, error: identityResult.error };
}
const roleAssignment = await createRoleAssignment(
this.authorizationClient,
subscriptionId,
identityResult.result.principalId!,
"b24988ac-6180-42a0-ab88-20f7382dd24c", // contributor role id: https://learn.microsoft.com/en-us/azure/role-based-access-control/built-in-roles#general
`/subscriptions/${subscriptionId}/resourceGroups/${resourceGroupName}`,
);
if (failed(roleAssignment)) {
// Don't cancel installation if role assignments already exist, user could be attempting to reinstall
if (roleAssignment.error?.includes("already exists")) {
return { succeeded: true, result: "Role assignments already exist" };
} else {
// cancel installation only if there is an alternate error that conflicts with further steps
return { succeeded: false, error: roleAssignment.error };
}
}
return { succeeded: true, result: "Role assignments created successfully" };
}
private async installKaitoFederatedCredentials(
nodeResourceGroup: string,
clusterName: string,
aksOidcIssuerUrl: string,
): Promise<Errorable<string>> {
const result = await createFederatedCredential(
this.managedServiceIdentityClient,
nodeResourceGroup,
"kaito-federated-identity", // https://learn.microsoft.com/en-us/azure/aks/ai-toolchain-operator#establish-a-federated-identity-credential
`ai-toolchain-operator-${clusterName}`,
aksOidcIssuerUrl,
`system:serviceaccount:kube-system:kaito-gpu-provisioner`,
"api://AzureADTokenExchange",
);
if (failed(result)) {
return { succeeded: false, error: result.error };
} else {
return { succeeded: true, result: "Federated credentials created successfully" };
}
}
}
function getInvalidTemplateErrorMessage(ex: InvalidTemplateDeploymentRestError): string {
const innerDetails = ex.details.error?.details || [];
if (innerDetails.length > 0) {
const details = innerDetails.map((d) => `${d.code}: ${d.message}`).join("\n");
return `Invalid template:\n${details}`;
}
const innerError = ex.details.error?.message || "";
if (innerError) {
return `Invalid template:\n${innerError}`;
}
return `Invalid template: ${getErrorMessage(ex)}`;
}
type InvalidTemplateDeploymentRestError = RestError & {
details: {
error?: {
code: "InvalidTemplateDeployment";
message?: string;
details?: {
code?: string;
message?: string;
}[];
};
};
};
function isInvalidTemplateDeploymentError(ex: unknown): ex is InvalidTemplateDeploymentRestError {
return isRestError(ex) && ex.code === "InvalidTemplateDeployment";
}
function isRestError(ex: unknown): ex is RestError {
return typeof ex === "object" && ex !== null && ex.constructor.name === "RestError";
}