diff --git a/.github/workflows/e2e-workflow.yml b/.github/workflows/e2e-workflow.yml index dcd19a718..de1e7cb24 100644 --- a/.github/workflows/e2e-workflow.yml +++ b/.github/workflows/e2e-workflow.yml @@ -22,7 +22,6 @@ on: default: "eastus" k8s_version: type: string - default: "1.29.2" secrets: E2E_CLIENT_ID: required: true @@ -47,7 +46,8 @@ jobs: environment: e2e-test env: GO_VERSION: "1.22" - + KARPENTER_NAMESPACE: "karpenter" + GPU_PROVISIONER_NAMESPACE: "gpu-provisioner" steps: - name: Harden Runner uses: step-security/harden-runner@5c7944e73c4c2a096b17a9cb74d65b6c2bbafbde # v2.9.1 @@ -146,7 +146,11 @@ jobs: - name: create cluster shell: bash run: | - make create-aks-cluster + if [ "${{ inputs.suite }}" == "gpuprovisioner" ]; then + make create-aks-cluster + else + make create-aks-cluster-for-karpenter + fi env: AZURE_ACR_NAME: ${{ env.CLUSTER_NAME }} AZURE_RESOURCE_GROUP: ${{ env.CLUSTER_NAME }} @@ -154,12 +158,28 @@ jobs: AZURE_LOCATION: ${{ inputs.region }} AKS_K8S_VERSION: ${{ inputs.k8s_version }} + - name: Az login + uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # v2.1.1 + with: + client-id: ${{ secrets.E2E_CLIENT_ID }} + tenant-id: ${{ secrets.E2E_TENANT_ID }} + subscription-id: ${{ secrets.E2E_SUBSCRIPTION_ID }} + + - name: Create Identities and Permissions for ${{ inputs.suite }} + shell: bash + run: | + make generate-identities + env: + AZURE_RESOURCE_GROUP: ${{ env.CLUSTER_NAME }} + AZURE_CLUSTER_NAME: ${{ env.CLUSTER_NAME }} + TEST_SUITE: ${{ inputs.suite }} + AZURE_SUBSCRIPTION_ID: ${{ secrets.E2E_SUBSCRIPTION_ID }} + - name: Install gpu-provisioner helm chart - if: ${{ inputs.suite == 'gpuprov' }} + if: ${{ inputs.suite == 'gpuprovisioner' }} shell: bash run: | make gpu-provisioner-helm - kubectl wait --for=condition=available deploy "gpu-provisioner" -n gpu-provisioner --timeout=300s env: AZURE_RESOURCE_GROUP: ${{ env.CLUSTER_NAME }} AZURE_CLUSTER_NAME: ${{ env.CLUSTER_NAME }} @@ -172,15 +192,13 @@ jobs: shell: bash run: | make azure-karpenter-helm - # taint nodes as karpenter-system - kubectl taint nodes CriticalAddonsOnly=true:NoSchedule --all - kubectl wait --for=condition=available deploy "karpenter" -n karpenter --timeout=300s env: AZURE_RESOURCE_GROUP: ${{ env.CLUSTER_NAME }} AZURE_CLUSTER_NAME: ${{ env.CLUSTER_NAME }} AZURE_TENANT_ID: ${{ secrets.E2E_TENANT_ID }} AZURE_SUBSCRIPTION_ID: ${{ secrets.E2E_SUBSCRIPTION_ID }} KARPENTER_VERSION: ${{ vars.KARPENTER_VERSION }} + KARPENTER_NAMESPACE: ${{ env.KARPENTER_NAMESPACE }} - uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # v2.1.1 with: @@ -188,29 +206,14 @@ jobs: tenant-id: ${{ secrets.E2E_TENANT_ID }} subscription-id: ${{ secrets.E2E_SUBSCRIPTION_ID }} - - name: Create Role Assignment - uses: azure/CLI@v2.0.0 - with: - inlineScript: | - IDENTITY_PRINCIPAL_ID="$(az identity show --name ${{ inputs.suite }}Identity --resource-group ${{ env.CLUSTER_NAME }} --query 'principalId' -otsv)" - az role assignment create --assignee ${IDENTITY_PRINCIPAL_ID} --scope "/subscriptions/${{ secrets.E2E_SUBSCRIPTION_ID }}/resourceGroups/${{ env.CLUSTER_NAME }}" --role "Contributor" - if [ "${{ inputs.suite }}" == "azkarpenter" ]; then - - fi - - name: Create Azure Federated Identity - uses: azure/CLI@v2.0.0 - with: - inlineScript: | - AKS_OIDC_ISSUER="$(az aks show -n "${{ env.CLUSTER_NAME }}" -g "${{ env.CLUSTER_NAME }}" --query 'oidcIssuerProfile.issuerUrl' -otsv)" - - if [ "${{ inputs.suite }}" == "gpuprov" ]; then - az identity federated-credential create --name ${{ inputs.suite }}-fed --identity-name ${{ 
inputs.suite }}Identity --resource-group "${{ env.CLUSTER_NAME }}" \ - --issuer "${AKS_OIDC_ISSUER}" --subject system:serviceaccount:"gpu-provisioner:gpu-provisioner" --audience api://AzureADTokenExchange - fi - if [ "${{ inputs.suite }}" == "azkarpenter" ]; then - az identity federated-credential create --name ${{ inputs.suite }}-fed --identity-name ${{ inputs.suite }}Identity --resource-group "${{ env.CLUSTER_NAME }}" \ - --issuer "${AKS_OIDC_ISSUER}" --subject system:serviceaccount:"karpenter:karpenter-sa" --audience api://AzureADTokenExchange - fi + - name: build KAITO image + if: ${{ !inputs.isRelease }} + shell: bash + run: | + make docker-build-kaito + env: + REGISTRY: ${{ env.REGISTRY }} + VERSION: ${{ env.VERSION }} - name: Install KAITO Workspace helm chart shell: bash @@ -222,6 +225,7 @@ jobs: AZURE_CLUSTER_NAME: ${{ env.CLUSTER_NAME }} REGISTRY: ${{ env.REGISTRY }} VERSION: ${{ env.VERSION }} + TEST_SUITE: ${{ inputs.suite }} # Retrieve E2E ACR credentials and create Kubernetes secret - name: Set up E2E ACR Credentials and Secret @@ -251,6 +255,14 @@ jobs: --docker-username=${{ secrets.E2E_ACR_AMRT_USERNAME }} \ --docker-password=${{ secrets.E2E_ACR_AMRT_PASSWORD }} + - name: Log ${{ inputs.suite }} + run: | + if [ "${{ inputs.suite }}" == "gpuprovisioner" ]; then + kubectl logs -n "${{ env.GPU_PROVISIONER_NAMESPACE }}" -l app.kubernetes.io/name=gpu-provisioner -c controller + else + kubectl logs -n "${{ env.KARPENTER_NAMESPACE }}" -l app.kubernetes.io/name=karpenter -c controller + fi + - name: Log kaito-workspace run: | kubectl get pods -n kaito-workspace -o name | grep "^pod/kaito-workspace" | sed 's/^pod\///' | xargs -I {} kubectl logs -n kaito-workspace {} @@ -264,6 +276,7 @@ jobs: REGISTRY: ${{ env.REGISTRY }} AI_MODELS_REGISTRY: ${{ secrets.E2E_ACR_AMRT_USERNAME }}.azurecr.io AI_MODELS_REGISTRY_SECRET: ${{ secrets.E2E_AMRT_SECRET_NAME }} + TEST_SUITE: ${{ inputs.suite }} E2E_ACR_REGISTRY: ${{ env.CLUSTER_NAME }}.azurecr.io E2E_ACR_REGISTRY_SECRET: ${{ env.CLUSTER_NAME }}-acr-secret diff --git a/.github/workflows/kaito-e2e.yml b/.github/workflows/kaito-e2e.yml index a3be7131e..a15415d4b 100644 --- a/.github/workflows/kaito-e2e.yml +++ b/.github/workflows/kaito-e2e.yml @@ -1,5 +1,9 @@ name: pr-e2e-test +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + on: pull_request: paths-ignore: ['docs/**', '**.md', '**.mdx', '**.png', '**.jpg'] @@ -16,7 +20,7 @@ jobs: strategy: fail-fast: false matrix: - suite: [ gpuprov, azkarpenter ] + suite: [ gpuprovisioner, azkarpenter ] permissions: contents: read id-token: write @@ -24,6 +28,7 @@ jobs: uses: ./.github/workflows/e2e-workflow.yml with: git_sha: ${{ github.event.pull_request.head.sha }} + k8s_version: ${{ vars.AKS_K8S_VERSION }} suite: ${{ matrix.suite }} secrets: E2E_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }} diff --git a/.github/workflows/publish-gh-image.yml b/.github/workflows/publish-gh-image.yml index 993392d80..84de42379 100644 --- a/.github/workflows/publish-gh-image.yml +++ b/.github/workflows/publish-gh-image.yml @@ -121,6 +121,7 @@ jobs: git_sha: ${{ github.sha }} isRelease: true registry: ${{ needs.build-scan-publish-gh-images.outputs.registry_repository }} + k8s_version: ${{ vars.AKS_K8S_VERSION }} tag: ${{ needs.check-tag.outputs.tag }} secrets: E2E_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }} diff --git a/.github/workflows/publish-mcr-image.yml b/.github/workflows/publish-mcr-image.yml index 44c9cf2a0..5e1568e39 100644 --- 
a/.github/workflows/publish-mcr-image.yml +++ b/.github/workflows/publish-mcr-image.yml @@ -57,6 +57,7 @@ jobs: git_sha: ${{ github.sha }} isRelease: true registry: "mcr.microsoft.com/aks/kaito" + k8s_version: ${{ vars.AKS_K8S_VERSION }} tag: ${{ github.event.client_payload.tag }} secrets: E2E_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }} diff --git a/Makefile b/Makefile index 1fa1305ec..e19851be9 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ # Image URL to use all building/pushing image targets -REGISTRY ?= YOUR_REGISTRY +REGISTRY ?= hkkarpenter.azurecr.io IMG_NAME ?= workspace VERSION ?= v0.3.0 GPU_PROVISIONER_VERSION ?= 0.2.0 @@ -17,30 +17,28 @@ GOLANGCI_LINT_BIN := golangci-lint GOLANGCI_LINT := $(abspath $(TOOLS_BIN_DIR)/$(GOLANGCI_LINT_BIN)-$(GOLANGCI_LINT_VER)) E2E_TEST_BIN := e2e.test -KARPENTER_E2E_TEST_BIN := karpenter-e2e.test E2E_TEST := $(BIN_DIR)/$(E2E_TEST_BIN) -KARPENTER_E2E_TEST := $(BIN_DIR)/$(KARPENTER_E2E_TEST_BIN) GINKGO_VER := v2.19.0 GINKGO_BIN := ginkgo GINKGO := $(TOOLS_BIN_DIR)/$(GINKGO_BIN)-$(GINKGO_VER) +TEST_SUITE ?= gpuprovisioner AZURE_SUBSCRIPTION_ID ?= $(AZURE_SUBSCRIPTION_ID) AZURE_LOCATION ?= eastus -AKS_K8S_VERSION ?= 1.29.2 +AKS_K8S_VERSION ?= 1.30.0 AZURE_RESOURCE_GROUP ?= demo -AZURE_CLUSTER_NAME ?= kaito-demo +AZURE_CLUSTER_NAME ?= kaito-demo-heba AZURE_RESOURCE_GROUP_MC=MC_$(AZURE_RESOURCE_GROUP)_$(AZURE_CLUSTER_NAME)_$(AZURE_LOCATION) GPU_PROVISIONER_NAMESPACE ?= gpu-provisioner KAITO_NAMESPACE ?= kaito-workspace -GPU_PROVISIONER_MSI_NAME ?= gpuIdentity +GPU_PROVISIONER_MSI_NAME ?= gpuprovisionerIdentity -## Karpenter parameters +## Azure Karpenter parameters KARPENTER_NAMESPACE ?= karpenter -KARPENTER_SERVICE_ACCOUNT_NAME ?= karpenter-sa -KARPENTER_VERSION ?= 0.4.0 -AZURE_KARPENTER_USER_ASSIGNED_IDENTITY_NAME ?= karpenterIdentity -KARPENTER_FEDERATED_IDENTITY_CREDENTIAL_NAME ?= karpenter-fed +KARPENTER_SA_NAME ?= karpenter-sa +KARPENTER_VERSION ?= 0.5.1 +AZURE_KARPENTER_MSI_NAME ?= azkarpenterIdentity RUN_LLAMA_13B ?= false AI_MODELS_REGISTRY ?= modelregistry.azurecr.io @@ -88,10 +86,6 @@ manifests: controller-gen ## Generate WebhookConfiguration, ClusterRole and Cust generate: controller-gen ## Generate code containing DeepCopy, DeepCopyInto, and DeepCopyObject method implementations. $(CONTROLLER_GEN) object:headerFile="hack/boilerplate.go.txt" paths="./..." -.PHONY: fmt -fmt: ## Run go fmt against code. - go fmt ./... - ## -------------------------------------- ## Unit Tests ## -------------------------------------- @@ -124,19 +118,11 @@ $(E2E_TEST): .PHONY: kaito-workspace-e2e-test kaito-workspace-e2e-test: $(E2E_TEST) $(GINKGO) AI_MODELS_REGISTRY_SECRET=$(AI_MODELS_REGISTRY_SECRET) RUN_LLAMA_13B=$(RUN_LLAMA_13B) \ - AI_MODELS_REGISTRY=$(AI_MODELS_REGISTRY) GPU_NAMESPACE=$(GPU_NAMESPACE) KAITO_NAMESPACE=$(KAITO_NAMESPACE) \ + AI_MODELS_REGISTRY=$(AI_MODELS_REGISTRY) GPU_PROVISIONER_NAMESPACE=$(GPU_PROVISIONER_NAMESPACE) \ + KARPENTER_NAMESPACE=$(KARPENTER_NAMESPACE) KAITO_NAMESPACE=$(KAITO_NAMESPACE) TEST_SUITE=$(TEST_SUITE) \ SUPPORTED_MODELS_YAML_PATH=$(SUPPORTED_MODELS_YAML_PATH) \ $(GINKGO) -v -trace $(GINKGO_ARGS) $(E2E_TEST) -$(KARPENTER_E2E_TEST): - (cd test/e2e/karpenter && go test -c . 
-o $(KARPENTER_E2E_TEST)) - -.PHONY: kaito-karpenter-e2e-test -kaito-karpenter-e2e-test: $(E2E_TEST) $(GINKGO) - AI_MODELS_REGISTRY_SECRET=$(AI_MODELS_REGISTRY_SECRET) RUN_LLAMA_13B=$(RUN_LLAMA_13B) \ - AI_MODELS_REGISTRY=$(AI_MODELS_REGISTRY) KARPENTER=$(KARPENTER_NAMESPACE) KAITO_NAMESPACE=$(KAITO_NAMESPACE) \ - $(GINKGO) -v -trace $(GINKGO_ARGS) $(KARPENTER_E2E_TEST) - ## -------------------------------------- ## Azure resources ## -------------------------------------- @@ -169,17 +155,15 @@ create-aks-cluster-with-kaito: ## Create test AKS cluster (with msi, oidc and ka .PHONY: create-aks-cluster-for-karpenter create-aks-cluster-for-karpenter: ## Create test AKS cluster (with msi, cilium, oidc, and workload identity enabled) az aks create --name $(AZURE_CLUSTER_NAME) --resource-group $(AZURE_RESOURCE_GROUP) \ - --location $(AZURE_LOCATION) --attach-acr $(AZURE_ACR_NAME) \ - --kubernetes-version $(AKS_K8S_VERSION) --node-count 1 --generate-ssh-keys \ + --location $(AZURE_LOCATION) --attach-acr $(AZURE_ACR_NAME) --node-vm-size "Standard_D2s_v3" \ + --kubernetes-version $(AKS_K8S_VERSION) --node-count 3 --generate-ssh-keys \ --network-plugin azure --network-plugin-mode overlay --network-dataplane cilium \ --enable-managed-identity --enable-oidc-issuer --enable-workload-identity -o none az aks get-credentials --name $(AZURE_CLUSTER_NAME) --resource-group $(AZURE_RESOURCE_GROUP) --overwrite-existing - ## -------------------------------------- ## Image Docker Build ## -------------------------------------- - BUILDX_BUILDER_NAME ?= img-builder OUTPUT_TYPE ?= type=registry QEMU_VERSION ?= 7.2.0-1 @@ -202,6 +186,15 @@ docker-build-kaito: docker-buildx --pull \ --tag $(REGISTRY)/$(IMG_NAME):$(IMG_TAG) . +.PHONY: docker-build-adapter +docker-build-adapter: docker-buildx + docker buildx build \ + --file ./docker/adapter/Dockerfile \ + --output=$(OUTPUT_TYPE) \ + --platform="linux/$(ARCH)" \ + --pull \ + --tag $(REGISTRY)/e2e-adapter:0.0.1 . + .PHONY: docker-build-dataset docker-build-dataset: docker-buildx docker buildx build \ @@ -230,54 +223,40 @@ az-patch-install-helm: ## Update Azure client env vars and settings in helm valu yq -i '(.image.repository) = "$(REGISTRY)/workspace"' ./charts/kaito/workspace/values.yaml yq -i '(.image.tag) = "$(IMG_TAG)"' ./charts/kaito/workspace/values.yaml + if [ $(TEST_SUITE) = "azkarpenter" ]; then \ + yq -i '(.featureGates.Karpenter) = "true"' ./charts/kaito/workspace/values.yaml; \ + fi + yq -i '(.clusterName) = "$(AZURE_CLUSTER_NAME)"' ./charts/kaito/workspace/values.yaml helm install kaito-workspace ./charts/kaito/workspace --namespace $(KAITO_NAMESPACE) --create-namespace +generate-identities: ## Create identities for the provisioner component. 
+ ./hack/deploy/generate-identities.sh \ + $(AZURE_CLUSTER_NAME) $(AZURE_RESOURCE_GROUP) $(TEST_SUITE) $(AZURE_SUBSCRIPTION_ID) + ## -------------------------------------- ## gpu-provider installation ## -------------------------------------- -gpu-provisioner-identity-perm: ## Create identity for gpu-provisioner - az identity create --name $(GPU_PROVISIONER_MSI_NAME) --resource-group $(AZURE_RESOURCE_GROUP) - - IDENTITY_PRINCIPAL_ID=$(shell az identity show --name $(GPU_PROVISIONER_MSI_NAME) --resource-group $(AZURE_RESOURCE_GROUP) --subscription $(AZURE_SUBSCRIPTION_ID) --query 'principalId');\ - az role assignment create --assignee $$IDENTITY_PRINCIPAL_ID --scope /subscriptions/$(AZURE_SUBSCRIPTION_ID)/resourceGroups/$(AZURE_RESOURCE_GROUP) --role "Contributor" - - AKS_OIDC_ISSUER=$(shell az aks show -n "$(AZURE_CLUSTER_NAME)" -g "$(AZURE_RESOURCE_GROUP)" --subscription $(AZURE_SUBSCRIPTION_ID) --query "oidcIssuerProfile.issuerUrl");\ - az identity federated-credential create --name gpu-federatecredential --identity-name $(GPU_PROVISIONER_MSI_NAME) --resource-group "$(AZURE_RESOURCE_GROUP)" --issuer $$AKS_OIDC_ISSUER \ - --subject system:serviceaccount:"$(GPU_PROVISIONER_NAMESPACE):$(GPU_PROVISIONER_NAMESPACE)" --audience api://AzureADTokenExchange --subscription $(AZURE_SUBSCRIPTION_ID) - .PHONY: gpu-provisioner-helm gpu-provisioner-helm: ## Update Azure client env vars and settings in helm values.yml curl -sO https://raw.githubusercontent.com/Azure/gpu-provisioner/main/hack/deploy/configure-helm-values.sh - chmod +x ./configure-helm-values.sh && ./configure-helm-values.sh $(AZURE_CLUSTER_NAME) $(AZURE_RESOURCE_GROUP) $(GPU_PROVISIONER_MSI_NAME) + chmod +x ./configure-helm-values.sh && ./configure-helm-values.sh $(AZURE_CLUSTER_NAME) \ + $(AZURE_RESOURCE_GROUP) $(GPU_PROVISIONER_MSI_NAME) - helm install $(GPU_PROVISIONER_NAMESPACE) --values gpu-provisioner-values.yaml --set settings.azure.clusterName=$(AZURE_CLUSTER_NAME) --wait \ + helm install gpu-provisioner \ + --values gpu-provisioner-values.yaml \ + --set settings.azure.clusterName=$(AZURE_CLUSTER_NAME) \ https://github.com/Azure/gpu-provisioner/raw/gh-pages/charts/gpu-provisioner-$(GPU_PROVISIONER_VERSION).tgz + kubectl wait --for=condition=available deploy "gpu-provisioner" -n gpu-provisioner --timeout=300s ## -------------------------------------- ## Azure Karpenter Installation ## -------------------------------------- -karpenter-identity-perm: - az identity create --name $(AZURE_KARPENTER_USER_ASSIGNED_IDENTITY_NAME) --resource-group $(AZURE_RESOURCE_GROUP) - - KARPENTER_USER_ASSIGNED_PRINCIPAL_ID=$(shell az identity show -n "$(AZURE_KARPENTER_USER_ASSIGNED_IDENTITY_NAME)" \ - -g "$(AZURE_RESOURCE_GROUP)" --query 'principalId');\ - az role assignment create --assignee $$KARPENTER_USER_ASSIGNED_PRINCIPAL_ID --scope "/subscriptions/$(AZURE_SUBSCRIPTION_ID)/resourceGroups/$(AZURE_RESOURCE_GROUP)" --role "Virtual Machine Contributor";\ - az role assignment create --assignee $$KARPENTER_USER_ASSIGNED_PRINCIPAL_ID --scope "/subscriptions/$(AZURE_SUBSCRIPTION_ID)/resourceGroups/$(AZURE_RESOURCE_GROUP)" --role "Network Contributor";\ - az role assignment create --assignee $$KARPENTER_USER_ASSIGNED_PRINCIPAL_ID --scope "/subscriptions/$(AZURE_SUBSCRIPTION_ID)/resourceGroups/$(AZURE_RESOURCE_GROUP)" --role "Managed Identity Operator" - - AKS_OIDC_ISSUER=$(shell az aks show -n "$(AZURE_CLUSTER_NAME)" -g "$(AZURE_RESOURCE_GROUP)" --subscription $(AZURE_SUBSCRIPTION_ID) --query "oidcIssuerProfile.issuerUrl");\ - az identity 
federated-credential create --name $(KARPENTER_FEDERATED_IDENTITY_CREDENTIAL_NAME) \ - --identity-name $(AZURE_KARPENTER_USER_ASSIGNED_IDENTITY_NAME) \ - --resource-group "$(AZURE_RESOURCE_GROUP)" --issuer $$AKS_OIDC_ISSUER \ - --subject system:serviceaccount:"$(KARPENTER_NAMESPACE):$(KARPENTER_SERVICE_ACCOUNT_NAME)" \ - --audience api://AzureADTokenExchange --subscription $(AZURE_SUBSCRIPTION_ID) - .PHONY: azure-karpenter-helm azure-karpenter-helm: ## Update Azure client env vars and settings in helm values.yml curl -sO https://raw.githubusercontent.com/Azure/karpenter-provider-azure/main/hack/deploy/configure-values.sh - chmod +x ./configure-values.sh && ./configure-values.sh $(AZURE_CLUSTER_NAME) $(AZURE_RESOURCE_GROUP) \ - $(KARPENTER_SERVICE_ACCOUNT_NAME) $(AZURE_KARPENTER_USER_ASSIGNED_IDENTITY_NAME) + chmod +x ./configure-values.sh && ./configure-values.sh $(AZURE_CLUSTER_NAME) \ + $(AZURE_RESOURCE_GROUP) $(KARPENTER_SA_NAME) $(AZURE_KARPENTER_MSI_NAME) helm upgrade --install karpenter oci://mcr.microsoft.com/aks/karpenter/karpenter \ --version "$(KARPENTER_VERSION)" \ @@ -286,20 +265,9 @@ azure-karpenter-helm: ## Update Azure client env vars and settings in helm valu --set controller.resources.requests.cpu=1 \ --set controller.resources.requests.memory=1Gi \ --set controller.resources.limits.cpu=1 \ - --set controller.resources.limits.memory=1Gi \ - --wait - - kubectl logs -f -n "$(KARPENTER_NAMESPACE)" -l app.kubernetes.io/name=karpenter -c controller - -##@ Development -.PHONY: manifests -manifests: controller-gen ## Generate WebhookConfiguration, ClusterRole and CustomResourceDefinition objects. - $(CONTROLLER_GEN) rbac:roleName=manager-role crd webhook paths="./..." output:crd:artifacts:config=config/crd/bases - -.PHONY: generate -generate: controller-gen ## Generate code containing DeepCopy, DeepCopyInto, and DeepCopyObject method implementations. - $(CONTROLLER_GEN) object:headerFile="hack/boilerplate.go.txt" paths="./..." 
+ --set controller.resources.limits.memory=1Gi + kubectl wait --for=condition=available deploy "karpenter" -n karpenter --timeout=300s ##@ Build .PHONY: build diff --git a/charts/kaito/workspace/templates/clusterrole.yaml b/charts/kaito/workspace/templates/clusterrole.yaml index 5e24c46e1..0684129cf 100644 --- a/charts/kaito/workspace/templates/clusterrole.yaml +++ b/charts/kaito/workspace/templates/clusterrole.yaml @@ -39,6 +39,12 @@ rules: - apiGroups: ["karpenter.sh"] resources: ["machines", "machines/status", "nodeclaims", "nodeclaims/status"] verbs: ["get","list","watch","create", "delete", "update", "patch"] + - apiGroups: [ "karpenter.azure.com" ] + resources: [ "aksnodeclasses"] + verbs: [ "get","list","watch","create", "delete", "update", "patch" ] + - apiGroups: [ "karpenter.k8s.aws" ] + resources: [ "ec2nodeclasses" ] + verbs: [ "get","list","watch","create", "delete", "update", "patch" ] - apiGroups: ["admissionregistration.k8s.io"] resources: ["validatingwebhookconfigurations"] verbs: ["get","list","watch"] diff --git a/charts/kaito/workspace/templates/deployment.yaml b/charts/kaito/workspace/templates/deployment.yaml index 0297530e8..92989f6a9 100644 --- a/charts/kaito/workspace/templates/deployment.yaml +++ b/charts/kaito/workspace/templates/deployment.yaml @@ -47,6 +47,8 @@ spec: value: {{ .Values.presetRegistryName }} - name: CLOUD_PROVIDER value: {{ .Values.cloudProviderName }} + - name: CLUSTER_NAME + value: {{ .Values.clusterName }} ports: - name: http-metrics containerPort: 8080 diff --git a/charts/kaito/workspace/templates/nvidia-device-plugin-ds.yaml b/charts/kaito/workspace/templates/nvidia-device-plugin-ds.yaml index 850eb4562..e7d9e2588 100644 --- a/charts/kaito/workspace/templates/nvidia-device-plugin-ds.yaml +++ b/charts/kaito/workspace/templates/nvidia-device-plugin-ds.yaml @@ -27,10 +27,6 @@ spec: operator: NotIn values: - virtual-kubelet - - key: karpenter.sh/provisioner-name - operator: Exists - - key: kaito.sh/machine-type - operator: Exists tolerations: # Allow this pod to be rescheduled while the node is in "critical add-ons only" mode. # This, along with the annotation above marks this pod as a critical add-on. diff --git a/charts/kaito/workspace/values.yaml b/charts/kaito/workspace/values.yaml index 2cef7f75c..d5cf0d77e 100644 --- a/charts/kaito/workspace/values.yaml +++ b/charts/kaito/workspace/values.yaml @@ -32,3 +32,4 @@ tolerations: [] affinity: {} # Values can be "azure" or "aws" cloudProviderName: "azure" +clusterName: "" diff --git a/cmd/main.go b/cmd/main.go index e4b062626..06ddc37ab 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -11,11 +11,11 @@ import ( "syscall" "time" + azurev1alpha2 "github.com/Azure/karpenter-provider-azure/pkg/apis/v1alpha2" + awsv1beta1 "github.com/aws/karpenter-provider-aws/pkg/apis/v1beta1" "github.com/azure/kaito/pkg/featuregates" "github.com/azure/kaito/pkg/k8sclient" "github.com/azure/kaito/pkg/nodeclaim" - "github.com/azure/kaito/pkg/utils/consts" - "sigs.k8s.io/controller-runtime/pkg/client" metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" "sigs.k8s.io/karpenter/pkg/apis/v1beta1" @@ -24,7 +24,6 @@ import ( "github.com/azure/kaito/pkg/webhooks" "k8s.io/klog/v2" "knative.dev/pkg/injection/sharedmain" - "knative.dev/pkg/signals" "knative.dev/pkg/webhook" // Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.) 
@@ -59,10 +58,12 @@ var (
 
 func init() {
 	utilruntime.Must(clientgoscheme.AddToScheme(scheme))
-
 	utilruntime.Must(kaitov1alpha1.AddToScheme(scheme))
 	utilruntime.Must(v1alpha5.SchemeBuilder.AddToScheme(scheme))
 	utilruntime.Must(v1beta1.SchemeBuilder.AddToScheme(scheme))
+	utilruntime.Must(azurev1alpha2.SchemeBuilder.AddToScheme(scheme))
+	utilruntime.Must(awsv1beta1.SchemeBuilder.AddToScheme(scheme))
+
 	//+kubebuilder:scaffold:scheme
 	klog.InitFlags(nil)
 }
@@ -89,6 +90,8 @@ func main() {
 
 	ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts)))
 
+	ctx := withShutdownSignal(context.Background())
+
 	mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{
 		Scheme: scheme,
 		Metrics: metricsserver.Options{
@@ -117,8 +120,12 @@ func main() {
 	k8sclient.SetGlobalClient(mgr.GetClient())
 	kClient := k8sclient.GetGlobalClient()
 
-	workspaceReconciler := controllers.NewWorkspaceReconciler(k8sclient.GetGlobalClient(),
-		mgr.GetScheme(), log.Log.WithName("controllers").WithName("Workspace"), mgr.GetEventRecorderFor("KAITO-Workspace-controller"))
+	workspaceReconciler := controllers.NewWorkspaceReconciler(
+		kClient,
+		mgr.GetScheme(),
+		log.Log.WithName("controllers").WithName("Workspace"),
+		mgr.GetEventRecorderFor("KAITO-Workspace-controller"),
+	)
 
 	if err = workspaceReconciler.SetupWithManager(mgr); err != nil {
 		klog.ErrorS(err, "unable to create controller", "controller", "Workspace")
@@ -142,7 +149,7 @@ func main() {
 			klog.ErrorS(err, "unable to parse the webhook port number")
 			exitWithErrorFunc()
 		}
-		ctx := webhook.WithOptions(signals.NewContext(), webhook.Options{
+		ctx := webhook.WithOptions(ctx, webhook.Options{
 			ServiceName: os.Getenv(WebhookServiceName),
 			Port:        p,
 			SecretName:  "workspace-webhook-cert",
@@ -154,31 +161,22 @@ func main() {
 		// wait 2 seconds to allow reconciling webhookconfiguration and service endpoint.
 		time.Sleep(2 * time.Second)
 
-		if err = featuregates.ParseAndValidateFeatureGates(featureGates); err != nil {
+		if err := featuregates.ParseAndValidateFeatureGates(featureGates); err != nil {
 			klog.ErrorS(err, "unable to set `feature-gates` flag")
 			exitWithErrorFunc()
 		}
 	}
 
+	err = nodeclaim.CheckNodeClass(ctx, kClient)
+	if err != nil {
+		exitWithErrorFunc()
+	}
+
 	klog.InfoS("starting manager")
 	if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil {
 		klog.ErrorS(err, "problem running manager")
 		exitWithErrorFunc()
 	}
-	ctx := withShutdownSignal(context.Background())
-
-	// check if Karpenter NodeClass is available. If not, the controller will create it automatically.
-	if featuregates.FeatureGates[consts.FeatureFlagKarpenter] {
-		cloud := GetCloudProviderName()
-		if !nodeclaim.IsNodeClassAvailable(ctx, cloud, kClient) {
-			klog.Infof("NodeClass is not available, creating NodeClass")
-			if err := nodeclaim.CreateKarpenterNodeClass(ctx, kClient); err != nil {
-				if client.IgnoreAlreadyExists(err) != nil {
-					exitWithErrorFunc()
-				}
-			}
-		}
-	}
 }
 
 // withShutdownSignal returns a copy of the parent context that will close if
@@ -196,13 +194,3 @@ func withShutdownSignal(ctx context.Context) context.Context {
 	}()
 	return nctx
 }
-
-// GetCloudProviderName returns the cloud provider name from the environment variable.
-// If the environment variable is not set, the controller will exit with an error.
-func GetCloudProviderName() string {
-	cloudProvider := os.Getenv("CLOUD_PROVIDER")
-	if cloudProvider == "" {
-		exitWithErrorFunc()
-	}
-	return cloudProvider
-}
diff --git a/hack/deploy/generate-identities.sh b/hack/deploy/generate-identities.sh
index 98d1aa20f..5720eff97 100755
--- a/hack/deploy/generate-identities.sh
+++ b/hack/deploy/generate-identities.sh
@@ -37,10 +37,10 @@ echo "IDENTITY_JSON: $IDENTITY_JSON"
 
 IDENTITY_PRINCIPAL_ID=$(jq -r '.principalId' <<< "$IDENTITY_JSON")
 
-AZURE_RESOURCE_GROUP_RESOURCE_ID="/subscriptions/$AZURE_SUBSCRIPTION_ID/resourceGroups/$AZURE_RESOURCE_GROUP"
+AZURE_RESOURCE_GROUP_RESOURCE_ID=$(az group show --name "${AZURE_RESOURCE_GROUP}" --query "id" -otsv)
 
 AZURE_RESOURCE_GROUP_MC=$(jq -r ".nodeResourceGroup" <<< "$AKS_JSON")
-AZURE_RESOURCE_GROUP_MC_RESOURCE_ID="/subscriptions/$AZURE_SUBSCRIPTION_ID/resourceGroups/$AZURE_RESOURCE_GROUP_MC"
+AZURE_RESOURCE_GROUP_MC_RESOURCE_ID=$(az group show --name "${AZURE_RESOURCE_GROUP_MC}" --query "id" -otsv)
 
 sleep 40 ## wait for the identity credential to be created
@@ -53,13 +53,13 @@ az identity federated-credential create --name "${FED_NAME}" \
 
 if [[ "${COMPONENT_NAME}" == "azkarpenter" ]]; then
   echo "Creating role assignments for $COMPONENT_NAME ..."
-  for role in "Virtual Machine Contributor" "Network Contributor" "Managed Identity Operator" "Contributor"; do
+  for role in "Virtual Machine Contributor" "Network Contributor" "Managed Identity Operator"; do
     az role assignment create --assignee "$IDENTITY_PRINCIPAL_ID" \
       --scope "$AZURE_RESOURCE_GROUP_MC_RESOURCE_ID" \
       --role "$role"
-    az role assignment create --assignee "$IDENTITY_PRINCIPAL_ID" \
-      --scope "$AZURE_RESOURCE_GROUP_RESOURCE_ID" \
-      --role "$role"
+#    az role assignment create --assignee "$IDENTITY_PRINCIPAL_ID" \
+#      --scope "$AZURE_RESOURCE_GROUP_RESOURCE_ID" \
+#      --role "$role"
   done
 else
   echo "Creating role assignments for $COMPONENT_NAME ..."
diff --git a/pkg/nodeclaim/nodeclaim.go b/pkg/nodeclaim/nodeclaim.go
index 19a00c58e..f751ce306 100644
--- a/pkg/nodeclaim/nodeclaim.go
+++ b/pkg/nodeclaim/nodeclaim.go
@@ -7,6 +7,7 @@ import (
 	"context"
 	"crypto/sha256"
 	"encoding/hex"
+	"errors"
 	"fmt"
 	"os"
 	"time"
@@ -14,7 +15,6 @@ import (
 	azurev1alpha2 "github.com/Azure/karpenter-provider-azure/pkg/apis/v1alpha2"
 	awsv1beta1 "github.com/aws/karpenter-provider-aws/pkg/apis/v1beta1"
 	kaitov1alpha1 "github.com/azure/kaito/api/v1alpha1"
-	"github.com/azure/kaito/pkg/featuregates"
 	"github.com/azure/kaito/pkg/utils/consts"
 	"github.com/samber/lo"
 	v1 "k8s.io/api/core/v1"
@@ -190,13 +190,12 @@ func CreateNodeClaim(ctx context.Context, nodeClaimObj *v1beta1.NodeClaim, kubeC
 	return retry.OnError(retry.DefaultBackoff, func(err error) bool {
 		return err.Error() != ErrorInstanceTypesUnavailable
 	}, func() error {
-		if featuregates.FeatureGates[consts.FeatureFlagKarpenter] {
-			err := CreateKarpenterNodeClass(ctx, kubeClient)
-			if err != nil {
-				return err
-			}
+		err := CheckNodeClass(ctx, kubeClient)
+		if err != nil {
+			return err
 		}
-		err := kubeClient.Create(ctx, nodeClaimObj, &client.CreateOptions{})
+
+		err = kubeClient.Create(ctx, nodeClaimObj, &client.CreateOptions{})
 		if err != nil {
 			return err
 		}
@@ -221,6 +220,7 @@ func CreateNodeClaim(ctx context.Context, nodeClaimObj *v1beta1.NodeClaim, kubeC
 
 // CreateKarpenterNodeClass creates a nodeClass object for Karpenter.
 func CreateKarpenterNodeClass(ctx context.Context, kubeClient client.Client) error {
 	cloudName := os.Getenv("CLOUD_PROVIDER")
+	klog.InfoS("CreateKarpenterNodeClass", "cloudName", cloudName)
 
 	if cloudName == consts.AzureCloudName {
 		nodeClassObj := GenerateAKSNodeClassManifest(ctx)
@@ -334,3 +334,21 @@ func IsNodeClassAvailable(ctx context.Context, cloudName string, kubeClient clie
 	klog.Error("unsupported cloud provider ", cloudName)
 	return false
 }
+
+// CheckNodeClass checks if Karpenter NodeClass is available. If not, the controller will create it automatically.
+// This is only applicable when Karpenter feature flag is enabled.
+func CheckNodeClass(ctx context.Context, kClient client.Client) error {
+	cloudProvider := os.Getenv("CLOUD_PROVIDER")
+	if cloudProvider == "" {
+		return errors.New("CLOUD_PROVIDER environment variable cannot be empty")
+	}
+	if !IsNodeClassAvailable(ctx, cloudProvider, kClient) {
+		klog.Infof("NodeClass is not available, creating NodeClass")
+		if err := CreateKarpenterNodeClass(ctx, kClient);
+			err != nil && client.IgnoreAlreadyExists(err) != nil {
+			klog.ErrorS(err, "unable to create NodeClass")
+			return errors.New("error while creating NodeClass")
+		}
+	}
+	return nil
+}
diff --git a/pkg/utils/common.go b/pkg/utils/common.go
index 120619d09..94209bf83 100644
--- a/pkg/utils/common.go
+++ b/pkg/utils/common.go
@@ -6,17 +6,18 @@ package utils
 import (
 	"context"
 	"fmt"
+	"io/ioutil"
+	"os"
+	"strings"
+
 	"github.com/azure/kaito/pkg/sku"
 	"github.com/azure/kaito/pkg/utils/consts"
 	"gopkg.in/yaml.v2"
-	"io/ioutil"
 	v1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/fields"
 	"k8s.io/apimachinery/pkg/runtime"
 	"knative.dev/pkg/apis"
-	"os"
 	"sigs.k8s.io/controller-runtime/pkg/client"
-	"strings"
 )
 
 func Contains(s []string, e string) bool {
diff --git a/test/e2e/e2e_test.go b/test/e2e/e2e_test.go
index acf518e90..cff242bb1 100644
--- a/test/e2e/e2e_test.go
+++ b/test/e2e/e2e_test.go
@@ -21,28 +21,51 @@ import (
 
 var (
 	ctx           = context.Background()
-	namespaceName = fmt.Sprint(E2eNamespace, rand.Intn(100))
+	namespaceName = fmt.Sprint(utils.E2eNamespace, rand.Intn(100))
+	suiteTestName = os.Getenv("TEST_SUITE")
 )
 
 var _ = SynchronizedBeforeSuite(func() []byte {
-	GetClusterClient(TestingCluster)
-	gpuNamespace := os.Getenv("GPU_NAMESPACE")
+	utils.GetClusterClient(utils.TestingCluster)
 	kaitoNamespace := os.Getenv("KAITO_NAMESPACE")
 
-	//check gpu-provisioner deployment is up and running
-	gpuProvisionerDeployment := &v1.Deployment{
-		ObjectMeta: metav1.ObjectMeta{
-			Name:      "gpu-provisioner",
-			Namespace: gpuNamespace,
-		},
+	if suiteTestName == "azkarpenter" {
+		karpenterNamespace := os.Getenv("KARPENTER_NAMESPACE")
+		//check karpenter deployment is up and running
+		karpenterDeployment := &v1.Deployment{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:      "karpenter",
+				Namespace: karpenterNamespace,
+			},
+		}
+
+		Eventually(func() error {
+			return utils.TestingCluster.KubeClient.Get(ctx, client.ObjectKey{
+				Namespace: karpenterDeployment.Namespace,
+				Name:      karpenterDeployment.Name,
+			}, karpenterDeployment, &client.GetOptions{})
+		}, utils.PollTimeout, utils.PollInterval).
+ Should(Succeed(), "Failed to wait for karpenter deployment") } - Eventually(func() error { - return TestingCluster.KubeClient.Get(ctx, client.ObjectKey{ - Namespace: gpuProvisionerDeployment.Namespace, - Name: gpuProvisionerDeployment.Name, - }, gpuProvisionerDeployment, &client.GetOptions{}) - }, utils.PollTimeout, utils.PollInterval).Should(Succeed(), "Failed to wait for gpu-provisioner deployment") + if suiteTestName == "gpuprovisioner" { + gpuNamespace := os.Getenv("GPU_PROVISIONER_NAMESPACE") + //check gpu-provisioner deployment is up and running + gpuProvisionerDeployment := &v1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "gpu-provisioner", + Namespace: gpuNamespace, + }, + } + + Eventually(func() error { + return utils.TestingCluster.KubeClient.Get(ctx, client.ObjectKey{ + Namespace: gpuProvisionerDeployment.Namespace, + Name: gpuProvisionerDeployment.Name, + }, gpuProvisionerDeployment, &client.GetOptions{}) + }, utils.PollTimeout, utils.PollInterval). + Should(Succeed(), "Failed to wait for gpu-provisioner deployment") + } //check kaito-workspace deployment is up and running kaitoWorkspaceDeployment := &v1.Deployment{ @@ -53,14 +76,14 @@ var _ = SynchronizedBeforeSuite(func() []byte { } Eventually(func() error { - return TestingCluster.KubeClient.Get(ctx, client.ObjectKey{ + return utils.TestingCluster.KubeClient.Get(ctx, client.ObjectKey{ Namespace: kaitoWorkspaceDeployment.Namespace, Name: kaitoWorkspaceDeployment.Name, }, kaitoWorkspaceDeployment, &client.GetOptions{}) }, utils.PollTimeout, utils.PollInterval).Should(Succeed(), "Failed to wait for kaito-workspace deployment") // create testing namespace - err := TestingCluster.KubeClient.Create(context.TODO(), &corev1.Namespace{ + err := utils.TestingCluster.KubeClient.Create(context.TODO(), &corev1.Namespace{ ObjectMeta: metav1.ObjectMeta{ Name: namespaceName, }, @@ -73,7 +96,7 @@ var _ = SynchronizedBeforeSuite(func() []byte { var _ = SynchronizedAfterSuite(func() { // delete testing namespace Eventually(func() error { - return TestingCluster.KubeClient.Delete(ctx, &corev1.Namespace{ + return utils.TestingCluster.KubeClient.Delete(ctx, &corev1.Namespace{ ObjectMeta: metav1.ObjectMeta{ Name: namespaceName, }, diff --git a/test/e2e/inference_with_adapters.go b/test/e2e/inference_with_adapters_test.go similarity index 92% rename from test/e2e/inference_with_adapters.go rename to test/e2e/inference_with_adapters_test.go index 38e8f2094..bb620e530 100644 --- a/test/e2e/inference_with_adapters.go +++ b/test/e2e/inference_with_adapters_test.go @@ -50,7 +50,7 @@ func validateAdapters(workspaceObj *kaitov1alpha1.Workspace, expectedInitContain Namespace: workspaceObj.Namespace, }, } - err = TestingCluster.KubeClient.Get(ctx, client.ObjectKey{ + err = utils.TestingCluster.KubeClient.Get(ctx, client.ObjectKey{ Namespace: workspaceObj.Namespace, Name: workspaceObj.Name, }, dep) @@ -95,7 +95,11 @@ var _ = Describe("Workspace Preset", func() { defer cleanupResources(workspaceObj) time.Sleep(30 * time.Second) - validateMachineCreation(workspaceObj, numOfNode) + if suiteTestName == "azkarpenter" { + utils.ValidateNodeClaimCreation(ctx, workspaceObj, numOfNode) + } else { + utils.ValidateMachineCreation(ctx, workspaceObj, numOfNode) + } validateResourceStatus(workspaceObj) time.Sleep(30 * time.Second) diff --git a/test/e2e/preset_test.go b/test/e2e/preset_test.go index 58eecba59..a85ab56a9 100644 --- a/test/e2e/preset_test.go +++ b/test/e2e/preset_test.go @@ -12,7 +12,6 @@ import ( "strings" "time" - 
"github.com/aws/karpenter-core/pkg/apis/v1alpha5" kaitov1alpha1 "github.com/azure/kaito/api/v1alpha1" "github.com/azure/kaito/test/e2e/utils" . "github.com/onsi/ginkgo/v2" @@ -23,8 +22,6 @@ import ( v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/labels" - "knative.dev/pkg/apis" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -33,10 +30,8 @@ const ( PresetLlama2BChat = "llama-2-13b-chat" PresetFalcon7BModel = "falcon-7b" PresetFalcon40BModel = "falcon-40b" - PresetMistral7BModel = "mistral-7b" PresetMistral7BInstructModel = "mistral-7b-instruct" PresetPhi2Model = "phi-2" - PresetPhi3Mini4kModel = "phi-3-mini-4k-instruct" PresetPhi3Mini128kModel = "phi-3-mini-128k-instruct" ) @@ -202,12 +197,12 @@ func createCustomTuningConfigMapForE2E() *v1.ConfigMap { func createAndValidateConfigMap(configMap *v1.ConfigMap) { By("Creating ConfigMap", func() { Eventually(func() error { - return TestingCluster.KubeClient.Create(ctx, configMap, &client.CreateOptions{}) + return utils.TestingCluster.KubeClient.Create(ctx, configMap, &client.CreateOptions{}) }, utils.PollTimeout, utils.PollInterval). Should(Succeed(), "Failed to create ConfigMap %s", configMap.Name) By("Validating ConfigMap creation", func() { - err := TestingCluster.KubeClient.Get(ctx, client.ObjectKey{ + err := utils.TestingCluster.KubeClient.Get(ctx, client.ObjectKey{ Namespace: configMap.Namespace, Name: configMap.Name, }, configMap, &client.GetOptions{}) @@ -237,12 +232,12 @@ func createPhi3TuningWorkspaceWithPresetPublicMode(configMapName string, numOfNo func createAndValidateWorkspace(workspaceObj *kaitov1alpha1.Workspace) { By("Creating workspace", func() { Eventually(func() error { - return TestingCluster.KubeClient.Create(ctx, workspaceObj, &client.CreateOptions{}) + return utils.TestingCluster.KubeClient.Create(ctx, workspaceObj, &client.CreateOptions{}) }, utils.PollTimeout, utils.PollInterval). 
Should(Succeed(), "Failed to create workspace %s", workspaceObj.Name) By("Validating workspace creation", func() { - err := TestingCluster.KubeClient.Get(ctx, client.ObjectKey{ + err := utils.TestingCluster.KubeClient.Get(ctx, client.ObjectKey{ Namespace: workspaceObj.Namespace, Name: workspaceObj.Name, }, workspaceObj, &client.GetOptions{}) @@ -256,7 +251,7 @@ func copySecretToNamespace(secretName, targetNamespace string) error { originalSecret := &v1.Secret{} // Fetch the original secret from the default namespace - err := TestingCluster.KubeClient.Get(ctx, client.ObjectKey{ + err := utils.TestingCluster.KubeClient.Get(ctx, client.ObjectKey{ Namespace: originalNamespace, Name: secretName, }, originalSecret) @@ -268,7 +263,7 @@ func copySecretToNamespace(secretName, targetNamespace string) error { newSecret := utils.CopySecret(originalSecret, targetNamespace) // Create the new secret in the target namespace - err = TestingCluster.KubeClient.Create(ctx, newSecret) + err = utils.TestingCluster.KubeClient.Create(ctx, newSecret) if err != nil { return fmt.Errorf("failed to create secret %s in namespace %s: %v", secretName, targetNamespace, err) } @@ -276,53 +271,11 @@ func copySecretToNamespace(secretName, targetNamespace string) error { return nil } -func getAllValidMachines(workspaceObj *kaitov1alpha1.Workspace) (*v1alpha5.MachineList, error) { - machineList := &v1alpha5.MachineList{} - ls := labels.Set{ - kaitov1alpha1.LabelWorkspaceName: workspaceObj.Name, - kaitov1alpha1.LabelWorkspaceNamespace: workspaceObj.Namespace, - } - - err := TestingCluster.KubeClient.List(ctx, machineList, &client.MatchingLabelsSelector{Selector: ls.AsSelector()}) - if err != nil { - return nil, err - } - return machineList, nil -} - -// Logic to validate machine creation -func validateMachineCreation(workspaceObj *kaitov1alpha1.Workspace, expectedCount int) { - By("Checking machine created by the workspace CR", func() { - Eventually(func() bool { - machineList, err := getAllValidMachines(workspaceObj) - if err != nil { - fmt.Printf("Failed to get all valid machines: %v\n", err) - return false - } - - if len(machineList.Items) != expectedCount { - fmt.Printf("Expected %d machines, but found %d machines\n", expectedCount, len(machineList.Items)) - return false - } - - for _, machine := range machineList.Items { - _, conditionFound := lo.Find(machine.GetConditions(), func(condition apis.Condition) bool { - return condition.Type == apis.ConditionReady && condition.Status == v1.ConditionTrue - }) - if !conditionFound { - return false - } - } - return true - }, 20*time.Minute, utils.PollInterval).Should(BeTrue(), "Failed to wait for machine to be ready") - }) -} - // Logic to validate resource status func validateResourceStatus(workspaceObj *kaitov1alpha1.Workspace) { By("Checking the resource status", func() { Eventually(func() bool { - err := TestingCluster.KubeClient.Get(ctx, client.ObjectKey{ + err := utils.TestingCluster.KubeClient.Get(ctx, client.ObjectKey{ Namespace: workspaceObj.Namespace, Name: workspaceObj.Name, }, workspaceObj, &client.GetOptions{}) @@ -348,7 +301,7 @@ func validateAssociatedService(workspaceObj *kaitov1alpha1.Workspace) { service := &v1.Service{} Eventually(func() bool { - err := TestingCluster.KubeClient.Get(ctx, client.ObjectKey{ + err := utils.TestingCluster.KubeClient.Get(ctx, client.ObjectKey{ Namespace: serviceNamespace, Name: serviceName, }, service) @@ -382,7 +335,7 @@ func validateInferenceResource(workspaceObj *kaitov1alpha1.Workspace, expectedRe Namespace: 
workspaceObj.Namespace, }, } - err = TestingCluster.KubeClient.Get(ctx, client.ObjectKey{ + err = utils.TestingCluster.KubeClient.Get(ctx, client.ObjectKey{ Namespace: workspaceObj.Namespace, Name: workspaceObj.Name, }, sts) @@ -395,7 +348,7 @@ func validateInferenceResource(workspaceObj *kaitov1alpha1.Workspace, expectedRe Namespace: workspaceObj.Namespace, }, } - err = TestingCluster.KubeClient.Get(ctx, client.ObjectKey{ + err = utils.TestingCluster.KubeClient.Get(ctx, client.ObjectKey{ Namespace: workspaceObj.Namespace, Name: workspaceObj.Name, }, dep) @@ -430,7 +383,7 @@ func validateTuningResource(workspaceObj *kaitov1alpha1.Workspace) { Namespace: workspaceObj.Namespace, }, } - err = TestingCluster.KubeClient.Get(ctx, client.ObjectKey{ + err = utils.TestingCluster.KubeClient.Get(ctx, client.ObjectKey{ Namespace: workspaceObj.Namespace, Name: workspaceObj.Name, }, job) @@ -489,7 +442,7 @@ func validateACRTuningResultsUploaded(workspaceObj *kaitov1alpha1.Workspace, job func validateWorkspaceReadiness(workspaceObj *kaitov1alpha1.Workspace) { By("Checking the workspace status is ready", func() { Eventually(func() bool { - err := TestingCluster.KubeClient.Get(ctx, client.ObjectKey{ + err := utils.TestingCluster.KubeClient.Get(ctx, client.ObjectKey{ Namespace: workspaceObj.Namespace, Name: workspaceObj.Name, }, workspaceObj, &client.GetOptions{}) @@ -519,7 +472,7 @@ func deleteWorkspace(workspaceObj *kaitov1alpha1.Workspace) error { By("Deleting workspace", func() { Eventually(func() error { // Check if the workspace exists - err := TestingCluster.KubeClient.Get(ctx, client.ObjectKey{ + err := utils.TestingCluster.KubeClient.Get(ctx, client.ObjectKey{ Namespace: workspaceObj.Namespace, Name: workspaceObj.Name, }, workspaceObj) @@ -532,7 +485,7 @@ func deleteWorkspace(workspaceObj *kaitov1alpha1.Workspace) error { return fmt.Errorf("error checking if workspace %s exists: %v", workspaceObj.Name, err) } - err = TestingCluster.KubeClient.Delete(ctx, workspaceObj, &client.DeleteOptions{}) + err = utils.TestingCluster.KubeClient.Delete(ctx, workspaceObj, &client.DeleteOptions{}) if err != nil { return fmt.Errorf("failed to delete workspace %s: %v", workspaceObj.Name, err) } @@ -554,7 +507,6 @@ var azureClusterName string var _ = Describe("Workspace Preset", func() { BeforeEach(func() { loadTestEnvVars() - loadModelVersions() }) @@ -574,7 +526,11 @@ var _ = Describe("Workspace Preset", func() { defer cleanupResources(workspaceObj) time.Sleep(30 * time.Second) - validateMachineCreation(workspaceObj, numOfNode) + if suiteTestName == "azkarpenter" { + utils.ValidateNodeClaimCreation(ctx, workspaceObj, numOfNode) + } else { + utils.ValidateMachineCreation(ctx, workspaceObj, numOfNode) + } validateResourceStatus(workspaceObj) time.Sleep(30 * time.Second) @@ -593,7 +549,12 @@ var _ = Describe("Workspace Preset", func() { defer cleanupResources(workspaceObj) time.Sleep(30 * time.Second) - validateMachineCreation(workspaceObj, numOfNode) + if suiteTestName == "azkarpenter" { + utils.ValidateNodeClaimCreation(ctx, workspaceObj, numOfNode) + } else { + utils.ValidateMachineCreation(ctx, workspaceObj, numOfNode) + } + validateResourceStatus(workspaceObj) time.Sleep(30 * time.Second) @@ -612,7 +573,12 @@ var _ = Describe("Workspace Preset", func() { defer cleanupResources(workspaceObj) time.Sleep(30 * time.Second) - validateMachineCreation(workspaceObj, numOfNode) + if suiteTestName == "azkarpenter" { + utils.ValidateNodeClaimCreation(ctx, workspaceObj, numOfNode) + } else { + 
utils.ValidateMachineCreation(ctx, workspaceObj, numOfNode) + } + validateResourceStatus(workspaceObj) time.Sleep(30 * time.Second) @@ -635,7 +601,12 @@ var _ = Describe("Workspace Preset", func() { defer cleanupResources(workspaceObj) time.Sleep(30 * time.Second) - validateMachineCreation(workspaceObj, numOfNode) + if suiteTestName == "azkarpenter" { + utils.ValidateNodeClaimCreation(ctx, workspaceObj, numOfNode) + } else { + utils.ValidateMachineCreation(ctx, workspaceObj, numOfNode) + } + validateResourceStatus(workspaceObj) time.Sleep(30 * time.Second) @@ -661,7 +632,13 @@ var _ = Describe("Workspace Preset", func() { defer cleanupResources(workspaceObj) time.Sleep(30 * time.Second) - validateMachineCreation(workspaceObj, numOfNode) + + if suiteTestName == "azkarpenter" { + utils.ValidateNodeClaimCreation(ctx, workspaceObj, numOfNode) + } else { + utils.ValidateMachineCreation(ctx, workspaceObj, numOfNode) + } + validateResourceStatus(workspaceObj) time.Sleep(30 * time.Second) @@ -681,7 +658,13 @@ var _ = Describe("Workspace Preset", func() { defer cleanupResources(workspaceObj) time.Sleep(30 * time.Second) - validateMachineCreation(workspaceObj, numOfNode) + + if suiteTestName == "azkarpenter" { + utils.ValidateNodeClaimCreation(ctx, workspaceObj, numOfNode) + } else { + utils.ValidateMachineCreation(ctx, workspaceObj, numOfNode) + } + validateResourceStatus(workspaceObj) time.Sleep(30 * time.Second) @@ -698,7 +681,11 @@ var _ = Describe("Workspace Preset", func() { defer cleanupResources(workspaceObj) time.Sleep(30 * time.Second) - validateMachineCreation(workspaceObj, numOfNode) + if suiteTestName == "azkarpenter" { + utils.ValidateNodeClaimCreation(ctx, workspaceObj, numOfNode) + } else { + utils.ValidateMachineCreation(ctx, workspaceObj, numOfNode) + } validateResourceStatus(workspaceObj) time.Sleep(30 * time.Second) @@ -722,7 +709,11 @@ var _ = Describe("Workspace Preset", func() { defer cleanupResources(workspaceObj) time.Sleep(30 * time.Second) - validateMachineCreation(workspaceObj, numOfNode) + if suiteTestName == "azkarpenter" { + utils.ValidateNodeClaimCreation(ctx, workspaceObj, numOfNode) + } else { + utils.ValidateMachineCreation(ctx, workspaceObj, numOfNode) + } validateResourceStatus(workspaceObj) time.Sleep(30 * time.Second) diff --git a/test/e2e/cluster.go b/test/e2e/utils/cluster.go similarity index 80% rename from test/e2e/cluster.go rename to test/e2e/utils/cluster.go index 7c3d0a2b4..e483da756 100644 --- a/test/e2e/cluster.go +++ b/test/e2e/utils/cluster.go @@ -1,10 +1,12 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. 
-package e2e
+package utils
 
 import (
+	azurev1alpha2 "github.com/Azure/karpenter-provider-azure/pkg/apis/v1alpha2"
 	"github.com/aws/karpenter-core/pkg/apis/v1alpha5"
+	awsv1beta1 "github.com/aws/karpenter-provider-aws/pkg/apis/v1beta1"
 	kaitov1alpha1 "github.com/azure/kaito/api/v1alpha1"
 	"github.com/onsi/gomega"
 	"k8s.io/apimachinery/pkg/runtime"
@@ -14,6 +16,7 @@ import (
 	"k8s.io/kubernetes/test/e2e/framework"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 	"sigs.k8s.io/controller-runtime/pkg/client/config"
+	"sigs.k8s.io/karpenter/pkg/apis/v1beta1"
 )
 
 const (
@@ -43,6 +46,9 @@ func GetClusterClient(cluster *Cluster) {
 	utilruntime.Must(clientgoscheme.AddToScheme(scheme))
 	utilruntime.Must(kaitov1alpha1.AddToScheme(scheme))
 	utilruntime.Must(v1alpha5.SchemeBuilder.AddToScheme(scheme))
+	utilruntime.Must(v1beta1.SchemeBuilder.AddToScheme(scheme))
+	utilruntime.Must(azurev1alpha2.SchemeBuilder.AddToScheme(scheme))
+	utilruntime.Must(awsv1beta1.SchemeBuilder.AddToScheme(scheme))
 
 	restConfig := config.GetConfigOrDie()
diff --git a/test/e2e/utils/machine.go b/test/e2e/utils/machine.go
new file mode 100644
index 000000000..96c908e05
--- /dev/null
+++ b/test/e2e/utils/machine.go
@@ -0,0 +1,58 @@
+package utils
+
+import (
+	"context"
+	"fmt"
+	"time"
+
+	"github.com/aws/karpenter-core/pkg/apis/v1alpha5"
+	"github.com/azure/kaito/api/v1alpha1"
+	"github.com/onsi/ginkgo/v2"
+	"github.com/onsi/gomega"
+	"github.com/samber/lo"
+	"k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/labels"
+	"knative.dev/pkg/apis"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+)
+
+// ValidateMachineCreation Logic to validate machine creation
+func ValidateMachineCreation(ctx context.Context, workspaceObj *v1alpha1.Workspace, expectedCount int) {
+	ginkgo.By("Checking machine created by the workspace CR", func() {
+		gomega.Eventually(func() bool {
+			machineList, err := getAllValidMachines(ctx, workspaceObj)
+			if err != nil {
+				fmt.Printf("Failed to get all valid machines: %v", err)
+				return false
+			}
+
+			if len(machineList.Items) != expectedCount {
+				return false
+			}
+
+			for _, machine := range machineList.Items {
+				_, conditionFound := lo.Find(machine.GetConditions(), func(condition apis.Condition) bool {
+					return condition.Type == apis.ConditionReady && condition.Status == v1.ConditionTrue
+				})
+				if !conditionFound {
+					return false
+				}
+			}
+			return true
+		}, 20*time.Minute, PollInterval).Should(gomega.BeTrue(), "Failed to wait for machine to be ready")
+	})
+}
+
+func getAllValidMachines(ctx context.Context, workspaceObj *v1alpha1.Workspace) (*v1alpha5.MachineList, error) {
+	machineList := &v1alpha5.MachineList{}
+	ls := labels.Set{
+		v1alpha1.LabelWorkspaceName:      workspaceObj.Name,
+		v1alpha1.LabelWorkspaceNamespace: workspaceObj.Namespace,
+	}
+
+	err := TestingCluster.KubeClient.List(ctx, machineList, &client.MatchingLabelsSelector{Selector: ls.AsSelector()})
+	if err != nil {
+		return nil, err
+	}
+	return machineList, nil
+}
diff --git a/test/e2e/utils/nodeclaim.go b/test/e2e/utils/nodeclaim.go
new file mode 100644
index 000000000..8bd6beb3b
--- /dev/null
+++ b/test/e2e/utils/nodeclaim.go
@@ -0,0 +1,59 @@
+package utils
+
+import (
+	"context"
+	"fmt"
+	"time"
+
+	"github.com/azure/kaito/api/v1alpha1"
+	"github.com/onsi/ginkgo/v2"
+	"github.com/onsi/gomega"
+	"github.com/samber/lo"
+	"k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/labels"
+	"knative.dev/pkg/apis"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/karpenter/pkg/apis/v1beta1"
+)
+
+// ValidateNodeClaimCreation Logic to validate the nodeClaim creation.
+func ValidateNodeClaimCreation(ctx context.Context, workspaceObj *v1alpha1.Workspace, expectedCount int) {
+	ginkgo.By("Checking nodeClaim created by the workspace CR", func() {
+		gomega.Eventually(func() bool {
+			nodeClaimList, err := GetAllValidNodeClaims(ctx, workspaceObj)
+			if err != nil {
+				fmt.Printf("Failed to get all valid nodeClaim: %v", err)
+				return false
+			}
+
+			if len(nodeClaimList.Items) != expectedCount {
+				return false
+			}
+
+			for _, nodeClaim := range nodeClaimList.Items {
+				_, conditionFound := lo.Find(nodeClaim.GetConditions(), func(condition apis.Condition) bool {
+					return condition.Type == apis.ConditionReady && condition.Status == v1.ConditionTrue
+				})
+				if !conditionFound {
+					return false
+				}
+			}
+			return true
+		}, 20*time.Minute, PollInterval).Should(gomega.BeTrue(), "Failed to wait for nodeClaim to be ready")
+	})
+}
+
+// GetAllValidNodeClaims get all valid nodeClaims.
+func GetAllValidNodeClaims(ctx context.Context, workspaceObj *v1alpha1.Workspace) (*v1beta1.NodeClaimList, error) {
+	nodeClaimList := &v1beta1.NodeClaimList{}
+	ls := labels.Set{
+		v1alpha1.LabelWorkspaceName:      workspaceObj.Name,
+		v1alpha1.LabelWorkspaceNamespace: workspaceObj.Namespace,
+	}
+
+	err := TestingCluster.KubeClient.List(ctx, nodeClaimList, &client.MatchingLabelsSelector{Selector: ls.AsSelector()})
+	if err != nil {
+		return nil, err
+	}
+	return nodeClaimList, nil
+}
diff --git a/test/e2e/webhook_test.go b/test/e2e/webhook_test.go
index 49e13c2a3..e3b05b8b5 100644
--- a/test/e2e/webhook_test.go
+++ b/test/e2e/webhook_test.go
@@ -38,7 +38,7 @@ var _ = Describe("Workspace Validation Webhook", func() {
 		By("Creating a workspace with invalid instancetype", func() {
 			// Create workspace
 			Eventually(func() error {
-				return TestingCluster.KubeClient.Create(ctx, workspaceObj, &client.CreateOptions{})
+				return utils.TestingCluster.KubeClient.Create(ctx, workspaceObj, &client.CreateOptions{})
 			}, 20*time.Minute, utils.PollInterval).
Should(HaveOccurred(), "Failed to create workspace %s", workspaceObj.Name) }) @@ -98,7 +98,7 @@ var _ = Describe("Workspace Validation Webhook", func() { By("Creating a workspace with nil preset", func() { // Create workspace Eventually(func() error { - return TestingCluster.KubeClient.Create(ctx, workspaceObj, &client.CreateOptions{}) + return utils.TestingCluster.KubeClient.Create(ctx, workspaceObj, &client.CreateOptions{}) }, 20*time.Minute, utils.PollInterval). Should(HaveOccurred(), "Failed to create workspace %s", workspaceObj.Name) }) @@ -116,7 +116,7 @@ var _ = Describe("Workspace Validation Webhook", func() { By("Creating a valid workspace", func() { // Create workspace Eventually(func() error { - return TestingCluster.KubeClient.Create(ctx, workspaceObj, &client.CreateOptions{}) + return utils.TestingCluster.KubeClient.Create(ctx, workspaceObj, &client.CreateOptions{}) }, 20*time.Minute, utils.PollInterval). Should(Succeed(), "Failed to create workspace %s", workspaceObj.Name) }) @@ -126,7 +126,7 @@ var _ = Describe("Workspace Validation Webhook", func() { updatedObj.Resource.LabelSelector = &metav1.LabelSelector{} // update workspace Eventually(func() error { - return TestingCluster.KubeClient.Update(ctx, updatedObj, &client.UpdateOptions{}) + return utils.TestingCluster.KubeClient.Update(ctx, updatedObj, &client.UpdateOptions{}) }, utils.PollTimeout, utils.PollInterval). Should(HaveOccurred(), "Failed to update workspace %s", updatedObj.Name) }) @@ -136,7 +136,7 @@ var _ = Describe("Workspace Validation Webhook", func() { updatedObj.Resource.InstanceType = "Standard_NC12" // update workspace Eventually(func() error { - return TestingCluster.KubeClient.Update(ctx, updatedObj, &client.UpdateOptions{}) + return utils.TestingCluster.KubeClient.Update(ctx, updatedObj, &client.UpdateOptions{}) }, utils.PollTimeout, utils.PollInterval). Should(HaveOccurred(), "Failed to update workspace %s", updatedObj.Name) }) @@ -145,7 +145,7 @@ var _ = Describe("Workspace Validation Webhook", func() { // delete workspace Eventually(func() error { - return TestingCluster.KubeClient.Delete(ctx, workspaceObj, &client.DeleteOptions{}) + return utils.TestingCluster.KubeClient.Delete(ctx, workspaceObj, &client.DeleteOptions{}) }, utils.PollTimeout, utils.PollInterval).Should(Succeed(), "Failed to delete workspace") }) @@ -159,7 +159,7 @@ var _ = Describe("Workspace Validation Webhook", func() { By("Creating a valid tuning workspace", func() { // Create workspace Eventually(func() error { - return TestingCluster.KubeClient.Create(ctx, workspaceObj, &client.CreateOptions{}) + return utils.TestingCluster.KubeClient.Create(ctx, workspaceObj, &client.CreateOptions{}) }, 20*time.Minute, utils.PollInterval). Should(Succeed(), "Failed to create workspace %s", workspaceObj.Name) }) @@ -169,7 +169,7 @@ var _ = Describe("Workspace Validation Webhook", func() { updatedObj.Tuning.Preset = updatedPresetSpec // update workspace Eventually(func() error { - return TestingCluster.KubeClient.Update(ctx, updatedObj, &client.UpdateOptions{}) + return utils.TestingCluster.KubeClient.Update(ctx, updatedObj, &client.UpdateOptions{}) }, utils.PollTimeout, utils.PollInterval). 
Should(HaveOccurred(), "Failed to update workspace %s", updatedObj.Name) }) @@ -179,14 +179,14 @@ var _ = Describe("Workspace Validation Webhook", func() { updatedObj.Tuning.Method = alternativeTuningMethod // update workspace Eventually(func() error { - return TestingCluster.KubeClient.Update(ctx, updatedObj, &client.UpdateOptions{}) + return utils.TestingCluster.KubeClient.Update(ctx, updatedObj, &client.UpdateOptions{}) }, utils.PollTimeout, utils.PollInterval). Should(HaveOccurred(), "Failed to update workspace %s", updatedObj.Name) }) // delete workspace Eventually(func() error { - return TestingCluster.KubeClient.Delete(ctx, workspaceObj, &client.DeleteOptions{}) + return utils.TestingCluster.KubeClient.Delete(ctx, workspaceObj, &client.DeleteOptions{}) }, utils.PollTimeout, utils.PollInterval).Should(Succeed(), "Failed to delete workspace") }) @@ -200,7 +200,7 @@ var _ = Describe("Workspace Validation Webhook", func() { By("Creating a valid workspace", func() { // Create workspace Eventually(func() error { - return TestingCluster.KubeClient.Create(ctx, workspaceObj, &client.CreateOptions{}) + return utils.TestingCluster.KubeClient.Create(ctx, workspaceObj, &client.CreateOptions{}) }, 20*time.Minute, utils.PollInterval). Should(Succeed(), "Failed to create workspace %s", workspaceObj.Name) }) @@ -210,7 +210,7 @@ var _ = Describe("Workspace Validation Webhook", func() { updatedObj.Inference.Preset.Name = PresetFalcon40BModel // update workspace Eventually(func() error { - return TestingCluster.KubeClient.Update(ctx, updatedObj, &client.UpdateOptions{}) + return utils.TestingCluster.KubeClient.Update(ctx, updatedObj, &client.UpdateOptions{}) }, utils.PollTimeout, utils.PollInterval). Should(HaveOccurred(), "Failed to update workspace %s", updatedObj.Name) }) @@ -219,7 +219,7 @@ var _ = Describe("Workspace Validation Webhook", func() { // delete workspace Eventually(func() error { - return TestingCluster.KubeClient.Delete(ctx, workspaceObj, &client.DeleteOptions{}) + return utils.TestingCluster.KubeClient.Delete(ctx, workspaceObj, &client.DeleteOptions{}) }, utils.PollTimeout, utils.PollInterval).Should(Succeed(), "Failed to delete workspace") })