Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: remove gpu-provisioner chart from kaito repo #372

Merged
merged 2 commits into from
May 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .github/workflows/e2e-workflow.yml
Original file line number Diff line number Diff line change
Expand Up @@ -139,10 +139,13 @@ jobs:
shell: bash
run: |
make gpu-provisioner-helm
kubectl wait --for=condition=available deploy "kaito-gpu-provisioner" -n gpu-provisioner --timeout=300s
kubectl wait --for=condition=available deploy "gpu-provisioner" -n gpu-provisioner --timeout=300s
env:
AZURE_RESOURCE_GROUP: ${{ env.CLUSTER_NAME }}
AZURE_CLUSTER_NAME: ${{ env.CLUSTER_NAME }}
AZURE_TENANT_ID: ${{ secrets.E2E_TENANT_ID }}
AZURE_SUBSCRIPTION_ID: ${{ secrets.E2E_SUBSCRIPTION_ID }}
GPU_PROVISIONER_VERSION: ${{ vars.GPU_PROVISIONER_VERSION }}

- uses: azure/login@8c334a195cbb38e46038007b304988d888bf676a # v2.0.0
with:
Expand Down
39 changes: 15 additions & 24 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
REGISTRY ?= YOUR_REGISTRY
IMG_NAME ?= workspace
VERSION ?= v0.2.2
GPU_PROVISIONER_VERSION ?= 0.2.0
IMG_TAG ?= $(subst v,,$(VERSION))

ROOT_DIR := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
Expand Down Expand Up @@ -30,6 +31,8 @@ AZURE_CLUSTER_NAME ?= kaito-demo
AZURE_RESOURCE_GROUP_MC=MC_$(AZURE_RESOURCE_GROUP)_$(AZURE_CLUSTER_NAME)_$(AZURE_LOCATION)
GPU_NAMESPACE ?= gpu-provisioner
KAITO_NAMESPACE ?= kaito-workspace
GPU_PROVISIONER_MSI_NAME ?= gpuIdentity

RUN_LLAMA_13B ?= false
AI_MODELS_REGISTRY ?= modelregistry.azurecr.io
AI_MODELS_REGISTRY_SECRET ?= modelregistry
Expand Down Expand Up @@ -191,37 +194,25 @@ ifndef ignore-not-found
endif

##@ gpu-provider
.PHONY: gpu-provisioner-identity-perm
gpu-provisioner-identity-perm: ## Create identity for gpu-provisioner
az identity create --name gpuIdentity --resource-group $(AZURE_RESOURCE_GROUP)

IDENTITY_PRINCIPAL_ID=$(shell az identity show --name gpuIdentity --resource-group $(AZURE_RESOURCE_GROUP) --subscription $(AZURE_SUBSCRIPTION_ID) --query 'principalId')
IDENTITY_CLIENT_ID=$(shell az identity show --name gpuIdentity --resource-group $(AZURE_RESOURCE_GROUP) --subscription $(AZURE_SUBSCRIPTION_ID) --query 'clientId')

az role assignment create --assignee $(IDENTITY_PRINCIPAL_ID) --scope /subscriptions/$(AZURE_SUBSCRIPTION_ID)/resourceGroups/$(AZURE_RESOURCE_GROUP) --role "Contributor"
az identity create --name $(GPU_PROVISIONER_MSI_NAME) --resource-group $(AZURE_RESOURCE_GROUP)

AKS_OIDC_ISSUER=$(shell az aks show -n "$(AZURE_CLUSTER_NAME)" -g "$(AZURE_RESOURCE_GROUP)" --subscription $(AZURE_SUBSCRIPTION_ID) --query "oidcIssuerProfile.issuerUrl")
IDENTITY_PRINCIPAL_ID=$(shell az identity show --name $(GPU_PROVISIONER_MSI_NAME) --resource-group $(AZURE_RESOURCE_GROUP) --subscription $(AZURE_SUBSCRIPTION_ID) --query 'principalId');\
az role assignment create --assignee $$IDENTITY_PRINCIPAL_ID --scope /subscriptions/$(AZURE_SUBSCRIPTION_ID)/resourceGroups/$(AZURE_RESOURCE_GROUP) --role "Contributor"

az identity federated-credential create --name gpu-federatecredential --identity-name gpuIdentity --resource-group "$(AZURE_RESOURCE_GROUP)" --issuer "$(AKS_OIDC_ISSUER)" \
--subject system:serviceaccount:"gpu-provisioner:gpu-provisioner" --audience api://AzureADTokenExchange --subscription $(AZURE_SUBSCRIPTION_ID)
AKS_OIDC_ISSUER=$(shell az aks show -n "$(AZURE_CLUSTER_NAME)" -g "$(AZURE_RESOURCE_GROUP)" --subscription $(AZURE_SUBSCRIPTION_ID) --query "oidcIssuerProfile.issuerUrl");\
az identity federated-credential create --name gpu-federatecredential --identity-name $(GPU_PROVISIONER_MSI_NAME) --resource-group "$(AZURE_RESOURCE_GROUP)" --issuer $$AKS_OIDC_ISSUER \
--subject system:serviceaccount:"$(GPU_NAMESPACE):$(GPU_NAMESPACE)" --audience api://AzureADTokenExchange --subscription $(AZURE_SUBSCRIPTION_ID)

.PHONY: gpu-provisioner-helm
gpu-provisioner-helm: ## Update Azure client env vars and settings in helm values.yml
az aks get-credentials --name $(AZURE_CLUSTER_NAME) --resource-group $(AZURE_RESOURCE_GROUP)
$(eval IDENTITY_CLIENT_ID=$(shell az identity show --name gpuIdentity --resource-group $(AZURE_RESOURCE_GROUP) --query 'clientId' -o tsv))
$(eval AZURE_TENANT_ID=$(shell az account show | jq -r ".tenantId"))
$(eval AZURE_SUBSCRIPTION_ID=$(shell az account show | jq -r ".id"))

yq -i '(.controller.env[] | select(.name=="ARM_SUBSCRIPTION_ID")) .value = "$(AZURE_SUBSCRIPTION_ID)"' ./charts/kaito/gpu-provisioner/values.yaml
yq -i '(.controller.env[] | select(.name=="LOCATION")) .value = "$(AZURE_LOCATION)"' ./charts/kaito/gpu-provisioner/values.yaml
yq -i '(.controller.env[] | select(.name=="ARM_RESOURCE_GROUP")) .value = "$(AZURE_RESOURCE_GROUP)"' ./charts/kaito/gpu-provisioner/values.yaml
yq -i '(.controller.env[] | select(.name=="AZURE_NODE_RESOURCE_GROUP")) .value = "$(AZURE_RESOURCE_GROUP_MC)"' ./charts/kaito/gpu-provisioner/values.yaml
yq -i '(.controller.env[] | select(.name=="AZURE_CLUSTER_NAME")) .value = "$(AZURE_CLUSTER_NAME)"' ./charts/kaito/gpu-provisioner/values.yaml
yq -i '(.settings.azure.clusterName) = "$(AZURE_CLUSTER_NAME)"' ./charts/kaito/gpu-provisioner/values.yaml
yq -i '(.workloadIdentity.clientId) = "$(IDENTITY_CLIENT_ID)"' ./charts/kaito/gpu-provisioner/values.yaml
yq -i '(.workloadIdentity.tenantId) = "$(AZURE_TENANT_ID)"' ./charts/kaito/gpu-provisioner/values.yaml

helm install kaito-gpu-provisioner ./charts/kaito/gpu-provisioner --namespace $(GPU_NAMESPACE) --create-namespace

curl -sO https://raw.githubusercontent.com/Azure/gpu-provisioner/main/hack/deploy/configure-helm-values.sh
chmod +x ./configure-helm-values.sh && ./configure-helm-values.sh $(AZURE_CLUSTER_NAME) $(AZURE_RESOURCE_GROUP) $(GPU_PROVISIONER_MSI_NAME)

helm install $(GPU_NAMESPACE) --values gpu-provisioner-values.yaml --set settings.azure.clusterName=$(AZURE_CLUSTER_NAME) --wait \
https://github.com/Azure/gpu-provisioner/raw/gh-pages/charts/gpu-provisioner-$(GPU_PROVISIONER_VERSION).tgz

##@ Build Dependencies

Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@ Kaito follows the classic Kubernetes Custom Resource Definition(CRD)/controller
The above figure presents the Kaito architecture overview. Its major components consist of:

- **Workspace controller**: It reconciles the `workspace` custom resource, creates `machine` (explained below) custom resources to trigger node auto provisioning, and creates the inference workload (`deployment` or `statefulset`) based on the model preset configurations.
- **Node provisioner controller**: The controller's name is *gpu-provisioner* in [Kaito helm chart](charts/kaito/gpu-provisioner). It uses the `machine` CRD originated from [Karpenter](https://sigs.k8s.io/karpenter) to interact with the workspace controller. It integrates with Azure Kubernetes Service(AKS) APIs to add new GPU nodes to the AKS cluster.
Note that the *gpu-provisioner* is an open sourced component maintained in [this](https://github.com/Azure/gpu-provisioner) repository. It can be replaced by other controllers if they support Karpenter-core APIs.
- **Node provisioner controller**: The controller's name is *gpu-provisioner* in [gpu-provisioner helm chart](https://github.com/Azure/gpu-provisioner/tree/main/charts/gpu-provisioner). It uses the `machine` CRD originated from [Karpenter](https://sigs.k8s.io/karpenter) to interact with the workspace controller. It integrates with Azure Kubernetes Service(AKS) APIs to add new GPU nodes to the AKS cluster.
> Note: The [*gpu-provisioner*](https://github.com/Azure/gpu-provisioner) is an open sourced component. It can be replaced by other controllers if they support [Karpenter-core](https://sigs.k8s.io/karpenter) APIs.

## Installation

Expand Down
23 changes: 0 additions & 23 deletions charts/kaito/gpu-provisioner/.helmignore

This file was deleted.

13 changes: 0 additions & 13 deletions charts/kaito/gpu-provisioner/Chart.yaml

This file was deleted.

Loading
Loading