Skip to content

Commit

Permalink
LeaderWorkerSet integration
Browse files Browse the repository at this point in the history
  • Loading branch information
vladikkuzn committed Nov 19, 2024
1 parent b0665c7 commit 3036113
Show file tree
Hide file tree
Showing 25 changed files with 1,690 additions and 64 deletions.
8 changes: 7 additions & 1 deletion Makefile-deps.mk
Original file line number Diff line number Diff line change
Expand Up @@ -143,8 +143,14 @@ cluster-autoscaler-crd: ## Copy the CRDs from the cluster-autoscaler to the dep-
mkdir -p $(EXTERNAL_CRDS_DIR)/cluster-autoscaler/
cp -f $(CLUSTER_AUTOSCALER_ROOT)/config/crd/* $(EXTERNAL_CRDS_DIR)/cluster-autoscaler/

# Directory of the sigs.k8s.io/lws module, as reported by `go list -m` ({{.Dir}}).
# Recursively expanded (`=`): the `go list` call runs when the variable is
# expanded, not while the makefile is being parsed.
LEADERWORKERSET_ROOT = $(shell $(GO_CMD) list -m -mod=readonly -f "{{.Dir}}" sigs.k8s.io/lws)
# Copies everything under config/crd/bases/ of that module into
# $(EXTERNAL_CRDS_DIR)/leaderworkerset-operator/, creating the directory first.
.PHONY: leaderworkerset-operator-crd
leaderworkerset-operator-crd: ## Copy the CRDs from the leaderworkerset-operator to the dep-crds directory.
	mkdir -p $(EXTERNAL_CRDS_DIR)/leaderworkerset-operator/
	cp -f $(LEADERWORKERSET_ROOT)/config/crd/bases/* $(EXTERNAL_CRDS_DIR)/leaderworkerset-operator/

.PHONY: dep-crds
dep-crds: mpi-operator-crd kf-training-operator-crd ray-operator-crd jobset-operator-crd cluster-autoscaler-crd kf-training-operator-manifests ## Copy the CRDs from the external operators to the dep-crds directory.
dep-crds: mpi-operator-crd kf-training-operator-crd ray-operator-crd jobset-operator-crd leaderworkerset-operator-crd cluster-autoscaler-crd kf-training-operator-manifests ## Copy the CRDs from the external operators to the dep-crds directory.
@echo "Copying CRDs from external operators to dep-crds directory"

.PHONY: kueuectl-docs
Expand Down
10 changes: 5 additions & 5 deletions Makefile-test.mk
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ IMAGE_TAG ?= $(IMAGE_REPO):$(GIT_TAG)
# Versions of the external operators, resolved with `go list -m` ({{.Version}})
# for the corresponding Go modules. The e2e targets pass them to the hack/
# e2e scripts as environment variables.
JOBSET_VERSION = $(shell $(GO_CMD) list -m -f "{{.Version}}" sigs.k8s.io/jobset)
KUBEFLOW_VERSION = $(shell $(GO_CMD) list -m -f "{{.Version}}" github.com/kubeflow/training-operator)
KUBEFLOW_MPI_VERSION = $(shell $(GO_CMD) list -m -f "{{.Version}}" github.com/kubeflow/mpi-operator)
LEADERWORKERSET_VERSION = $(shell $(GO_CMD) list -m -f "{{.Version}}" sigs.k8s.io/lws)

##@ Tests

Expand Down Expand Up @@ -107,24 +108,23 @@ run-test-e2e-%: K8S_VERSION = $(@:run-test-e2e-%=%)
run-test-e2e-%: FORCE
@echo Running e2e for k8s ${K8S_VERSION}
E2E_KIND_VERSION="kindest/node:v$(K8S_VERSION)" KIND_CLUSTER_NAME=$(KIND_CLUSTER_NAME) CREATE_KIND_CLUSTER=$(CREATE_KIND_CLUSTER) ARTIFACTS="$(ARTIFACTS)/$@" IMAGE_TAG=$(IMAGE_TAG) GINKGO_ARGS="$(GINKGO_ARGS)" \
JOBSET_VERSION=$(JOBSET_VERSION) KUBEFLOW_VERSION=$(KUBEFLOW_VERSION) KUBEFLOW_MPI_VERSION=$(KUBEFLOW_MPI_VERSION) KIND_CLUSTER_FILE="kind-cluster.yaml" E2E_TARGET_FOLDER="singlecluster" ./hack/e2e-test.sh
JOBSET_VERSION=$(JOBSET_VERSION) LEADERWORKERSET_VERSION=$(LEADERWORKERSET_VERSION) KUBEFLOW_VERSION=$(KUBEFLOW_VERSION) KUBEFLOW_MPI_VERSION=$(KUBEFLOW_MPI_VERSION) KIND_CLUSTER_FILE="kind-cluster.yaml" E2E_TARGET_FOLDER="singlecluster" ./hack/e2e-test.sh
$(PROJECT_DIR)/bin/ginkgo-top -i $(ARTIFACTS)/$@/e2e.json > $(ARTIFACTS)/$@/e2e-top.yaml

run-test-multikueue-e2e-%: K8S_VERSION = $(@:run-test-multikueue-e2e-%=%)
run-test-multikueue-e2e-%: FORCE
@echo Running multikueue e2e for k8s ${K8S_VERSION}
E2E_KIND_VERSION="kindest/node:v$(K8S_VERSION)" KIND_CLUSTER_NAME=$(KIND_CLUSTER_NAME) CREATE_KIND_CLUSTER=$(CREATE_KIND_CLUSTER) ARTIFACTS="$(ARTIFACTS)/$@" IMAGE_TAG=$(IMAGE_TAG) GINKGO_ARGS="$(GINKGO_ARGS)" \
JOBSET_VERSION=$(JOBSET_VERSION) KUBEFLOW_VERSION=$(KUBEFLOW_VERSION) KUBEFLOW_MPI_VERSION=$(KUBEFLOW_MPI_VERSION) ./hack/multikueue-e2e-test.sh
JOBSET_VERSION=$(JOBSET_VERSION) LEADERWORKERSET_VERSION=$(LEADERWORKERSET_VERSION) KUBEFLOW_VERSION=$(KUBEFLOW_VERSION) KUBEFLOW_MPI_VERSION=$(KUBEFLOW_MPI_VERSION) ./hack/multikueue-e2e-test.sh
$(PROJECT_DIR)/bin/ginkgo-top -i $(ARTIFACTS)/$@/e2e.json > $(ARTIFACTS)/$@/e2e-top.yaml

run-test-tas-e2e-%: K8S_VERSION = $(@:run-test-tas-e2e-%=%)
run-test-tas-e2e-%: FORCE
# Run the "tas" e2e suite (E2E_TARGET_FOLDER="tas", tas-kind-cluster.yaml) on the
# kind node image for the Kubernetes version given by the target stem.
# LEADERWORKERSET_VERSION is forwarded, as in the other e2e targets, because
# hack/e2e-test.sh pulls registry.k8s.io/lws/lws:$LEADERWORKERSET_VERSION and
# installs LeaderWorkerSet whenever it loads the kind cluster.
run-test-tas-e2e-%: K8S_VERSION = $(@:run-test-tas-e2e-%=%)
run-test-tas-e2e-%: FORCE
	@echo Running tas e2e for k8s ${K8S_VERSION}
	E2E_KIND_VERSION="kindest/node:v$(K8S_VERSION)" KIND_CLUSTER_NAME=$(KIND_CLUSTER_NAME) CREATE_KIND_CLUSTER=$(CREATE_KIND_CLUSTER) ARTIFACTS="$(ARTIFACTS)/$@" IMAGE_TAG=$(IMAGE_TAG) GINKGO_ARGS="$(GINKGO_ARGS)" \
		JOBSET_VERSION=$(JOBSET_VERSION) LEADERWORKERSET_VERSION=$(LEADERWORKERSET_VERSION) KUBEFLOW_VERSION=$(KUBEFLOW_VERSION) KUBEFLOW_MPI_VERSION=$(KUBEFLOW_MPI_VERSION) KIND_CLUSTER_FILE="tas-kind-cluster.yaml" E2E_TARGET_FOLDER="tas" ./hack/e2e-test.sh
	$(PROJECT_DIR)/bin/ginkgo-top -i $(ARTIFACTS)/$@/e2e.json > $(ARTIFACTS)/$@/e2e-top.yaml


SCALABILITY_RUNNER := $(PROJECT_DIR)/bin/performance-scheduler-runner
.PHONY: performance-scheduler-runner
performance-scheduler-runner:
Expand Down
1 change: 1 addition & 0 deletions apis/config/v1beta1/configuration_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -324,6 +324,7 @@ type Integrations struct {
// - "pod"
// - "deployment" (requires enabling pod integration)
// - "statefulset" (requires enabling pod integration)
// - "leaderworkerset" (requires enabling pod integration)
Frameworks []string `json:"frameworks,omitempty"`
// List of GroupVersionKinds that are managed for Kueue by external controllers;
// the expected format is `Kind.version.group.com`.
Expand Down
117 changes: 78 additions & 39 deletions charts/kueue/templates/webhook/webhook.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -174,25 +174,6 @@ webhooks:
resources:
- xgboostjobs
sideEffects: None
- admissionReviewVersions:
- v1
clientConfig:
service:
name: '{{ include "kueue.fullname" . }}-webhook-service'
namespace: '{{ .Release.Namespace }}'
path: /mutate-kubeflow-org-v2beta1-mpijob
failurePolicy: Fail
name: mmpijob.kb.io
rules:
- apiGroups:
- kubeflow.org
apiVersions:
- v2beta1
operations:
- CREATE
resources:
- mpijobs
sideEffects: None
- admissionReviewVersions:
- v1
clientConfig:
Expand Down Expand Up @@ -227,6 +208,44 @@ webhooks:
resources:
- pods
sideEffects: None
- admissionReviewVersions:
- v1
clientConfig:
service:
name: '{{ include "kueue.fullname" . }}-webhook-service'
namespace: '{{ .Release.Namespace }}'
path: /mutate-leaderworkerset-x-k8s-io-v1-leaderworkerset
failurePolicy: Fail
name: mleaderworkerset.kb.io
rules:
- apiGroups:
- leaderworkerset.x-k8s.io
apiVersions:
- v1
operations:
- CREATE
resources:
- leaderworkersets
sideEffects: None
- admissionReviewVersions:
- v1
clientConfig:
service:
name: '{{ include "kueue.fullname" . }}-webhook-service'
namespace: '{{ .Release.Namespace }}'
path: /mutate-kubeflow-org-v2beta1-mpijob
failurePolicy: Fail
name: mmpijob.kb.io
rules:
- apiGroups:
- kubeflow.org
apiVersions:
- v2beta1
operations:
- CREATE
resources:
- mpijobs
sideEffects: None
- admissionReviewVersions:
- v1
clientConfig:
Expand Down Expand Up @@ -526,26 +545,6 @@ webhooks:
resources:
- xgboostjobs
sideEffects: None
- admissionReviewVersions:
- v1
clientConfig:
service:
name: '{{ include "kueue.fullname" . }}-webhook-service'
namespace: '{{ .Release.Namespace }}'
path: /validate-kubeflow-org-v2beta1-mpijob
failurePolicy: Fail
name: vmpijob.kb.io
rules:
- apiGroups:
- kubeflow.org
apiVersions:
- v2beta1
operations:
- CREATE
- UPDATE
resources:
- mpijobs
sideEffects: None
- admissionReviewVersions:
- v1
clientConfig:
Expand Down Expand Up @@ -581,6 +580,46 @@ webhooks:
resources:
- pods
sideEffects: None
- admissionReviewVersions:
- v1
clientConfig:
service:
name: '{{ include "kueue.fullname" . }}-webhook-service'
namespace: '{{ .Release.Namespace }}'
path: /validate-leaderworkerset-x-k8s-io-v1-leaderworkerset
failurePolicy: Fail
name: vleaderworkerset.kb.io
rules:
- apiGroups:
- leaderworkerset.x-k8s.io
apiVersions:
- v1
operations:
- CREATE
- UPDATE
resources:
- leaderworkersets
sideEffects: None
- admissionReviewVersions:
- v1
clientConfig:
service:
name: '{{ include "kueue.fullname" . }}-webhook-service'
namespace: '{{ .Release.Namespace }}'
path: /validate-kubeflow-org-v2beta1-mpijob
failurePolicy: Fail
name: vmpijob.kb.io
rules:
- apiGroups:
- kubeflow.org
apiVersions:
- v2beta1
operations:
- CREATE
- UPDATE
resources:
- mpijobs
sideEffects: None
- admissionReviewVersions:
- v1
clientConfig:
Expand Down
1 change: 1 addition & 0 deletions config/components/manager/controller_manager_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ integrations:
# - "pod"
# - "deployment" # requires enabling pod integration
# - "statefulset" # requires enabling pod integration
# - "leaderworkerset" # requires enabling pod integration
# externalFrameworks:
# - "Foo.v1.example.com"
# podOptions:
Expand Down
75 changes: 57 additions & 18 deletions config/components/webhook/manifests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -162,37 +162,56 @@ webhooks:
service:
name: webhook-service
namespace: system
path: /mutate-kubeflow-org-v2beta1-mpijob
path: /mutate--v1-pod
failurePolicy: Fail
name: mmpijob.kb.io
name: mpod.kb.io
rules:
- apiGroups:
- kubeflow.org
- ""
apiVersions:
- v2beta1
- v1
operations:
- CREATE
resources:
- mpijobs
- pods
sideEffects: None
- admissionReviewVersions:
- v1
clientConfig:
service:
name: webhook-service
namespace: system
path: /mutate--v1-pod
path: /mutate-leaderworkerset-x-k8s-io-v1-leaderworkerset
failurePolicy: Fail
name: mpod.kb.io
name: mleaderworkerset.kb.io
rules:
- apiGroups:
- ""
- leaderworkerset.x-k8s.io
apiVersions:
- v1
operations:
- CREATE
resources:
- pods
- leaderworkersets
sideEffects: None
- admissionReviewVersions:
- v1
clientConfig:
service:
name: webhook-service
namespace: system
path: /mutate-kubeflow-org-v2beta1-mpijob
failurePolicy: Fail
name: mmpijob.kb.io
rules:
- apiGroups:
- kubeflow.org
apiVersions:
- v2beta1
operations:
- CREATE
resources:
- mpijobs
sideEffects: None
- admissionReviewVersions:
- v1
Expand Down Expand Up @@ -481,39 +500,59 @@ webhooks:
service:
name: webhook-service
namespace: system
path: /validate-kubeflow-org-v2beta1-mpijob
path: /validate--v1-pod
failurePolicy: Fail
name: vmpijob.kb.io
name: vpod.kb.io
rules:
- apiGroups:
- kubeflow.org
- ""
apiVersions:
- v2beta1
- v1
operations:
- CREATE
- UPDATE
resources:
- mpijobs
- pods
sideEffects: None
- admissionReviewVersions:
- v1
clientConfig:
service:
name: webhook-service
namespace: system
path: /validate--v1-pod
path: /validate-leaderworkerset-x-k8s-io-v1-leaderworkerset
failurePolicy: Fail
name: vpod.kb.io
name: vleaderworkerset.kb.io
rules:
- apiGroups:
- ""
- leaderworkerset.x-k8s.io
apiVersions:
- v1
operations:
- CREATE
- UPDATE
resources:
- pods
- leaderworkersets
sideEffects: None
- admissionReviewVersions:
- v1
clientConfig:
service:
name: webhook-service
namespace: system
path: /validate-kubeflow-org-v2beta1-mpijob
failurePolicy: Fail
name: vmpijob.kb.io
rules:
- apiGroups:
- kubeflow.org
apiVersions:
- v2beta1
operations:
- CREATE
- UPDATE
resources:
- mpijobs
sideEffects: None
- admissionReviewVersions:
- v1
Expand Down
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ require (
k8s.io/utils v0.0.0-20240711033017-18e509b52bc8
sigs.k8s.io/controller-runtime v0.19.1
sigs.k8s.io/jobset v0.7.1
sigs.k8s.io/lws v0.4.1
sigs.k8s.io/structured-merge-diff/v4 v4.4.3
sigs.k8s.io/yaml v1.4.0
)
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -442,6 +442,8 @@ sigs.k8s.io/kustomize/api v0.17.3 h1:6GCuHSsxq7fN5yhF2XrC+AAr8gxQwhexgHflOAD/JJU
sigs.k8s.io/kustomize/api v0.17.3/go.mod h1:TuDH4mdx7jTfK61SQ/j1QZM/QWR+5rmEiNjvYlhzFhc=
sigs.k8s.io/kustomize/kyaml v0.17.2 h1:+AzvoJUY0kq4QAhH/ydPHHMRLijtUKiyVyh7fOSshr0=
sigs.k8s.io/kustomize/kyaml v0.17.2/go.mod h1:9V0mCjIEYjlXuCdYsSXvyoy2BTsLESH7TlGV81S282U=
sigs.k8s.io/lws v0.4.1 h1:3MPW6FTsxRVhtc8tbw8VY5+jh5FSRcvir1SOLvxO8F8=
sigs.k8s.io/lws v0.4.1/go.mod h1:tENN6Die5OvJlJxjOijeRi/Y+F3bs5ZX1z3VyiCEEH0=
sigs.k8s.io/structured-merge-diff/v4 v4.4.3 h1:sCP7Vv3xx/CWIuTPVN38lUPx0uw0lcLfzaiDa8Ja01A=
sigs.k8s.io/structured-merge-diff/v4 v4.4.3/go.mod h1:N8f93tFZh9U6vpxwRArLiikrE5/2tiu1w1AGfACIGE4=
sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E=
Expand Down
10 changes: 10 additions & 0 deletions hack/e2e-common.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@ export KUBEFLOW_IMAGE=kubeflow/training-operator:${KUBEFLOW_IMAGE_VERSION}
export KUBEFLOW_MPI_MANIFEST="https://raw.githubusercontent.com/kubeflow/mpi-operator/${KUBEFLOW_MPI_VERSION}/deploy/v2beta1/mpi-operator.yaml"
export KUBEFLOW_MPI_IMAGE=mpioperator/mpi-operator:${KUBEFLOW_MPI_VERSION/#v}

# LeaderWorkerSet release manifest URL and controller image, both derived from
# LEADERWORKERSET_VERSION. The version string is used as-is for the image tag.
export LEADERWORKERSET_MANIFEST="https://github.com/kubernetes-sigs/lws/releases/download/${LEADERWORKERSET_VERSION}/manifests.yaml"
export LEADERWORKERSET_IMAGE=registry.k8s.io/lws/lws:${LEADERWORKERSET_VERSION}

# sleep image to use for testing.
export E2E_TEST_IMAGE=gcr.io/k8s-staging-perf-tests/sleep:v0.1.0@sha256:8d91ddf9f145b66475efda1a1b52269be542292891b5de2a7fad944052bab6ea

Expand Down Expand Up @@ -104,6 +107,13 @@ function install_mpi {
kubectl apply --server-side -f "${KUBEFLOW_MPI_MANIFEST}"
}

# Install the LeaderWorkerSet operator into a kind cluster.
# $1 - cluster name
function install_leaderworkerset {
    # Pass the image reference unchanged: its tag is LEADERWORKERSET_VERSION
    # verbatim, the same reference that is pulled before this function is called.
    # cluster_kind_load_image is defined outside this excerpt.
    cluster_kind_load_image "${1}" "${LEADERWORKERSET_IMAGE}"
    # Point kubectl at the kind cluster before applying the release manifest.
    kubectl config use-context "kind-${1}"
    kubectl apply --server-side -f "${LEADERWORKERSET_MANIFEST}"
}

INITIAL_IMAGE=$($YQ '.images[] | select(.name == "controller") | [.newName, .newTag] | join(":")' config/components/manager/kustomization.yaml)
export INITIAL_IMAGE

Expand Down
2 changes: 2 additions & 0 deletions hack/e2e-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ function kind_load {
fi
docker pull "registry.k8s.io/jobset/jobset:$JOBSET_VERSION"
install_jobset "$KIND_CLUSTER_NAME"
docker pull "registry.k8s.io/lws/lws:${LEADERWORKERSET_VERSION}"
install_leaderworkerset "$KIND_CLUSTER_NAME"
}

function kueue_deploy {
Expand Down
Loading

0 comments on commit 3036113

Please sign in to comment.