Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

LeaderWorkerSet integration #3515

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion Makefile-deps.mk
Original file line number Diff line number Diff line change
Expand Up @@ -143,8 +143,14 @@ cluster-autoscaler-crd: ## Copy the CRDs from the cluster-autoscaler to the dep-
mkdir -p $(EXTERNAL_CRDS_DIR)/cluster-autoscaler/
cp -f $(CLUSTER_AUTOSCALER_ROOT)/config/crd/* $(EXTERNAL_CRDS_DIR)/cluster-autoscaler/

# LEADERWORKERSET_ROOT is the directory of the sigs.k8s.io/lws module as
# reported by `go list -m`; the target below copies that module's
# config/crd/bases/ files into $(EXTERNAL_CRDS_DIR)/leaderworkerset-operator/.
LEADERWORKERSET_ROOT = $(shell $(GO_CMD) list -m -mod=readonly -f "{{.Dir}}" sigs.k8s.io/lws)
.PHONY: leaderworkerset-operator-crd
leaderworkerset-operator-crd: ## Copy the CRDs from the leaderworkerset-operator to the dep-crds directory.
mkdir -p $(EXTERNAL_CRDS_DIR)/leaderworkerset-operator/
cp -f $(LEADERWORKERSET_ROOT)/config/crd/bases/* $(EXTERNAL_CRDS_DIR)/leaderworkerset-operator/

.PHONY: dep-crds
dep-crds: mpi-operator-crd kf-training-operator-crd ray-operator-crd jobset-operator-crd cluster-autoscaler-crd kf-training-operator-manifests ## Copy the CRDs from the external operators to the dep-crds directory.
dep-crds: mpi-operator-crd kf-training-operator-crd ray-operator-crd jobset-operator-crd leaderworkerset-operator-crd cluster-autoscaler-crd kf-training-operator-manifests ## Copy the CRDs from the external operators to the dep-crds directory.
@echo "Copying CRDs from external operators to dep-crds directory"

.PHONY: kueuectl-docs
Expand Down
3 changes: 2 additions & 1 deletion Makefile-test.mk
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ IMAGE_TAG ?= $(IMAGE_REPO):$(GIT_TAG)
JOBSET_VERSION = $(shell $(GO_CMD) list -m -f "{{.Version}}" sigs.k8s.io/jobset)
KUBEFLOW_VERSION = $(shell $(GO_CMD) list -m -f "{{.Version}}" github.com/kubeflow/training-operator)
KUBEFLOW_MPI_VERSION = $(shell $(GO_CMD) list -m -f "{{.Version}}" github.com/kubeflow/mpi-operator)
LEADERWORKERSET_VERSION = $(shell $(GO_CMD) list -m -f "{{.Version}}" sigs.k8s.io/lws)

##@ Tests

Expand Down Expand Up @@ -108,7 +109,7 @@ run-test-e2e-%: FORCE
@echo Running e2e for k8s ${K8S_VERSION}
E2E_KIND_VERSION="kindest/node:v$(K8S_VERSION)" KIND_CLUSTER_NAME=$(KIND_CLUSTER_NAME) CREATE_KIND_CLUSTER=$(CREATE_KIND_CLUSTER) \
ARTIFACTS="$(ARTIFACTS)/$@" IMAGE_TAG=$(IMAGE_TAG) GINKGO_ARGS="$(GINKGO_ARGS)" \
JOBSET_VERSION=$(JOBSET_VERSION) \
JOBSET_VERSION=$(JOBSET_VERSION) LEADERWORKERSET_VERSION=$(LEADERWORKERSET_VERSION) \
KIND_CLUSTER_FILE="kind-cluster.yaml" E2E_TARGET_FOLDER="singlecluster" \
./hack/e2e-test.sh
$(PROJECT_DIR)/bin/ginkgo-top -i $(ARTIFACTS)/$@/e2e.json > $(ARTIFACTS)/$@/e2e-top.yaml
Expand Down
1 change: 1 addition & 0 deletions apis/config/v1beta1/configuration_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -333,6 +333,7 @@ type Integrations struct {
// - "pod"
// - "deployment" (requires enabling pod integration)
// - "statefulset" (requires enabling pod integration)
// - "leaderworkerset" (requires enabling pod integration)
Frameworks []string `json:"frameworks,omitempty"`
// List of GroupVersionKinds that are managed for Kueue by external controllers;
// the expected format is `Kind.version.group.com`.
Expand Down
8 changes: 8 additions & 0 deletions charts/kueue/templates/rbac/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,14 @@ rules:
- list
- update
- watch
- apiGroups:
- leaderworkerset.x-k8s.io
resources:
- leaderworkersets
verbs:
- get
- list
- watch
- apiGroups:
- node.k8s.io
resources:
Expand Down
40 changes: 40 additions & 0 deletions charts/kueue/templates/webhook/webhook.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,26 @@ webhooks:
resources:
- xgboostjobs
sideEffects: None
- admissionReviewVersions:
- v1
clientConfig:
service:
name: '{{ include "kueue.fullname" . }}-webhook-service'
namespace: '{{ .Release.Namespace }}'
path: /mutate-leaderworkerset-x-k8s-io-v1-leaderworkerset
failurePolicy: Fail
name: mleaderworkerset.kb.io
rules:
- apiGroups:
- leaderworkerset.x-k8s.io
apiVersions:
- v1
operations:
- CREATE
- UPDATE
resources:
- leaderworkersets
sideEffects: None
- admissionReviewVersions:
- v1
clientConfig:
Expand Down Expand Up @@ -562,6 +582,26 @@ webhooks:
resources:
- xgboostjobs
sideEffects: None
- admissionReviewVersions:
- v1
clientConfig:
service:
name: '{{ include "kueue.fullname" . }}-webhook-service'
namespace: '{{ .Release.Namespace }}'
path: /validate-leaderworkerset-x-k8s-io-v1-leaderworkerset
failurePolicy: Fail
name: vleaderworkerset.kb.io
rules:
- apiGroups:
- leaderworkerset.x-k8s.io
apiVersions:
- v1
operations:
- CREATE
- UPDATE
resources:
- leaderworkersets
sideEffects: None
- admissionReviewVersions:
- v1
clientConfig:
Expand Down
1 change: 1 addition & 0 deletions config/components/manager/controller_manager_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ integrations:
# - "pod"
# - "deployment" # requires enabling pod integration
# - "statefulset" # requires enabling pod integration
# - "leaderworkerset" # requires enabling pod integration
# externalFrameworks:
# - "Foo.v1.example.com"
# podOptions:
Expand Down
8 changes: 8 additions & 0 deletions config/components/rbac/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,14 @@ rules:
- list
- update
- watch
- apiGroups:
- leaderworkerset.x-k8s.io
resources:
- leaderworkersets
verbs:
- get
- list
- watch
- apiGroups:
- node.k8s.io
resources:
Expand Down
40 changes: 40 additions & 0 deletions config/components/webhook/manifests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,26 @@ webhooks:
resources:
- xgboostjobs
sideEffects: None
- admissionReviewVersions:
- v1
clientConfig:
service:
name: webhook-service
namespace: system
path: /mutate-leaderworkerset-x-k8s-io-v1-leaderworkerset
failurePolicy: Fail
name: mleaderworkerset.kb.io
rules:
- apiGroups:
- leaderworkerset.x-k8s.io
apiVersions:
- v1
operations:
- CREATE
- UPDATE
resources:
- leaderworkersets
sideEffects: None
- admissionReviewVersions:
- v1
clientConfig:
Expand Down Expand Up @@ -496,6 +516,26 @@ webhooks:
resources:
- xgboostjobs
sideEffects: None
- admissionReviewVersions:
- v1
clientConfig:
service:
name: webhook-service
namespace: system
path: /validate-leaderworkerset-x-k8s-io-v1-leaderworkerset
failurePolicy: Fail
name: vleaderworkerset.kb.io
rules:
- apiGroups:
- leaderworkerset.x-k8s.io
apiVersions:
- v1
operations:
- CREATE
- UPDATE
resources:
- leaderworkersets
sideEffects: None
- admissionReviewVersions:
- v1
clientConfig:
Expand Down
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ require (
k8s.io/utils v0.0.0-20240711033017-18e509b52bc8
sigs.k8s.io/controller-runtime v0.19.3
sigs.k8s.io/jobset v0.7.2
sigs.k8s.io/lws v0.4.2
sigs.k8s.io/structured-merge-diff/v4 v4.5.0
sigs.k8s.io/yaml v1.4.0
)
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -442,6 +442,8 @@ sigs.k8s.io/kustomize/api v0.17.3 h1:6GCuHSsxq7fN5yhF2XrC+AAr8gxQwhexgHflOAD/JJU
sigs.k8s.io/kustomize/api v0.17.3/go.mod h1:TuDH4mdx7jTfK61SQ/j1QZM/QWR+5rmEiNjvYlhzFhc=
sigs.k8s.io/kustomize/kyaml v0.17.2 h1:+AzvoJUY0kq4QAhH/ydPHHMRLijtUKiyVyh7fOSshr0=
sigs.k8s.io/kustomize/kyaml v0.17.2/go.mod h1:9V0mCjIEYjlXuCdYsSXvyoy2BTsLESH7TlGV81S282U=
sigs.k8s.io/lws v0.4.2 h1:ItVhr38I3wv5qWAd2+pvffG0zXthY+c8ebxqMnP0KcU=
sigs.k8s.io/lws v0.4.2/go.mod h1:tENN6Die5OvJlJxjOijeRi/Y+F3bs5ZX1z3VyiCEEH0=
sigs.k8s.io/structured-merge-diff/v4 v4.5.0 h1:nbCitCK2hfnhyiKo6uf2HxUPTCodY6Qaf85SbDIaMBk=
sigs.k8s.io/structured-merge-diff/v4 v4.5.0/go.mod h1:N8f93tFZh9U6vpxwRArLiikrE5/2tiu1w1AGfACIGE4=
sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E=
Expand Down
15 changes: 15 additions & 0 deletions hack/e2e-common.sh
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,11 @@ if [[ -n ${KUBEFLOW_MPI_VERSION:-} ]]; then
export KUBEFLOW_MPI_IMAGE=mpioperator/mpi-operator:${KUBEFLOW_MPI_VERSION/#v}
fi

# When LEADERWORKERSET_VERSION is set and non-empty, derive the release
# manifest URL and the controller image reference from it. The version string
# is used verbatim in both (no prefix stripping, unlike KUBEFLOW_MPI_IMAGE above).
if [[ -n ${LEADERWORKERSET_VERSION:-} ]]; then
export LEADERWORKERSET_MANIFEST="https://github.com/kubernetes-sigs/lws/releases/download/${LEADERWORKERSET_VERSION}/manifests.yaml"
export LEADERWORKERSET_IMAGE=registry.k8s.io/lws/lws:${LEADERWORKERSET_VERSION}
fi

# sleep image to use for testing.
export E2E_TEST_SLEEP_IMAGE_OLD=gcr.io/k8s-staging-perf-tests/sleep:v0.0.3@sha256:00ae8e01dd4439edfb7eb9f1960ac28eba16e952956320cce7f2ac08e3446e6b
E2E_TEST_SLEEP_IMAGE_OLD_WITHOUT_SHA=${E2E_TEST_SLEEP_IMAGE_OLD%%@*}
Expand Down Expand Up @@ -89,6 +94,9 @@ function prepare_docker_images {
if [[ -n ${KUBEFLOW_MPI_VERSION:-} ]]; then
docker pull "${KUBEFLOW_MPI_IMAGE}"
fi
if [[ -n ${LEADERWORKERSET_VERSION:-} ]]; then
docker pull "${LEADERWORKERSET_IMAGE}"
fi
}

# $1 cluster
Expand Down Expand Up @@ -136,6 +144,13 @@ function install_mpi {
kubectl apply --server-side -f "${KUBEFLOW_MPI_MANIFEST}"
}

# Install the LeaderWorkerSet operator into a kind cluster.
# $1 - cluster name
#
# Loads the image named by LEADERWORKERSET_IMAGE into the cluster, switches
# the kubectl context to "kind-$1", and server-side applies
# LEADERWORKERSET_MANIFEST.
function install_lws {
    # Load exactly the reference that prepare_docker_images pulled. The image
    # string starts with the registry host, so a "/#v" prefix strip on it
    # would never match and only obscures which image is loaded.
    cluster_kind_load_image "${1}" "${LEADERWORKERSET_IMAGE}"
    kubectl config use-context "kind-${1}"
    kubectl apply --server-side -f "${LEADERWORKERSET_MANIFEST}"
}

INITIAL_IMAGE=$($YQ '.images[] | select(.name == "controller") | [.newName, .newTag] | join(":")' config/components/manager/kustomization.yaml)
export INITIAL_IMAGE

Expand Down
3 changes: 3 additions & 0 deletions hack/e2e-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,9 @@ function kind_load {
if [[ -n ${KUBEFLOW_MPI_VERSION:-} ]]; then
install_mpi "$KIND_CLUSTER_NAME"
fi
if [[ -n ${LEADERWORKERSET_VERSION:-} ]]; then
install_lws "$KIND_CLUSTER_NAME"
fi
}

function kueue_deploy {
Expand Down
1 change: 1 addition & 0 deletions pkg/controller/jobs/jobs.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
_ "sigs.k8s.io/kueue/pkg/controller/jobs/job"
_ "sigs.k8s.io/kueue/pkg/controller/jobs/jobset"
_ "sigs.k8s.io/kueue/pkg/controller/jobs/kubeflow/jobs"
_ "sigs.k8s.io/kueue/pkg/controller/jobs/leaderworkerset"
_ "sigs.k8s.io/kueue/pkg/controller/jobs/mpijob"
_ "sigs.k8s.io/kueue/pkg/controller/jobs/pod"
_ "sigs.k8s.io/kueue/pkg/controller/jobs/raycluster"
Expand Down
67 changes: 67 additions & 0 deletions pkg/controller/jobs/leaderworkerset/leaderworkerset_controller.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
/*
Copyright 2024 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package leaderworkerset

import (
"context"

"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
leaderworkersetv1 "sigs.k8s.io/lws/api/leaderworkerset/v1"

"sigs.k8s.io/kueue/pkg/controller/jobframework"
)

// gvk identifies the LeaderWorkerSet kind within the leaderworkerset v1
// scheme group version.
var gvk = leaderworkersetv1.SchemeGroupVersion.WithKind("LeaderWorkerSet")

// FrameworkName is the name under which this integration is registered with
// the job framework.
const FrameworkName = "leaderworkerset"

func init() {
utilruntime.Must(jobframework.RegisterIntegration(FrameworkName, jobframework.IntegrationCallbacks{
SetupIndexes: SetupIndexes,
NewReconciler: NewPodReconciler,
SetupWebhook: SetupWebhook,
JobType: &leaderworkersetv1.LeaderWorkerSet{},
AddToScheme: leaderworkersetv1.AddToScheme,
DependencyList: []string{"pod"},
GVK: gvk,
}))
}

// LeaderWorkerSet is a named type whose underlying type is
// leaderworkersetv1.LeaderWorkerSet, so pointers convert between the two
// directly. The Object and GVK methods in this file are declared on it.
type LeaderWorkerSet leaderworkersetv1.LeaderWorkerSet

// fromObject converts o to a *LeaderWorkerSet.
//
// o must hold a *leaderworkersetv1.LeaderWorkerSet; the type assertion is
// unchecked and panics for any other dynamic type.
func fromObject(o runtime.Object) *LeaderWorkerSet {
	upstream := o.(*leaderworkersetv1.LeaderWorkerSet)
	return (*LeaderWorkerSet)(upstream)
}

// Object returns the receiver converted back to a
// *leaderworkersetv1.LeaderWorkerSet, exposed as a client.Object.
func (lws *LeaderWorkerSet) Object() client.Object {
	upstream := (*leaderworkersetv1.LeaderWorkerSet)(lws)
	return upstream
}

// GVK returns the package-level GroupVersionKind for the LeaderWorkerSet
// kind; the receiver is not consulted.
func (lws *LeaderWorkerSet) GVK() schema.GroupVersionKind {
return gvk
}

// SetupIndexes is the index-setup callback registered for this integration.
// It ignores both arguments, registers no indexes, and always returns nil.
func SetupIndexes(_ context.Context, _ client.FieldIndexer) error {
	return nil
}
Loading