diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 6d32de20..3c307eec 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -70,6 +70,34 @@ jobs: run: make create-cluster fv env: FV: true + NEW_FV: + runs-on: ubuntu-latest + steps: + - name: checkout + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - name: Set up Go + uses: actions/setup-go@3041bf56c941b39c61721a86cd11f3bb1338122a # v5.2.0 + with: + go-version: 1.23.4 + - name: Free Disk Space (Ubuntu) + uses: jlumbroso/free-disk-space@main + with: + # this might remove tools that are actually needed, + # if set to "true" but frees about 6 GB + tool-cache: false + + # all of these default to true, but feel free to set to + # "false" if necessary for your workflow + android: true + dotnet: true + haskell: true + large-packages: true + docker-images: true + swap-storage: true + - name: fv + run: make create-cluster new-fv + env: + FV: true FV_SHARDING: runs-on: ubuntu-latest steps: diff --git a/Makefile b/Makefile index 68c29961..299d0e9c 100644 --- a/Makefile +++ b/Makefile @@ -247,6 +247,10 @@ kind-test: test create-cluster fv ## Build docker image; start kind cluster; loa fv: $(KUBECTL) $(GINKGO) ## Run Sveltos Controller tests using existing cluster cd test/fv; $(GINKGO) -nodes $(NUM_NODES) --label-filter='FV' --v --trace --randomize-all +.PHONY: new-fv +new-fv: $(KUBECTL) $(GINKGO) ## Run Sveltos Controller tests using existing cluster + cd test/fv; $(GINKGO) -nodes $(NUM_NODES) --label-filter='NEW-FV' --v --trace --randomize-all + .PHONY: fv-sharding fv-sharding: $(KUBECTL) $(GINKGO) ## Run Sveltos Controller tests using existing cluster $(KUBECTL) patch cluster clusterapi-workload -n default --type json -p '[{ "op": "add", "path": "/metadata/annotations/sharding.projectsveltos.io~1key", "value": "shard1" }]' diff --git a/api/v1beta1/spec.go b/api/v1beta1/spec.go index af2408e6..6eb51936 100644 --- a/api/v1beta1/spec.go +++ b/api/v1beta1/spec.go @@ -638,6 +638,14 @@ type Spec struct { // +optional ContinueOnConflict bool `json:"continueOnConflict,omitempty"` + // By default (when ContinueOnError is unset or set to false), Sveltos stops deployment after + // encountering the first error. + // If set to true, Sveltos will attempt to deploy remaining resources in the ClusterProfile even + // if errors are detected for previous resources. + // +kubebuilder:default:=false + // +optional + ContinueOnError bool `json:"continueOnError,omitempty"` + // The maximum number of clusters that can be updated concurrently. // Value can be an absolute number (ex: 5) or a percentage of desired cluster (ex: 10%). // Defaults to 100%. diff --git a/config/crd/bases/config.projectsveltos.io_clusterprofiles.yaml b/config/crd/bases/config.projectsveltos.io_clusterprofiles.yaml index 6a53c862..ae3d5f70 100644 --- a/config/crd/bases/config.projectsveltos.io_clusterprofiles.yaml +++ b/config/crd/bases/config.projectsveltos.io_clusterprofiles.yaml @@ -1073,6 +1073,14 @@ spec: If set to true, Sveltos will attempt to deploy remaining resources in the ClusterProfile even if conflicts are detected for previous resources. type: boolean + continueOnError: + default: false + description: |- + By default (when ContinueOnError is unset or set to false), Sveltos stops deployment after + encountering the first error. + If set to true, Sveltos will attempt to deploy remaining resources in the ClusterProfile even + if errors are detected for previous resources. 
+ type: boolean dependsOn: description: |- DependsOn specifies a list of other ClusterProfiles that this instance depends on. diff --git a/config/crd/bases/config.projectsveltos.io_clustersummaries.yaml b/config/crd/bases/config.projectsveltos.io_clustersummaries.yaml index 314052a6..3c784875 100644 --- a/config/crd/bases/config.projectsveltos.io_clustersummaries.yaml +++ b/config/crd/bases/config.projectsveltos.io_clustersummaries.yaml @@ -1099,6 +1099,14 @@ spec: If set to true, Sveltos will attempt to deploy remaining resources in the ClusterProfile even if conflicts are detected for previous resources. type: boolean + continueOnError: + default: false + description: |- + By default (when ContinueOnError is unset or set to false), Sveltos stops deployment after + encountering the first error. + If set to true, Sveltos will attempt to deploy remaining resources in the ClusterProfile even + if errors are detected for previous resources. + type: boolean dependsOn: description: |- DependsOn specifies a list of other ClusterProfiles that this instance depends on. diff --git a/config/crd/bases/config.projectsveltos.io_profiles.yaml b/config/crd/bases/config.projectsveltos.io_profiles.yaml index 53088e83..780eee1b 100644 --- a/config/crd/bases/config.projectsveltos.io_profiles.yaml +++ b/config/crd/bases/config.projectsveltos.io_profiles.yaml @@ -1073,6 +1073,14 @@ spec: If set to true, Sveltos will attempt to deploy remaining resources in the ClusterProfile even if conflicts are detected for previous resources. type: boolean + continueOnError: + default: false + description: |- + By default (when ContinueOnError is unset or set to false), Sveltos stops deployment after + encountering the first error. + If set to true, Sveltos will attempt to deploy remaining resources in the ClusterProfile even + if errors are detected for previous resources. + type: boolean dependsOn: description: |- DependsOn specifies a list of other ClusterProfiles that this instance depends on. 
diff --git a/controllers/handlers_helm.go b/controllers/handlers_helm.go index 1ea6d648..f4b76691 100644 --- a/controllers/handlers_helm.go +++ b/controllers/handlers_helm.go @@ -515,6 +515,7 @@ func walkChartsAndDeploy(ctx context.Context, c client.Client, clusterSummary *c kubeconfig string, mgmtResources map[string]*unstructured.Unstructured, logger logr.Logger, ) ([]configv1beta1.ReleaseReport, []configv1beta1.Chart, error) { + errorMsg := "" conflictErrorMessage := "" releaseReports := make([]configv1beta1.ReleaseReport, 0, len(clusterSummary.Spec.ClusterProfileSpec.HelmCharts)) chartDeployed := make([]configv1beta1.Chart, 0, len(clusterSummary.Spec.ClusterProfileSpec.HelmCharts)) @@ -560,6 +561,11 @@ func walkChartsAndDeploy(ctx context.Context, c client.Client, clusterSummary *c var currentRelease *releaseInfo currentRelease, report, err = handleChart(ctx, clusterSummary, mgmtResources, instantiatedChart, kubeconfig, logger) if err != nil { + if clusterSummary.Spec.ClusterProfileSpec.ContinueOnError { + errorMsg += fmt.Sprintf("chart: %s, release: %s, %v\n", + instantiatedChart.ChartName, instantiatedChart.ReleaseName, err) + continue + } return releaseReports, chartDeployed, err } @@ -588,6 +594,12 @@ func walkChartsAndDeploy(ctx context.Context, c client.Client, clusterSummary *c } } + // This has to come before conflictErrorMessage as conflictErrorMessage is not retriable + // while any other generic error is + if errorMsg != "" { + return releaseReports, chartDeployed, fmt.Errorf("%s", errorMsg) + } + if conflictErrorMessage != "" { // for helm chart a conflict is a non retriable error. // when profile currently managing the helm chart is removed, all @@ -1054,10 +1066,7 @@ func uninstallRelease(ctx context.Context, clusterSummary *configv1beta1.Cluster return err } - uninstallClient, err := getHelmUninstallClient(helmChart, actionConfig) - if err != nil { - return err - } + uninstallClient := getHelmUninstallClient(helmChart, actionConfig) _, err = uninstallClient.Run(releaseName) if err != nil { @@ -1112,11 +1121,7 @@ func upgradeRelease(ctx context.Context, clusterSummary *configv1beta1.ClusterSu patches = append(patches, driftExclusionPatches...) 
- upgradeClient, err := getHelmUpgradeClient(requestedChart, actionConfig, patches) - if err != nil { - logger.V(logs.LogInfo).Info(fmt.Sprintf("failed to get helm upgrade client: %v", err)) - return err - } + upgradeClient := getHelmUpgradeClient(requestedChart, actionConfig, patches) cp, err := upgradeClient.ChartPathOptions.LocateChart(chartName, settings) if err != nil { @@ -2241,10 +2246,7 @@ func getHelmInstallClient(requestedChart *configv1beta1.HelmChart, kubeconfig st installClient.DisableHooks = getDisableHooksHelmInstallValue(requestedChart.Options) installClient.DisableOpenAPIValidation = getDisableOpenAPIValidationValue(requestedChart.Options) if timeout := getTimeoutValue(requestedChart.Options); timeout != nil { - installClient.Timeout, err = time.ParseDuration(timeout.String()) - if err != nil { - return nil, err - } + installClient.Timeout = timeout.Duration } installClient.SkipSchemaValidation = getSkipSchemaValidation(requestedChart.Options) installClient.Replace = getReplaceValue(requestedChart.Options) @@ -2262,7 +2264,7 @@ func getHelmInstallClient(requestedChart *configv1beta1.HelmChart, kubeconfig st } func getHelmUpgradeClient(requestedChart *configv1beta1.HelmChart, actionConfig *action.Configuration, - patches []libsveltosv1beta1.Patch) (*action.Upgrade, error) { + patches []libsveltosv1beta1.Patch) *action.Upgrade { upgradeClient := action.NewUpgrade(actionConfig) upgradeClient.Install = true @@ -2275,11 +2277,7 @@ func getHelmUpgradeClient(requestedChart *configv1beta1.HelmChart, actionConfig upgradeClient.DisableHooks = getDisableHooksHelmUpgradeValue(requestedChart.Options) upgradeClient.DisableOpenAPIValidation = getDisableOpenAPIValidationValue(requestedChart.Options) if timeout := getTimeoutValue(requestedChart.Options); timeout != nil { - var err error - upgradeClient.Timeout, err = time.ParseDuration(timeout.String()) - if err != nil { - return nil, err - } + upgradeClient.Timeout = timeout.Duration } upgradeClient.SkipSchemaValidation = getSkipSchemaValidation(requestedChart.Options) upgradeClient.ResetValues = getResetValues(requestedChart.Options) @@ -2301,19 +2299,17 @@ func getHelmUpgradeClient(requestedChart *configv1beta1.HelmChart, actionConfig upgradeClient.PostRenderer = &patcher.CustomPatchPostRenderer{Patches: patches} } - return upgradeClient, nil + return upgradeClient } -func getHelmUninstallClient(requestedChart *configv1beta1.HelmChart, actionConfig *action.Configuration) (*action.Uninstall, error) { +func getHelmUninstallClient(requestedChart *configv1beta1.HelmChart, actionConfig *action.Configuration, +) *action.Uninstall { + uninstallClient := action.NewUninstall(actionConfig) uninstallClient.DryRun = false if requestedChart != nil { if timeout := getTimeoutValue(requestedChart.Options); timeout != nil { - var err error - uninstallClient.Timeout, err = time.ParseDuration(timeout.String()) - if err != nil { - return nil, err - } + uninstallClient.Timeout = timeout.Duration } uninstallClient.Description = getDescriptionValue(requestedChart.Options) @@ -2322,7 +2318,7 @@ func getHelmUninstallClient(requestedChart *configv1beta1.HelmChart, actionConfi uninstallClient.KeepHistory = getKeepHistoryValue(requestedChart.Options) uninstallClient.DeletionPropagation = getDeletionPropagation(requestedChart.Options) } - return uninstallClient, nil + return uninstallClient } func addExtraMetadata(ctx context.Context, requestedChart *configv1beta1.HelmChart, diff --git a/controllers/handlers_utils.go b/controllers/handlers_utils.go index 
725ca246..45355a41 100644 --- a/controllers/handlers_utils.go +++ b/controllers/handlers_utils.go @@ -448,12 +448,17 @@ func deployUnstructured(ctx context.Context, deployingToMgmtCluster bool, destCo } conflictErrorMsg := "" + errorMsg := "" reports = make([]configv1beta1.ResourceReport, 0) for i := range referencedUnstructured { policy := referencedUnstructured[i] err := adjustNamespace(policy, destConfig) if err != nil { + if clusterSummary.Spec.ClusterProfileSpec.ContinueOnError { + errorMsg += fmt.Sprintf("%v", err) + continue + } return nil, err } @@ -524,22 +529,38 @@ func deployUnstructured(ctx context.Context, deployingToMgmtCluster bool, destCo } policy, err = updateResource(ctx, dr, clusterSummary, policy, subresources, logger) + resource.LastAppliedTime = &metav1.Time{Time: time.Now()} + reports = append(reports, *generateResourceReport(policyHash, resourceInfo, policy, resource)) if err != nil { + if clusterSummary.Spec.ClusterProfileSpec.ContinueOnError { + errorMsg += fmt.Sprintf("%v", err) + continue + } return reports, err } - - resource.LastAppliedTime = &metav1.Time{Time: time.Now()} - reports = append(reports, *generateResourceReport(policyHash, resourceInfo, policy, resource)) } + return reports, handleDeployUnstructuredErrors(conflictErrorMsg, errorMsg, clusterSummary) +} + +func handleDeployUnstructuredErrors(conflictErrorMsg, errorMsg string, clusterSummary *configv1beta1.ClusterSummary, +) error { + if conflictErrorMsg != "" { if clusterSummary.Spec.ClusterProfileSpec.SyncMode != configv1beta1.SyncModeDryRun { // if in DryRun mode, ignore conflicts - return reports, deployer.NewConflictError(conflictErrorMsg) + return deployer.NewConflictError(conflictErrorMsg) } } - return reports, nil + if errorMsg != "" { + if clusterSummary.Spec.ClusterProfileSpec.SyncMode != configv1beta1.SyncModeDryRun { + // if in DryRun mode, ignore errors + return fmt.Errorf("%s", errorMsg) + } + } + + return nil } func addMetadata(policy *unstructured.Unstructured, resourceVersion string, profile client.Object, diff --git a/manifest/manifest.yaml b/manifest/manifest.yaml index 5c993b70..f0057ba2 100644 --- a/manifest/manifest.yaml +++ b/manifest/manifest.yaml @@ -1854,6 +1854,14 @@ spec: If set to true, Sveltos will attempt to deploy remaining resources in the ClusterProfile even if conflicts are detected for previous resources. type: boolean + continueOnError: + default: false + description: |- + By default (when ContinueOnError is unset or set to false), Sveltos stops deployment after + encountering the first error. + If set to true, Sveltos will attempt to deploy remaining resources in the ClusterProfile even + if errors are detected for previous resources. + type: boolean dependsOn: description: |- DependsOn specifies a list of other ClusterProfiles that this instance depends on. @@ -4688,6 +4696,14 @@ spec: If set to true, Sveltos will attempt to deploy remaining resources in the ClusterProfile even if conflicts are detected for previous resources. type: boolean + continueOnError: + default: false + description: |- + By default (when ContinueOnError is unset or set to false), Sveltos stops deployment after + encountering the first error. + If set to true, Sveltos will attempt to deploy remaining resources in the ClusterProfile even + if errors are detected for previous resources. + type: boolean dependsOn: description: |- DependsOn specifies a list of other ClusterProfiles that this instance depends on. 
@@ -6829,6 +6845,14 @@ spec: If set to true, Sveltos will attempt to deploy remaining resources in the ClusterProfile even if conflicts are detected for previous resources. type: boolean + continueOnError: + default: false + description: |- + By default (when ContinueOnError is unset or set to false), Sveltos stops deployment after + encountering the first error. + If set to true, Sveltos will attempt to deploy remaining resources in the ClusterProfile even + if errors are detected for previous resources. + type: boolean dependsOn: description: |- DependsOn specifies a list of other ClusterProfiles that this instance depends on. diff --git a/test/fv/continue_on_error_test.go b/test/fv/continue_on_error_test.go new file mode 100644 index 00000000..f35b45be --- /dev/null +++ b/test/fv/continue_on_error_test.go @@ -0,0 +1,192 @@ +/* +Copyright 2024. projectsveltos.io. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package fv_test + +import ( + "context" + "fmt" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + + configv1beta1 "github.com/projectsveltos/addon-controller/api/v1beta1" + "github.com/projectsveltos/addon-controller/controllers" + libsveltosv1beta1 "github.com/projectsveltos/libsveltos/api/v1beta1" +) + +const ( + validNamespace = `kind: Namespace +apiVersion: v1 +metadata: + name: %s` + + invalidNamespace = `kind: Namespace1 +apiVersion: v1 +metadata: + name: nginx` + + validDeployment = `apiVersion: apps/v1 +kind: Deployment +metadata: + name: nginx-deployment + namespace: %s +spec: + replicas: 2 # number of pods to run + selector: + matchLabels: + app: nginx + template: + metadata: + labels: + app: nginx + spec: + containers: + - name: nginx + image: nginx:latest + ports: + - containerPort: 80` +) + +var _ = Describe("Feature", Serial, func() { + const ( + namePrefix = "continue-on-error-" + ) + + It("With ContinueOnError set, Sveltos keeps deploying after an error", Label("NEW-FV"), func() { + configMapNs := randomString() + Byf("Create configMap's namespace %s", configMapNs) + ns := &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: configMapNs, + }, + } + Expect(k8sClient.Create(context.TODO(), ns)).To(Succeed()) + Byf("Create a configMap with valid and invalid resources") + resourceNamespace := randomString() + configMap := createConfigMapWithPolicy(configMapNs, namePrefix+randomString(), + fmt.Sprintf(validNamespace, resourceNamespace), + invalidNamespace, + fmt.Sprintf(validDeployment, resourceNamespace)) + Expect(k8sClient.Create(context.TODO(), configMap)).To(Succeed()) + currentConfigMap := &corev1.ConfigMap{} + Expect(k8sClient.Get(context.TODO(), + types.NamespacedName{Namespace: configMap.Namespace, Name: configMap.Name}, currentConfigMap)).To(Succeed()) + + Byf("Create a ClusterProfile matching Cluster %s/%s", kindWorkloadCluster.Namespace, kindWorkloadCluster.Name) + clusterProfile := getClusterProfile(namePrefix, 
map[string]string{key: value}) + clusterProfile.Spec.SyncMode = configv1beta1.SyncModeContinuous + Expect(k8sClient.Create(context.TODO(), clusterProfile)).To(Succeed()) + + verifyClusterProfileMatches(clusterProfile) + + verifyClusterSummary(controllers.ClusterProfileLabelName, + clusterProfile.Name, &clusterProfile.Spec, kindWorkloadCluster.Namespace, kindWorkloadCluster.Name) + + Byf("Update ClusterProfile %s to deploy helm charts and reference ConfigMap %s/%s", + clusterProfile.Name, configMap.Namespace, configMap.Name) + currentClusterProfile := &configv1beta1.ClusterProfile{} + Expect(k8sClient.Get(context.TODO(), types.NamespacedName{Name: clusterProfile.Name}, + currentClusterProfile)).To(Succeed()) + + // Cert-manager installation will fail as CRDs are not present and we are not deploying them + // Also set a timeout, otherwise helm takes too long before giving up on cert-manager failures (due to CRDs not being installed) + const two = 2 + timeout := metav1.Duration{Duration: two * time.Minute} + currentClusterProfile.Spec.HelmCharts = []configv1beta1.HelmChart{ + { + RepositoryURL: "https://charts.konghq.com", + RepositoryName: "kong", + ChartName: "kong/kong", + ChartVersion: "2.46.0", + ReleaseName: "kong", + ReleaseNamespace: "kong", + HelmChartAction: configv1beta1.HelmChartActionInstall, + }, + { + RepositoryURL: "https://charts.jetstack.io", + RepositoryName: "jetstack", + ChartName: "jetstack/cert-manager", + ChartVersion: "v1.16.2", + ReleaseName: "cert-manager", + ReleaseNamespace: "cert-manager", + HelmChartAction: configv1beta1.HelmChartActionInstall, + Options: &configv1beta1.HelmOptions{ + Timeout: &timeout, + }, + }, + { + RepositoryURL: "https://kyverno.github.io/kyverno/", + RepositoryName: "kyverno", + ChartName: "kyverno/kyverno", + ChartVersion: "v3.3.4", + ReleaseName: "kyverno-latest", + ReleaseNamespace: "kyverno", + HelmChartAction: configv1beta1.HelmChartActionInstall, + }, + { + RepositoryURL: "https://helm.nginx.com/stable/", + RepositoryName: "nginx-stable", + ChartName: "nginx-stable/nginx-ingress", + ChartVersion: "1.3.1", + ReleaseName: "nginx-latest", + ReleaseNamespace: "nginx", + HelmChartAction: configv1beta1.HelmChartActionInstall, + }, + } + currentClusterProfile.Spec.PolicyRefs = []configv1beta1.PolicyRef{ + { + Kind: string(libsveltosv1beta1.ConfigMapReferencedResourceKind), + Namespace: configMap.Namespace, + Name: configMap.Name, + }, + } + Byf("Setting ClusterProfile %s Spec.ContinueOnError true", clusterProfile.Name) + currentClusterProfile.Spec.ContinueOnError = true + + Expect(k8sClient.Update(context.TODO(), currentClusterProfile)).To(Succeed()) + + clusterSummary := verifyClusterSummary(controllers.ClusterProfileLabelName, + currentClusterProfile.Name, &currentClusterProfile.Spec, + kindWorkloadCluster.Namespace, kindWorkloadCluster.Name) + + charts := []configv1beta1.Chart{ + {ReleaseName: "kyverno-latest", ChartVersion: "3.3.4", Namespace: "kyverno"}, + {ReleaseName: "kong", ChartVersion: "2.46.0", Namespace: "kong"}, + {ReleaseName: "nginx-latest", ChartVersion: "1.3.1", Namespace: "nginx"}, + } + + verifyClusterConfiguration(configv1beta1.ClusterProfileKind, clusterProfile.Name, + clusterSummary.Spec.ClusterNamespace, clusterSummary.Spec.ClusterName, configv1beta1.FeatureHelm, + nil, charts) + + policies := []policy{ + {kind: "Namespace", name: resourceNamespace, namespace: "", group: ""}, + {kind: "Deployment", name: "nginx-deployment", namespace: resourceNamespace, group: "apps"}, + } + 
verifyClusterConfiguration(configv1beta1.ClusterProfileKind, clusterProfile.Name, + clusterSummary.Spec.ClusterNamespace, clusterSummary.Spec.ClusterName, configv1beta1.FeatureResources, + policies, nil) + + deleteClusterProfile(clusterProfile) + }) +})
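Reviewer note: a minimal sketch of how the new field would be consumed, assuming the usual v1beta1 ClusterProfile layout; the profile name, cluster labels, and ConfigMap reference below are made up for illustration. With continueOnError set to true, a failure on one chart or referenced resource no longer stops deployment of the remaining ones, which is what the new FV test above verifies.

apiVersion: config.projectsveltos.io/v1beta1
kind: ClusterProfile
metadata:
  name: continue-on-error-example   # hypothetical name
spec:
  clusterSelector:
    matchLabels:
      env: fv                       # hypothetical label
  syncMode: Continuous
  continueOnError: true             # new field added by this change
  helmCharts:
  - repositoryURL: https://kyverno.github.io/kyverno/
    repositoryName: kyverno
    chartName: kyverno/kyverno
    chartVersion: v3.3.4
    releaseName: kyverno-latest
    releaseNamespace: kyverno
    helmChartAction: Install
  policyRefs:
  - kind: ConfigMap
    namespace: default              # hypothetical reference
    name: my-resources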