(feat) introduce ContinueOnError flag

Sveltos, by default, deploys Helm charts and resources in the order they appear in a ClusterProfile/Profile, retrying on errors without proceeding to subsequent deployments. For example, if a ClusterProfile lists three Helm charts and the second fails to deploy, Sveltos will not attempt to deploy the third. However, enabling the `Spec.ContinueOnError` setting allows Sveltos to proceed with deploying the third Helm chart and then retry the failed second chart. This pull request introduces a new Functional Verification (FV) environment to support testing on a fresh cluster. Reusing Helm charts across tests creates excessive inter-test dependencies. For example, if test A fails to uninstall Kyverno, subsequent tests (like test B, which also uses Kyverno) will encounter Sveltos conflict errors, even when executed sequentially.
projectsveltos · Jan 13, 2025 · 4d7a1eb · 4d7a1eb
1 parent 6e5cfd7
commit 4d7a1eb
Show file tree

Hide file tree

Showing 10 changed files with 329 additions and 32 deletions.
diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
@@ -70,6 +70,34 @@ jobs:
       run: make create-cluster fv
       env:
         FV: true
+  NEW_FV:
+    runs-on: ubuntu-latest
+    steps:
+    - name: checkout
+      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+    - name: Set up Go
+      uses: actions/setup-go@3041bf56c941b39c61721a86cd11f3bb1338122a # v5.2.0
+      with:
+        go-version: 1.23.4
+    - name: Free Disk Space (Ubuntu)
+      uses: jlumbroso/free-disk-space@main
+      with:
+        # this might remove tools that are actually needed,
+        # if set to "true" but frees about 6 GB
+        tool-cache: false
+
+        # all of these default to true, but feel free to set to
+        # "false" if necessary for your workflow
+        android: true
+        dotnet: true
+        haskell: true
+        large-packages: true
+        docker-images: true
+        swap-storage: true
+    - name: fv
+      run: make create-cluster new-fv
+      env:
+        FV: true        
   FV_SHARDING:
     runs-on: ubuntu-latest
     steps:

diff --git a/Makefile b/Makefile
@@ -247,6 +247,10 @@ kind-test: test create-cluster fv ## Build docker image; start kind cluster; loa
 fv: $(KUBECTL) $(GINKGO) ## Run Sveltos Controller tests using existing cluster
 	cd test/fv; $(GINKGO) -nodes $(NUM_NODES) --label-filter='FV' --v --trace --randomize-all
 
+.PHONY: new-fv
+new-fv: $(KUBECTL) $(GINKGO) ## Run Sveltos Controller tests using existing cluster
+	cd test/fv; $(GINKGO) -nodes $(NUM_NODES) --label-filter='NEW-FV' --v --trace --randomize-all
+
 .PHONY: fv-sharding
 fv-sharding: $(KUBECTL) $(GINKGO) ## Run Sveltos Controller tests using existing cluster
 	$(KUBECTL) patch cluster clusterapi-workload  -n default --type json -p '[{ "op": "add", "path": "/metadata/annotations/sharding.projectsveltos.io~1key", "value": "shard1" }]'

diff --git a/api/v1beta1/spec.go b/api/v1beta1/spec.go
@@ -638,6 +638,14 @@ type Spec struct {
 	// +optional
 	ContinueOnConflict bool `json:"continueOnConflict,omitempty"`
 
+	// By default (when ContinueOnError is unset or set to false), Sveltos stops deployment after
+	// encountering the first error.
+	// If set to true, Sveltos will attempt to deploy remaining resources in the ClusterProfile even
+	// if errors are detected for previous resources.
+	// +kubebuilder:default:=false
+	// +optional
+	ContinueOnError bool `json:"continueOnError,omitempty"`
+
 	// The maximum number of clusters that can be updated concurrently.
 	// Value can be an absolute number (ex: 5) or a percentage of desired cluster (ex: 10%).
 	// Defaults to 100%.

diff --git a/config/crd/bases/config.projectsveltos.io_clusterprofiles.yaml b/config/crd/bases/config.projectsveltos.io_clusterprofiles.yaml
@@ -1073,6 +1073,14 @@ spec:
                   If set to true, Sveltos will attempt to deploy remaining resources in the ClusterProfile even
                   if conflicts are detected for previous resources.
                 type: boolean
+              continueOnError:
+                default: false
+                description: |-
+                  By default (when ContinueOnError is unset or set to false), Sveltos stops deployment after
+                  encountering the first error.
+                  If set to true, Sveltos will attempt to deploy remaining resources in the ClusterProfile even
+                  if errors are detected for previous resources.
+                type: boolean
               dependsOn:
                 description: |-
                   DependsOn specifies a list of other ClusterProfiles that this instance depends on.

diff --git a/config/crd/bases/config.projectsveltos.io_clustersummaries.yaml b/config/crd/bases/config.projectsveltos.io_clustersummaries.yaml
@@ -1099,6 +1099,14 @@ spec:
                       If set to true, Sveltos will attempt to deploy remaining resources in the ClusterProfile even
                       if conflicts are detected for previous resources.
                     type: boolean
+                  continueOnError:
+                    default: false
+                    description: |-
+                      By default (when ContinueOnError is unset or set to false), Sveltos stops deployment after
+                      encountering the first error.
+                      If set to true, Sveltos will attempt to deploy remaining resources in the ClusterProfile even
+                      if errors are detected for previous resources.
+                    type: boolean
                   dependsOn:
                     description: |-
                       DependsOn specifies a list of other ClusterProfiles that this instance depends on.

diff --git a/config/crd/bases/config.projectsveltos.io_profiles.yaml b/config/crd/bases/config.projectsveltos.io_profiles.yaml
@@ -1073,6 +1073,14 @@ spec:
                   If set to true, Sveltos will attempt to deploy remaining resources in the ClusterProfile even
                   if conflicts are detected for previous resources.
                 type: boolean
+              continueOnError:
+                default: false
+                description: |-
+                  By default (when ContinueOnError is unset or set to false), Sveltos stops deployment after
+                  encountering the first error.
+                  If set to true, Sveltos will attempt to deploy remaining resources in the ClusterProfile even
+                  if errors are detected for previous resources.
+                type: boolean
               dependsOn:
                 description: |-
                   DependsOn specifies a list of other ClusterProfiles that this instance depends on.

diff --git a/controllers/handlers_helm.go b/controllers/handlers_helm.go
@@ -515,6 +515,7 @@ func walkChartsAndDeploy(ctx context.Context, c client.Client, clusterSummary *c
 	kubeconfig string, mgmtResources map[string]*unstructured.Unstructured, logger logr.Logger,
 ) ([]configv1beta1.ReleaseReport, []configv1beta1.Chart, error) {
 
+	errorMsg := ""
 	conflictErrorMessage := ""
 	releaseReports := make([]configv1beta1.ReleaseReport, 0, len(clusterSummary.Spec.ClusterProfileSpec.HelmCharts))
 	chartDeployed := make([]configv1beta1.Chart, 0, len(clusterSummary.Spec.ClusterProfileSpec.HelmCharts))
@@ -560,6 +561,11 @@ func walkChartsAndDeploy(ctx context.Context, c client.Client, clusterSummary *c
 		var currentRelease *releaseInfo
 		currentRelease, report, err = handleChart(ctx, clusterSummary, mgmtResources, instantiatedChart, kubeconfig, logger)
 		if err != nil {
+			if clusterSummary.Spec.ClusterProfileSpec.ContinueOnError {
+				errorMsg += fmt.Sprintf("chart: %s, release: %s, %v\n",
+					instantiatedChart.ChartName, instantiatedChart.ReleaseName, err)
+				continue
+			}
 			return releaseReports, chartDeployed, err
 		}
 
@@ -588,6 +594,12 @@ func walkChartsAndDeploy(ctx context.Context, c client.Client, clusterSummary *c
 		}
 	}
 
+	// This has to come before conflictErrorMessage as conflictErrorMessage is not retriable
+	// while any other generic error is
+	if errorMsg != "" {
+		return releaseReports, chartDeployed, fmt.Errorf("%s", errorMsg)
+	}
+
 	if conflictErrorMessage != "" {
 		// for helm chart a conflict is a non retriable error.
 		// when profile currently managing the helm chart is removed, all
@@ -1054,10 +1066,7 @@ func uninstallRelease(ctx context.Context, clusterSummary *configv1beta1.Cluster
 		return err
 	}
 
-	uninstallClient, err := getHelmUninstallClient(helmChart, actionConfig)
-	if err != nil {
-		return err
-	}
+	uninstallClient := getHelmUninstallClient(helmChart, actionConfig)
 
 	_, err = uninstallClient.Run(releaseName)
 	if err != nil {
@@ -1112,11 +1121,7 @@ func upgradeRelease(ctx context.Context, clusterSummary *configv1beta1.ClusterSu
 
 	patches = append(patches, driftExclusionPatches...)
 
-	upgradeClient, err := getHelmUpgradeClient(requestedChart, actionConfig, patches)
-	if err != nil {
-		logger.V(logs.LogInfo).Info(fmt.Sprintf("failed to get helm upgrade client: %v", err))
-		return err
-	}
+	upgradeClient := getHelmUpgradeClient(requestedChart, actionConfig, patches)
 
 	cp, err := upgradeClient.ChartPathOptions.LocateChart(chartName, settings)
 	if err != nil {
@@ -2241,10 +2246,7 @@ func getHelmInstallClient(requestedChart *configv1beta1.HelmChart, kubeconfig st
 	installClient.DisableHooks = getDisableHooksHelmInstallValue(requestedChart.Options)
 	installClient.DisableOpenAPIValidation = getDisableOpenAPIValidationValue(requestedChart.Options)
 	if timeout := getTimeoutValue(requestedChart.Options); timeout != nil {
-		installClient.Timeout, err = time.ParseDuration(timeout.String())
-		if err != nil {
-			return nil, err
-		}
+		installClient.Timeout = timeout.Duration
 	}
 	installClient.SkipSchemaValidation = getSkipSchemaValidation(requestedChart.Options)
 	installClient.Replace = getReplaceValue(requestedChart.Options)
@@ -2262,7 +2264,7 @@ func getHelmInstallClient(requestedChart *configv1beta1.HelmChart, kubeconfig st
 }
 
 func getHelmUpgradeClient(requestedChart *configv1beta1.HelmChart, actionConfig *action.Configuration,
-	patches []libsveltosv1beta1.Patch) (*action.Upgrade, error) {
+	patches []libsveltosv1beta1.Patch) *action.Upgrade {
 
 	upgradeClient := action.NewUpgrade(actionConfig)
 	upgradeClient.Install = true
@@ -2275,11 +2277,7 @@ func getHelmUpgradeClient(requestedChart *configv1beta1.HelmChart, actionConfig
 	upgradeClient.DisableHooks = getDisableHooksHelmUpgradeValue(requestedChart.Options)
 	upgradeClient.DisableOpenAPIValidation = getDisableOpenAPIValidationValue(requestedChart.Options)
 	if timeout := getTimeoutValue(requestedChart.Options); timeout != nil {
-		var err error
-		upgradeClient.Timeout, err = time.ParseDuration(timeout.String())
-		if err != nil {
-			return nil, err
-		}
+		upgradeClient.Timeout = timeout.Duration
 	}
 	upgradeClient.SkipSchemaValidation = getSkipSchemaValidation(requestedChart.Options)
 	upgradeClient.ResetValues = getResetValues(requestedChart.Options)
@@ -2301,19 +2299,17 @@ func getHelmUpgradeClient(requestedChart *configv1beta1.HelmChart, actionConfig
 		upgradeClient.PostRenderer = &patcher.CustomPatchPostRenderer{Patches: patches}
 	}
 
-	return upgradeClient, nil
+	return upgradeClient
 }
 
-func getHelmUninstallClient(requestedChart *configv1beta1.HelmChart, actionConfig *action.Configuration) (*action.Uninstall, error) {
+func getHelmUninstallClient(requestedChart *configv1beta1.HelmChart, actionConfig *action.Configuration,
+) *action.Uninstall {
+
 	uninstallClient := action.NewUninstall(actionConfig)
 	uninstallClient.DryRun = false
 	if requestedChart != nil {
 		if timeout := getTimeoutValue(requestedChart.Options); timeout != nil {
-			var err error
-			uninstallClient.Timeout, err = time.ParseDuration(timeout.String())
-			if err != nil {
-				return nil, err
-			}
+			uninstallClient.Timeout = timeout.Duration
 		}
 
 		uninstallClient.Description = getDescriptionValue(requestedChart.Options)
@@ -2322,7 +2318,7 @@ func getHelmUninstallClient(requestedChart *configv1beta1.HelmChart, actionConfi
 		uninstallClient.KeepHistory = getKeepHistoryValue(requestedChart.Options)
 		uninstallClient.DeletionPropagation = getDeletionPropagation(requestedChart.Options)
 	}
-	return uninstallClient, nil
+	return uninstallClient
 }
 
 func addExtraMetadata(ctx context.Context, requestedChart *configv1beta1.HelmChart,

diff --git a/controllers/handlers_utils.go b/controllers/handlers_utils.go
@@ -448,12 +448,17 @@ func deployUnstructured(ctx context.Context, deployingToMgmtCluster bool, destCo
 	}
 
 	conflictErrorMsg := ""
+	errorMsg := ""
 	reports = make([]configv1beta1.ResourceReport, 0)
 	for i := range referencedUnstructured {
 		policy := referencedUnstructured[i]
 
 		err := adjustNamespace(policy, destConfig)
 		if err != nil {
+			if clusterSummary.Spec.ClusterProfileSpec.ContinueOnError {
+				errorMsg += fmt.Sprintf("%v", err)
+				continue
+			}
 			return nil, err
 		}
 
@@ -524,22 +529,38 @@ func deployUnstructured(ctx context.Context, deployingToMgmtCluster bool, destCo
 		}
 
 		policy, err = updateResource(ctx, dr, clusterSummary, policy, subresources, logger)
+		resource.LastAppliedTime = &metav1.Time{Time: time.Now()}
+		reports = append(reports, *generateResourceReport(policyHash, resourceInfo, policy, resource))
 		if err != nil {
+			if clusterSummary.Spec.ClusterProfileSpec.ContinueOnError {
+				errorMsg += fmt.Sprintf("%v", err)
+				continue
+			}
 			return reports, err
 		}
-
-		resource.LastAppliedTime = &metav1.Time{Time: time.Now()}
-		reports = append(reports, *generateResourceReport(policyHash, resourceInfo, policy, resource))
 	}
 
+	return reports, handleDeployUnstructuredErrors(conflictErrorMsg, errorMsg, clusterSummary)
+}
+
+func handleDeployUnstructuredErrors(conflictErrorMsg, errorMsg string, clusterSummary *configv1beta1.ClusterSummary,
+) error {
+
 	if conflictErrorMsg != "" {
 		if clusterSummary.Spec.ClusterProfileSpec.SyncMode != configv1beta1.SyncModeDryRun {
 			// if in DryRun mode, ignore conflicts
-			return reports, deployer.NewConflictError(conflictErrorMsg)
+			return deployer.NewConflictError(conflictErrorMsg)
 		}
 	}
 
-	return reports, nil
+	if errorMsg != "" {
+		if clusterSummary.Spec.ClusterProfileSpec.SyncMode != configv1beta1.SyncModeDryRun {
+			// if in DryRun mode, ignore errors
+			return fmt.Errorf("%s", errorMsg)
+		}
+	}
+
+	return nil
 }
 
 func addMetadata(policy *unstructured.Unstructured, resourceVersion string, profile client.Object,

diff --git a/manifest/manifest.yaml b/manifest/manifest.yaml
@@ -1854,6 +1854,14 @@ spec:
                   If set to true, Sveltos will attempt to deploy remaining resources in the ClusterProfile even
                   if conflicts are detected for previous resources.
                 type: boolean
+              continueOnError:
+                default: false
+                description: |-
+                  By default (when ContinueOnError is unset or set to false), Sveltos stops deployment after
+                  encountering the first error.
+                  If set to true, Sveltos will attempt to deploy remaining resources in the ClusterProfile even
+                  if errors are detected for previous resources.
+                type: boolean
               dependsOn:
                 description: |-
                   DependsOn specifies a list of other ClusterProfiles that this instance depends on.
@@ -4688,6 +4696,14 @@ spec:
                       If set to true, Sveltos will attempt to deploy remaining resources in the ClusterProfile even
                       if conflicts are detected for previous resources.
                     type: boolean
+                  continueOnError:
+                    default: false
+                    description: |-
+                      By default (when ContinueOnError is unset or set to false), Sveltos stops deployment after
+                      encountering the first error.
+                      If set to true, Sveltos will attempt to deploy remaining resources in the ClusterProfile even
+                      if errors are detected for previous resources.
+                    type: boolean
                   dependsOn:
                     description: |-
                       DependsOn specifies a list of other ClusterProfiles that this instance depends on.
@@ -6829,6 +6845,14 @@ spec:
                   If set to true, Sveltos will attempt to deploy remaining resources in the ClusterProfile even
                   if conflicts are detected for previous resources.
                 type: boolean
+              continueOnError:
+                default: false
+                description: |-
+                  By default (when ContinueOnError is unset or set to false), Sveltos stops deployment after
+                  encountering the first error.
+                  If set to true, Sveltos will attempt to deploy remaining resources in the ClusterProfile even
+                  if errors are detected for previous resources.
+                type: boolean
               dependsOn:
                 description: |-
                   DependsOn specifies a list of other ClusterProfiles that this instance depends on.