diff --git a/controllers/secretproviderclasspodstatus_controller.go b/controllers/secretproviderclasspodstatus_controller.go index 8fbee8c14..5687d054d 100644 --- a/controllers/secretproviderclasspodstatus_controller.go +++ b/controllers/secretproviderclasspodstatus_controller.go @@ -65,6 +65,7 @@ type SecretProviderClassPodStatusReconciler struct { writer client.Writer eventRecorder record.EventRecorder driverName string + reporter StatsReporter } // New creates a new SecretProviderClassPodStatusReconciler @@ -73,6 +74,10 @@ func New(driverName string, mgr manager.Manager, nodeID string) (*SecretProvider kubeClient := kubernetes.NewForConfigOrDie(mgr.GetConfig()) eventBroadcaster.StartRecordingToSink(&clientcorev1.EventSinkImpl{Interface: kubeClient.CoreV1().Events("")}) recorder := eventBroadcaster.NewRecorder(scheme.Scheme, corev1.EventSource{Component: "csi-secrets-store-controller"}) + sr, err := newStatsReporter() + if err != nil { + return nil, err + } return &SecretProviderClassPodStatusReconciler{ Client: mgr.GetClient(), @@ -83,6 +88,7 @@ func New(driverName string, mgr manager.Manager, nodeID string) (*SecretProvider writer: mgr.GetClient(), eventRecorder: recorder, driverName: driverName, + reporter: sr, }, nil } @@ -266,6 +272,9 @@ func (r *SecretProviderClassPodStatusReconciler) Reconcile(ctx context.Context, return ctrl.Result{}, nil } + // if SecretObjects defined in the SPC, record the time to report sync_k8s_secret_duration_sec metric + begin := time.Now() + // determine which pod volume this is associated with podVol := k8sutil.SPCVolume(pod, r.driverName, spc.Name) if podVol == nil { @@ -365,6 +374,9 @@ func (r *SecretProviderClassPodStatusReconciler) Reconcile(ctx context.Context, return ctrl.Result{Requeue: true}, nil } + r.reporter.ReportSyncSecretCtMetric(ctx, string(spc.Spec.Provider), spcPodStatus.Namespace, spc.Name) + r.reporter.ReportSyncSecretDuration(ctx, time.Since(begin).Seconds()) + klog.InfoS("reconcile complete", "spc", klog.KObj(spc), "pod", klog.KObj(pod), "spcps", klog.KObj(spcPodStatus)) // requeue the spc pod status again after 5mins to check if secret and ownerRef exists // and haven't been modified. If secret doesn't exist, then this requeue will ensure it's diff --git a/controllers/stats_reporter.go b/controllers/stats_reporter.go new file mode 100644 index 000000000..c5413df3f --- /dev/null +++ b/controllers/stats_reporter.go @@ -0,0 +1,80 @@ +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controllers + +import ( + "context" + "runtime" + + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" + "go.opentelemetry.io/otel/metric/global" +) + +const ( + scope = "sigs.k8s.io/secrets-store-csi-driver" +) + +var ( + providerKey = "provider" + osTypeKey = "os_type" + runtimeOS = runtime.GOOS + namespaceKey = "namespace" + spcKey = "secret_provider_class" +) + +type reporter struct { + syncK8sSecretTotal metric.Int64Counter + syncK8sSecretDuration metric.Float64Histogram +} + +type StatsReporter interface { + ReportSyncSecretCtMetric(ctx context.Context, provider, namespace, spc string) + ReportSyncSecretDuration(ctx context.Context, duration float64) +} + +func newStatsReporter() (StatsReporter, error) { + var err error + + r := &reporter{} + meter := global.Meter(scope) + + if r.syncK8sSecretTotal, err = meter.Int64Counter("sync_k8s_secret", metric.WithDescription("Total number of k8s secrets synced")); err != nil { + return nil, err + } + if r.syncK8sSecretDuration, err = meter.Float64Histogram("sync_k8s_secret_duration_sec", metric.WithDescription("Distribution of how long it took to sync k8s secret")); err != nil { + return nil, err + } + return r, nil +} + +func (r reporter) ReportSyncSecretCtMetric(ctx context.Context, provider, namespace, spc string) { + opt := metric.WithAttributes( + attribute.Key(providerKey).String(provider), + attribute.Key(osTypeKey).String(runtimeOS), + attribute.Key(namespaceKey).String(namespace), + attribute.Key(spcKey).String(spc), + ) + r.syncK8sSecretTotal.Add(ctx, 1, opt) +} + +func (r reporter) ReportSyncSecretDuration(ctx context.Context, duration float64) { + opt := metric.WithAttributes( + attribute.Key(osTypeKey).String(runtimeOS), + ) + r.syncK8sSecretDuration.Record(ctx, duration, opt) +} diff --git a/docs/book/src/topics/metrics.md b/docs/book/src/topics/metrics.md index a3a2f67c6..675dd6366 100644 --- a/docs/book/src/topics/metrics.md +++ b/docs/book/src/topics/metrics.md @@ -6,17 +6,17 @@ Prometheus is the only exporter that's currently supported with the driver. ## List of metrics provided by the driver -| Metric | Description | Tags | -| ------------------------------- | ------------------------------------------------------------------------- | --------------------------------------------------------------------------------- | -| total_node_publish | Total number of successful volume mount requests | `os_type=`
`provider=` | -| total_node_unpublish | Total number of successful volume unmount requests | `os_type=` | -| total_node_publish_error | Total number of errors with volume mount requests | `os_type=`
`provider=`
`error_type=` | -| total_node_unpublish_error | Total number of errors with volume unmount requests | `os_type=` | -| total_sync_k8s_secret | Total number of k8s secrets synced | `os_type=`
`provider=` | -| sync_k8s_secret_duration_sec | Distribution of how long it took to sync k8s secret | `os_type=` | -| total_rotation_reconcile | Total number of rotation reconciles | `os_type=`
`rotated=` | -| total_rotation_reconcile_error | Total number of rotation reconciles with error | `os_type=`
`rotated=`
`error_type=` | -| rotation_reconcile_duration_sec | Distribution of how long it took to rotate secrets-store content for pods | `os_type=` | +| Metric | Description | Tags | +|---------------------------------|---------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| node_publish_total | Total number of successful volume mount requests | `os_type=`
`provider=`
`pod_name=`
`pod_namespace=`
`secret_provider_class=` | +| node_unpublish_total | Total number of successful volume unmount requests | `os_type=` | +| node_publish_error_total | Total number of errors with volume mount requests | `os_type=`
`provider=`
`error_type=`
`pod_name=`
`pod_namespace=`
`secret_provider_class=` | +| node_unpublish_error_total | Total number of errors with volume unmount requests | `os_type=` | +| sync_k8s_secret_total | Total number of k8s secrets synced | `os_type=`
`provider=`
`namespace=`
`secret_provider_class=` | +| sync_k8s_secret_duration_sec | Distribution of how long it took to sync k8s secret | `os_type=` | +| rotation_reconcile_total | Total number of rotation reconciles | `os_type=`
`rotated=`
`pod_name=`
`pod_namespace=`
`secret_provider_class=` | +| rotation_reconcile_error_total | Total number of rotation reconciles with error | `os_type=`
`rotated=`
`error_type=`
`pod_name=`
`pod_namespace=`
`secret_provider_class=` | +| rotation_reconcile_duration_sec | Distribution of how long it took to rotate secrets-store content for pods | `os_type=`
`pod_name=`
`pod_namespace=`
`secret_provider_class=` | Metrics are served from port 8095, but this port is not exposed outside the pod by default. Use kubectl port-forward to access the metrics over localhost: @@ -47,17 +47,43 @@ sync_k8s_secret_duration_sec_bucket{os_type="linux",le="30"} 1 sync_k8s_secret_duration_sec_bucket{os_type="linux",le="+Inf"} 1 sync_k8s_secret_duration_sec_sum{os_type="linux"} 0.3115892 sync_k8s_secret_duration_sec_count{os_type="linux"} 1 -# HELP total_node_publish Total number of node publish calls -# TYPE total_node_publish counter -total_node_publish{os_type="linux",provider="azure"} 1 -# HELP total_node_publish_error Total number of node publish calls with error -# TYPE total_node_publish_error counter -total_node_publish_error{error_type="ProviderBinaryNotFound",os_type="linux",provider="azure"} 2 -total_node_publish_error{error_type="SecretProviderClassNotFound",os_type="linux",provider=""} 4 -# HELP total_node_unpublish Total number of node unpublish calls -# TYPE total_node_unpublish counter -total_node_unpublish{os_type="linux"} 1 -# HELP total_sync_k8s_secret Total number of k8s secrets synced -# TYPE total_sync_k8s_secret counter -total_sync_k8s_secret{os_type="linux",provider="azure"} 1 -``` + +# HELP sync_k8s_secret_total Total number of k8s secrets synced +# TYPE sync_k8s_secret_total counter +sync_k8s_secret_total{namespace="csi-test-secret-ns",os_type="linux",provider="azure",secret_provider_class="csi-test-spc"} 1 + +# HELP rotation_reconcile_duration_sec Distribution of how long it took to rotate secrets-store content for pods +# TYPE rotation_reconcile_duration_sec histogram +rotation_reconcile_duration_sec_bucket{os_type="linux",le="0.1"} 0 +rotation_reconcile_duration_sec_bucket{os_type="linux",le="0.2"} 0 +rotation_reconcile_duration_sec_bucket{os_type="linux",le="0.3"} 0 +rotation_reconcile_duration_sec_bucket{os_type="linux",le="0.4"} 1 +rotation_reconcile_duration_sec_bucket{os_type="linux",le="0.5"} 1 +rotation_reconcile_duration_sec_bucket{os_type="linux",le="1"} 1 +rotation_reconcile_duration_sec_bucket{os_type="linux",le="1.5"} 1 +rotation_reconcile_duration_sec_bucket{os_type="linux",le="2"} 1 +rotation_reconcile_duration_sec_bucket{os_type="linux",le="2.5"} 1 +rotation_reconcile_duration_sec_bucket{os_type="linux",le="3"} 1 +rotation_reconcile_duration_sec_bucket{os_type="linux",le="5"} 1 +rotation_reconcile_duration_sec_bucket{os_type="linux",le="10"} 1 +rotation_reconcile_duration_sec_bucket{os_type="linux",le="15"} 1 +rotation_reconcile_duration_sec_bucket{os_type="linux",le="30"} 1 +rotation_reconcile_duration_sec_bucket{os_type="linux",le="+Inf"} 1 +rotation_reconcile_duration_sec_sum{os_type="linux",} 0.3115892 +rotation_reconcile_duration_sec_count{os_type="linux"} 1 +# HELP rotation_reconcile_total Total number of rotation reconciles +# TYPE rotation_reconcile_total counter +rotation_reconcile_total{os_type="linux",pod_name="csi-test-app-wcsxk",pod_namespace="csi-test-secret-ns",provider="azure",rotated="false",secret_provider_class="csi-test-spc"} 1 +# HELP rotation_reconcile_error_total Total number of rotation reconciles with error +# TYPE rotation_reconcile_error_total counter +rotation_reconcile_error_total{error_type="GRPCProviderError",os_type="linux",pod_name="csi-test-app-wcsxk",pod_namespace="csi-test-secret-ns",provider="azure",rotated="false",secret_provider_class="csi-test-spc"} 12 +# HELP node_publish_total Total number of node publish calls +# TYPE node_publish_total counter +node_publish_total{os_type="linux",pod_name="csi-test-app-wcsxk",pod_namespace="csi-test-secret-ns",provider="azure",secret_provider_class="csi-test-spc"} 1 +# HELP node_publish_error_total Total number of node publish calls with error +# TYPE node_publish_error_total counter +node_publish_error_total{error_type="ProviderBinaryNotFound",os_type="linux",pod_name="csi-test-app-wcsxk",pod_namespace="csi-test-secret-ns",provider="azure",secret_provider_class="csi-test-spc"} 7 +# HELP node_unpublish_total Total number of node unpublish calls +# TYPE node_unpublish_total counter +node_unpublish_total{os_type="linux"} 1 +``` \ No newline at end of file diff --git a/pkg/rotation/reconciler.go b/pkg/rotation/reconciler.go index e82d12f79..8e154a98a 100644 --- a/pkg/rotation/reconciler.go +++ b/pkg/rotation/reconciler.go @@ -251,13 +251,16 @@ func (r *Reconciler) reconcile(ctx context.Context, spcps *secretsstorev1.Secret // after the provider mount request is complete var requiresUpdate bool var providerName string + podName := spcps.Status.PodName + podNamespace := spcps.Namespace + secretProviderClass := spcps.Status.SecretProviderClassName defer func() { if err != nil { - r.reporter.reportRotationErrorCtMetric(ctx, providerName, errorReason, requiresUpdate) + r.reporter.reportRotationErrorCtMetric(ctx, providerName, podName, podNamespace, secretProviderClass, errorReason, requiresUpdate) return } - r.reporter.reportRotationCtMetric(ctx, providerName, requiresUpdate) + r.reporter.reportRotationCtMetric(ctx, providerName, podName, podNamespace, secretProviderClass, requiresUpdate) r.reporter.reportRotationDuration(ctx, time.Since(begin).Seconds()) }() @@ -266,14 +269,14 @@ func (r *Reconciler) reconcile(ctx context.Context, spcps *secretsstorev1.Secret err = r.cache.Get( ctx, client.ObjectKey{ - Namespace: spcps.Namespace, - Name: spcps.Status.PodName, + Namespace: podNamespace, + Name: podName, }, pod, ) if err != nil { errorReason = internalerrors.PodNotFound - return fmt.Errorf("failed to get pod %s/%s, err: %w", spcps.Namespace, spcps.Status.PodName, err) + return fmt.Errorf("failed to get pod %s/%s, err: %w", podNamespace, podName, err) } // skip rotation if the pod is being terminated // or the pod is in succeeded state (for jobs that complete aren't gc yet) @@ -289,14 +292,14 @@ func (r *Reconciler) reconcile(ctx context.Context, spcps *secretsstorev1.Secret err = r.cache.Get( ctx, client.ObjectKey{ - Namespace: spcps.Namespace, - Name: spcps.Status.SecretProviderClassName, + Namespace: podNamespace, + Name: secretProviderClass, }, spc, ) if err != nil { errorReason = internalerrors.SecretProviderClassNotFound - return fmt.Errorf("failed to get secret provider class %s/%s, err: %w", spcps.Namespace, spcps.Status.SecretProviderClassName, err) + return fmt.Errorf("failed to get secret provider class %s/%s, err: %w", podNamespace, secretProviderClass, err) } // determine which pod volume this is associated with @@ -359,16 +362,16 @@ func (r *Reconciler) reconcile(ctx context.Context, spcps *secretsstorev1.Secret // This comprises the secret parameter in the MountRequest to the provider if nodePublishSecretRef != nil { // read secret from the informer cache - secret, err := r.secretStore.GetNodePublishSecretRefSecret(nodePublishSecretRef.Name, spcps.Namespace) + secret, err := r.secretStore.GetNodePublishSecretRefSecret(nodePublishSecretRef.Name, podNamespace) if err != nil { if apierrors.IsNotFound(err) { klog.ErrorS(err, - fmt.Sprintf("nodePublishSecretRef not found. If the secret with name exists in namespace, label the secret by running 'kubectl label secret %s %s=true -n %s", nodePublishSecretRef.Name, controllers.SecretUsedLabel, spcps.Namespace), - "name", nodePublishSecretRef.Name, "namespace", spcps.Namespace) + fmt.Sprintf("nodePublishSecretRef not found. If the secret with name exists in namespace, label the secret by running 'kubectl label secret %s %s=true -n %s", nodePublishSecretRef.Name, controllers.SecretUsedLabel, podNamespace), + "name", nodePublishSecretRef.Name, "namespace", podNamespace) } errorReason = internalerrors.NodePublishSecretRefNotFound - r.generateEvent(pod, corev1.EventTypeWarning, mountRotationFailedReason, fmt.Sprintf("failed to get node publish secret %s/%s, err: %+v", spcps.Namespace, nodePublishSecretRef.Name, err)) - return fmt.Errorf("failed to get node publish secret %s/%s, err: %w", spcps.Namespace, nodePublishSecretRef.Name, err) + r.generateEvent(pod, corev1.EventTypeWarning, mountRotationFailedReason, fmt.Sprintf("failed to get node publish secret %s/%s, err: %+v", podNamespace, nodePublishSecretRef.Name, err)) + return fmt.Errorf("failed to get node publish secret %s/%s, err: %w", podNamespace, nodePublishSecretRef.Name, err) } for k, v := range secret.Data { @@ -401,7 +404,7 @@ func (r *Reconciler) reconcile(ctx context.Context, spcps *secretsstorev1.Secret newObjectVersions, errorReason, err := secretsstore.MountContent(ctx, providerClient, string(paramsJSON), string(secretsJSON), spcps.Status.TargetPath, string(permissionJSON), oldObjectVersions) if err != nil { r.generateEvent(pod, corev1.EventTypeWarning, mountRotationFailedReason, fmt.Sprintf("provider mount err: %+v", err)) - return fmt.Errorf("failed to rotate objects for pod %s/%s, err: %w", spcps.Namespace, spcps.Status.PodName, err) + return fmt.Errorf("failed to rotate objects for pod %s/%s, err: %w", podNamespace, podName, err) } // compare the old object versions and new object versions to check if any of the objects @@ -488,7 +491,7 @@ func (r *Reconciler) reconcile(ctx context.Context, spcps *secretsstorev1.Secret patchFn := func() (bool, error) { // patch secret data with the new contents - if err := r.patchSecret(ctx, secretObj.SecretName, spcps.Namespace, datamap); err != nil { + if err := r.patchSecret(ctx, secretObj.SecretName, podNamespace, datamap); err != nil { // syncSecret.enabled is set to false by default in the helm chart for installing the driver in v0.0.23+ // that would result in a forbidden error, so generate a warning that can be helpful for debugging if apierrors.IsForbidden(err) { diff --git a/pkg/rotation/stats_reporter.go b/pkg/rotation/stats_reporter.go index bc3fa8066..3d173e25f 100644 --- a/pkg/rotation/stats_reporter.go +++ b/pkg/rotation/stats_reporter.go @@ -30,11 +30,14 @@ const ( ) var ( - providerKey = "provider" - errorKey = "error_type" - osTypeKey = "os_type" - rotatedKey = "rotated" - runtimeOS = runtime.GOOS + providerKey = "provider" + errorKey = "error_type" + osTypeKey = "os_type" + rotatedKey = "rotated" + runtimeOS = runtime.GOOS + podNameKey = "pod_name" + podNamespaceKey = "pod_namespace" + spcKey = "secret_provider_class" ) type reporter struct { @@ -44,8 +47,8 @@ type reporter struct { } type StatsReporter interface { - reportRotationCtMetric(ctx context.Context, provider string, wasRotated bool) - reportRotationErrorCtMetric(ctx context.Context, provider, errType string, wasRotated bool) + reportRotationCtMetric(ctx context.Context, provider, podName, podNamespace, spc string, wasRotated bool) + reportRotationErrorCtMetric(ctx context.Context, provider, podName, podNamespace, spc, errType string, wasRotated bool) reportRotationDuration(ctx context.Context, duration float64) } @@ -67,21 +70,27 @@ func newStatsReporter() (StatsReporter, error) { return r, nil } -func (r *reporter) reportRotationCtMetric(ctx context.Context, provider string, wasRotated bool) { +func (r *reporter) reportRotationCtMetric(ctx context.Context, provider, podName, podNamespace, spc string, wasRotated bool) { opt := metric.WithAttributes( attribute.Key(providerKey).String(provider), attribute.Key(osTypeKey).String(runtimeOS), attribute.Key(rotatedKey).Bool(wasRotated), + attribute.Key(podNameKey).String(podName), + attribute.Key(podNamespaceKey).String(podNamespace), + attribute.Key(spcKey).String(spc), ) r.rotationReconcileTotal.Add(ctx, 1, opt) } -func (r *reporter) reportRotationErrorCtMetric(ctx context.Context, provider, errType string, wasRotated bool) { +func (r *reporter) reportRotationErrorCtMetric(ctx context.Context, provider, podName, podNamespace, spc, errType string, wasRotated bool) { opt := metric.WithAttributes( attribute.Key(providerKey).String(provider), attribute.Key(errorKey).String(errType), attribute.Key(osTypeKey).String(runtimeOS), attribute.Key(rotatedKey).Bool(wasRotated), + attribute.Key(podNameKey).String(podName), + attribute.Key(podNamespaceKey).String(podNamespace), + attribute.Key(spcKey).String(spc), ) r.rotationReconcileErrorTotal.Add(ctx, 1, opt) } diff --git a/pkg/secrets-store/mocks/stats_reporter_mock.go b/pkg/secrets-store/mocks/stats_reporter_mock.go index 669674adb..c2aa00722 100644 --- a/pkg/secrets-store/mocks/stats_reporter_mock.go +++ b/pkg/secrets-store/mocks/stats_reporter_mock.go @@ -18,43 +18,57 @@ package mocks // import sigs.k8s.io/secrets-store-csi-driver/pkg/secrets-store/m import "context" +type MetricDetails struct { + Provider string + PodName string + PodNamespace string + Spc string + ErrorType string +} + type FakeReporter struct { reportNodePublishCtMetricInvoked int reportNodeUnPublishCtMetricInvoked int reportNodePublishErrorCtMetricInvoked int reportNodeUnPublishErrorCtMetricInvoked int - reportSyncK8SecretCtMetricInvoked int - reportSyncK8SecretDurationInvoked int + metricDetails []MetricDetails } func NewFakeReporter() *FakeReporter { - return &FakeReporter{} + return &FakeReporter{ + metricDetails: []MetricDetails{}, + } } -func (f *FakeReporter) ReportNodePublishCtMetric(ctx context.Context, provider string) { +func (f *FakeReporter) ReportNodePublishCtMetric(ctx context.Context, provider, podName, podNamespace, spc string) { f.reportNodePublishCtMetricInvoked++ + f.metricDetails = append(f.metricDetails, MetricDetails{ + Provider: provider, + PodName: podName, + PodNamespace: podNamespace, + Spc: spc, + }) } func (f *FakeReporter) ReportNodeUnPublishCtMetric(ctx context.Context) { f.reportNodeUnPublishCtMetricInvoked++ } -func (f *FakeReporter) ReportNodePublishErrorCtMetric(ctx context.Context, provider, errType string) { +func (f *FakeReporter) ReportNodePublishErrorCtMetric(ctx context.Context, provider, podName, podNamespace, spc, errType string) { f.reportNodePublishErrorCtMetricInvoked++ + f.metricDetails = append(f.metricDetails, MetricDetails{ + Provider: provider, + PodName: podName, + PodNamespace: podNamespace, + Spc: spc, + ErrorType: errType, + }) } func (f *FakeReporter) ReportNodeUnPublishErrorCtMetric(ctx context.Context) { f.reportNodeUnPublishErrorCtMetricInvoked++ } -func (f *FakeReporter) ReportSyncK8SecretCtMetric(ctx context.Context, provider string, count int) { - f.reportSyncK8SecretCtMetricInvoked++ -} - -func (f *FakeReporter) ReportSyncK8SecretDuration(ctx context.Context, duration float64) { - f.reportSyncK8SecretDurationInvoked++ -} - func (f *FakeReporter) ReportNodePublishCtMetricInvoked() int { return f.reportNodePublishCtMetricInvoked } @@ -67,9 +81,7 @@ func (f *FakeReporter) ReportNodePublishErrorCtMetricInvoked() int { func (f *FakeReporter) ReportNodeUnPublishErrorCtMetricInvoked() int { return f.reportNodeUnPublishErrorCtMetricInvoked } -func (f *FakeReporter) ReportSyncK8SecretCtMetricInvoked() int { - return f.reportSyncK8SecretCtMetricInvoked -} -func (f *FakeReporter) ReportSyncK8SecretDurationInvoked() int { - return f.reportSyncK8SecretDurationInvoked + +func (f *FakeReporter) GetMetricDetails() []MetricDetails { + return f.metricDetails } diff --git a/pkg/secrets-store/nodeserver.go b/pkg/secrets-store/nodeserver.go index e2af8dcd0..88558d0c9 100644 --- a/pkg/secrets-store/nodeserver.go +++ b/pkg/secrets-store/nodeserver.go @@ -73,7 +73,7 @@ func (ns *nodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis startTime := time.Now() var parameters map[string]string var providerName string - var podName, podNamespace, podUID, serviceAccountName string + var podName, podNamespace, podUID, serviceAccountName, secretProviderClass string var targetPath string var mounted bool errorReason := internalerrors.FailedToMount @@ -89,10 +89,10 @@ func (ns *nodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis klog.ErrorS(unmountErr, "failed to unmounting target path") } } - ns.reporter.ReportNodePublishErrorCtMetric(ctx, providerName, errorReason) + ns.reporter.ReportNodePublishErrorCtMetric(ctx, providerName, podName, podNamespace, secretProviderClass, errorReason) return } - ns.reporter.ReportNodePublishCtMetric(ctx, providerName) + ns.reporter.ReportNodePublishCtMetric(ctx, providerName, podName, podNamespace, secretProviderClass) }() // Check arguments @@ -115,7 +115,7 @@ func (ns *nodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis mountFlags := req.GetVolumeCapability().GetMount().GetMountFlags() secrets := req.GetSecrets() - secretProviderClass := attrib[secretProviderClassField] + secretProviderClass = attrib[secretProviderClassField] providerName = attrib["providerName"] podName = attrib[CSIPodName] podNamespace = attrib[CSIPodNamespace] diff --git a/pkg/secrets-store/nodeserver_test.go b/pkg/secrets-store/nodeserver_test.go index cef12021f..2715085b1 100644 --- a/pkg/secrets-store/nodeserver_test.go +++ b/pkg/secrets-store/nodeserver_test.go @@ -20,10 +20,12 @@ import ( "context" "os" "path/filepath" + "reflect" "testing" "time" secretsstorev1 "sigs.k8s.io/secrets-store-csi-driver/apis/v1" + internalerrors "sigs.k8s.io/secrets-store-csi-driver/pkg/errors" "sigs.k8s.io/secrets-store-csi-driver/pkg/k8s" "sigs.k8s.io/secrets-store-csi-driver/pkg/secrets-store/mocks" providerfake "sigs.k8s.io/secrets-store-csi-driver/provider/fake" @@ -65,18 +67,21 @@ func TestNodePublishVolume_Errors(t *testing.T) { nodePublishVolReq *csi.NodePublishVolumeRequest initObjects []client.Object want codes.Code + metricDetails []mocks.MetricDetails }{ { name: "volume capabilities nil", nodePublishVolReq: &csi.NodePublishVolumeRequest{}, want: codes.InvalidArgument, + metricDetails: []mocks.MetricDetails{{ErrorType: internalerrors.FailedToMount}, {ErrorType: internalerrors.FailedToMount}}, }, { name: "volume id is empty", nodePublishVolReq: &csi.NodePublishVolumeRequest{ VolumeCapability: &csi.VolumeCapability{}, }, - want: codes.InvalidArgument, + want: codes.InvalidArgument, + metricDetails: []mocks.MetricDetails{{ErrorType: internalerrors.FailedToMount}, {ErrorType: internalerrors.FailedToMount}}, }, { name: "target path is empty", @@ -84,7 +89,8 @@ func TestNodePublishVolume_Errors(t *testing.T) { VolumeCapability: &csi.VolumeCapability{}, VolumeId: "testvolid1", }, - want: codes.InvalidArgument, + want: codes.InvalidArgument, + metricDetails: []mocks.MetricDetails{{ErrorType: internalerrors.FailedToMount}, {ErrorType: internalerrors.FailedToMount}}, }, { name: "volume context is not set", @@ -93,7 +99,8 @@ func TestNodePublishVolume_Errors(t *testing.T) { VolumeId: "testvolid1", TargetPath: targetPath(t), }, - want: codes.InvalidArgument, + want: codes.InvalidArgument, + metricDetails: []mocks.MetricDetails{{ErrorType: internalerrors.FailedToMount}, {ErrorType: internalerrors.FailedToMount}}, }, { name: "secret provider class not found", @@ -103,7 +110,8 @@ func TestNodePublishVolume_Errors(t *testing.T) { TargetPath: targetPath(t), VolumeContext: map[string]string{"secretProviderClass": "provider1"}, }, - want: codes.Unknown, + want: codes.Unknown, + metricDetails: []mocks.MetricDetails{{Spc: "provider1", ErrorType: internalerrors.SecretProviderClassNotFound}, {Spc: "provider1", ErrorType: internalerrors.SecretProviderClassNotFound}}, }, { name: "spc missing", @@ -126,7 +134,8 @@ func TestNodePublishVolume_Errors(t *testing.T) { }, }, }, - want: codes.Unknown, + want: codes.Unknown, + metricDetails: []mocks.MetricDetails{{Provider: "provider1", PodName: "pod1", PodNamespace: "default", Spc: "provider1", ErrorType: internalerrors.SecretProviderClassNotFound}, {Provider: "provider1", PodName: "pod1", PodNamespace: "default", Spc: "provider1", ErrorType: internalerrors.SecretProviderClassNotFound}}, }, { name: "provider not set in secret provider class", @@ -144,7 +153,8 @@ func TestNodePublishVolume_Errors(t *testing.T) { }, }, }, - want: codes.Unknown, + want: codes.Unknown, + metricDetails: []mocks.MetricDetails{{PodName: "pod1", PodNamespace: "default", Spc: "provider1", ErrorType: internalerrors.FailedToMount}, {PodName: "pod1", PodNamespace: "default", Spc: "provider1", ErrorType: internalerrors.FailedToMount}}, }, { name: "parameters not set in secret provider class", @@ -165,7 +175,8 @@ func TestNodePublishVolume_Errors(t *testing.T) { }, }, }, - want: codes.Unknown, + want: codes.Unknown, + metricDetails: []mocks.MetricDetails{{Provider: "provider1", PodName: "pod1", PodNamespace: "default", Spc: "provider1", ErrorType: internalerrors.FailedToMount}, {Provider: "provider1", PodName: "pod1", PodNamespace: "default", Spc: "provider1", ErrorType: internalerrors.FailedToMount}}, }, { name: "read only is not set to true", @@ -187,7 +198,8 @@ func TestNodePublishVolume_Errors(t *testing.T) { }, }, }, - want: codes.InvalidArgument, + want: codes.InvalidArgument, + metricDetails: []mocks.MetricDetails{{Provider: "provider1", PodName: "pod1", PodNamespace: "default", Spc: "provider1", ErrorType: internalerrors.FailedToMount}, {Provider: "provider1", PodName: "pod1", PodNamespace: "default", Spc: "provider1", ErrorType: internalerrors.FailedToMount}}, }, { name: "provider not installed", @@ -215,7 +227,8 @@ func TestNodePublishVolume_Errors(t *testing.T) { }, }, }, - want: codes.Unknown, + want: codes.Unknown, + metricDetails: []mocks.MetricDetails{{Provider: "provider_not_installed", PodName: "pod1", PodNamespace: "default", Spc: "provider1"}, {Provider: "provider_not_installed", PodName: "pod1", PodNamespace: "default", Spc: "provider1"}}, }, } @@ -236,7 +249,7 @@ func TestNodePublishVolume_Errors(t *testing.T) { } for attempts := 2; attempts > 0; attempts-- { - // How many times 'total_node_publish' and 'total_node_publish_error' counters have been incremented so far. + // How many times 'node_publish_total' and 'node_publish_total_error' counters have been incremented so far. c, cErr := r.ReportNodePublishCtMetricInvoked(), r.ReportNodePublishErrorCtMetricInvoked() _, err = ns.NodePublishVolume(context.TODO(), test.nodePublishVolReq) @@ -246,10 +259,14 @@ func TestNodePublishVolume_Errors(t *testing.T) { // Check that the correct counter has been incremented. if err != nil && r.ReportNodePublishErrorCtMetricInvoked() <= cErr { - t.Fatalf("expected 'total_node_publish_error' counter to be incremented, but it was not") + t.Fatalf("expected 'node_publish_total_error' counter to be incremented, but it was not") } if err == nil && r.ReportNodePublishCtMetricInvoked() <= c { - t.Fatalf("expected 'total_node_publish' counter to be incremented, but it was not") + t.Fatalf("expected 'node_publish_total' counter to be incremented, but it was not") + } + metricDetails := r.GetMetricDetails() + if !reflect.DeepEqual(metricDetails[2-attempts], test.metricDetails[2-attempts]) { + t.Fatalf("expected publish details to be %+v, but it was %+v", test.metricDetails[2-attempts], metricDetails[2-attempts]) } // if theres not an error we should check the mount w @@ -270,6 +287,7 @@ func TestNodePublishVolume(t *testing.T) { name string nodePublishVolReq *csi.NodePublishVolumeRequest initObjects []client.Object + metricDetails []mocks.MetricDetails }{ { name: "volume mount", @@ -297,6 +315,7 @@ func TestNodePublishVolume(t *testing.T) { }, }, }, + metricDetails: []mocks.MetricDetails{{Provider: "provider1", PodName: "pod1", PodNamespace: "default", Spc: "provider1"}, {PodName: "pod1", PodNamespace: "default", Spc: "provider1"}}, }, { name: "volume mount with refresh token", @@ -327,6 +346,7 @@ func TestNodePublishVolume(t *testing.T) { }, }, }, + metricDetails: []mocks.MetricDetails{{Provider: "provider1", PodName: "pod1", PodNamespace: "default", Spc: "provider1"}, {Provider: "provider1", PodName: "pod1", PodNamespace: "default", Spc: "provider1"}}, }, } @@ -347,20 +367,29 @@ func TestNodePublishVolume(t *testing.T) { } for attempts := 2; attempts > 0; attempts-- { - // How many times 'total_node_publish' and 'total_node_publish_error' counters have been incremented so far. + // How many times 'node_publish_total' and 'node_publish_total_error' counters have been incremented so far. c, cErr := r.ReportNodePublishCtMetricInvoked(), r.ReportNodePublishErrorCtMetricInvoked() - _, err = ns.NodePublishVolume(context.TODO(), test.nodePublishVolReq) if code := status.Code(err); code != codes.OK { t.Errorf("expected RPC status code: %v, got: %v\n", codes.OK, err) } + metricDetails := r.GetMetricDetails() // Check that the correct counter has been incremented. if err != nil && r.ReportNodePublishErrorCtMetricInvoked() <= cErr { - t.Fatalf("expected 'total_node_publish_error' counter to be incremented, but it was not") + t.Fatalf("expected 'node_publish_total_error' counter to be incremented, but it was not") } if err == nil && r.ReportNodePublishCtMetricInvoked() <= c { - t.Fatalf("expected 'total_node_publish' counter to be incremented, but it was not") + t.Fatalf("expected 'node_publish_total' counter to be incremented, but it was not") + } + if !reflect.DeepEqual(metricDetails[2-attempts], test.metricDetails[2-attempts]) { + t.Fatalf("expected publish details to be %+v, but it was %+v", test.metricDetails[2-attempts], metricDetails[2-attempts]) + } + if !reflect.DeepEqual(metricDetails[2-attempts], test.metricDetails[2-attempts]) { + t.Fatalf("expected publish details to be %+v, but it was %+v", test.metricDetails[2-attempts], metricDetails[2-attempts]) + } + if !reflect.DeepEqual(metricDetails[2-attempts], test.metricDetails[2-attempts]) { + t.Fatalf("expected publish details to be %+v, but it was %+v", test.metricDetails[2-attempts], metricDetails[2-attempts]) } // if theres not an error we should check the mount w @@ -407,7 +436,7 @@ func TestNodeUnpublishVolume(t *testing.T) { // Repeat the request multiple times to ensure it consistently returns OK, // even if it has already been unmounted. for attempts := 2; attempts > 0; attempts-- { - // How many times 'total_node_unpublish' and 'total_node_unpublish_error' counters have been incremented so far + // How many times 'node_unpublish_total' and 'node_unpublish_total_error' counters have been incremented so far c, cErr := r.ReportNodeUnPublishCtMetricInvoked(), r.ReportNodeUnPublishErrorCtMetricInvoked() _, err := ns.NodeUnpublishVolume(context.TODO(), req) @@ -417,10 +446,10 @@ func TestNodeUnpublishVolume(t *testing.T) { // Check that the correct counter has been incremented if err != nil && r.ReportNodeUnPublishErrorCtMetricInvoked() <= cErr { - t.Fatalf("expected 'total_node_unpublish_error' counter to be incremented, but it was not") + t.Fatalf("expected 'node_unpublish_total_error' counter to be incremented, but it was not") } if err == nil && r.ReportNodeUnPublishCtMetricInvoked() <= c { - t.Fatalf("expected 'total_node_unpublish' counter to be incremented, but it was not") + t.Fatalf("expected 'node_unpublish_total' counter to be incremented, but it was not") } // Ensure that the mounts were unmounted. @@ -469,7 +498,7 @@ func TestNodeUnpublishVolume_Error(t *testing.T) { } for attempts := 2; attempts > 0; attempts-- { - // How many times 'total_node_unpublish' and 'total_node_unpublish_error' counters have been incremented so far + // How many times 'node_unpublish_total' and 'node_unpublish_total_error' counters have been incremented so far c, cErr := r.ReportNodeUnPublishCtMetricInvoked(), r.ReportNodeUnPublishErrorCtMetricInvoked() _, err := ns.NodeUnpublishVolume(context.TODO(), test.nodeUnpublishVolReq) @@ -479,10 +508,10 @@ func TestNodeUnpublishVolume_Error(t *testing.T) { // Check that the correct counter has been incremented if err != nil && r.ReportNodeUnPublishErrorCtMetricInvoked() <= cErr { - t.Fatalf("expected 'total_node_unpublish_error' counter to be incremented, but it was not") + t.Fatalf("expected 'node_unpublish_total_error' counter to be incremented, but it was not") } if err == nil && r.ReportNodeUnPublishCtMetricInvoked() <= c { - t.Fatalf("expected 'total_node_unpublish' counter to be incremented, but it was not") + t.Fatalf("expected 'node_unpublish_total' counter to be incremented, but it was not") } } }) diff --git a/pkg/secrets-store/stats_reporter.go b/pkg/secrets-store/stats_reporter.go index 07447eb88..a8559c913 100644 --- a/pkg/secrets-store/stats_reporter.go +++ b/pkg/secrets-store/stats_reporter.go @@ -30,10 +30,13 @@ const ( ) var ( - providerKey = "provider" - errorKey = "error_type" - osTypeKey = "os_type" - runtimeOS = runtime.GOOS + providerKey = "provider" + errorKey = "error_type" + osTypeKey = "os_type" + runtimeOS = runtime.GOOS + podNameKey = "pod_name" + podNamespaceKey = "pod_namespace" + spcKey = "secret_provider_class" ) type reporter struct { @@ -41,17 +44,13 @@ type reporter struct { nodeUnPublishTotal metric.Int64Counter nodePublishErrorTotal metric.Int64Counter nodeUnPublishErrorTotal metric.Int64Counter - syncK8sSecretTotal metric.Int64Counter - syncK8sSecretDuration metric.Float64Histogram } type StatsReporter interface { - ReportNodePublishCtMetric(ctx context.Context, provider string) + ReportNodePublishCtMetric(ctx context.Context, provider, podName, podNamespace, spc string) ReportNodeUnPublishCtMetric(ctx context.Context) - ReportNodePublishErrorCtMetric(ctx context.Context, provider, errType string) + ReportNodePublishErrorCtMetric(ctx context.Context, provider, podName, podNamespace, spc, errType string) ReportNodeUnPublishErrorCtMetric(ctx context.Context) - ReportSyncK8SecretCtMetric(ctx context.Context, provider string, count int) - ReportSyncK8SecretDuration(ctx context.Context, duration float64) } func NewStatsReporter() (StatsReporter, error) { @@ -72,19 +71,16 @@ func NewStatsReporter() (StatsReporter, error) { if r.nodeUnPublishErrorTotal, err = meter.Int64Counter("node_unpublish_error", metric.WithDescription("Total number of node unpublish calls with error")); err != nil { return nil, err } - if r.syncK8sSecretTotal, err = meter.Int64Counter("sync_k8s_secret", metric.WithDescription("Total number of k8s secrets synced")); err != nil { - return nil, err - } - if r.syncK8sSecretDuration, err = meter.Float64Histogram("k8s_secret_duration_sec", metric.WithDescription("Distribution of how long it took to sync k8s secret")); err != nil { - return nil, err - } return r, nil } -func (r *reporter) ReportNodePublishCtMetric(ctx context.Context, provider string) { +func (r *reporter) ReportNodePublishCtMetric(ctx context.Context, provider, podName, podNamespace, spc string) { opt := metric.WithAttributes( attribute.Key(providerKey).String(provider), attribute.Key(osTypeKey).String(runtimeOS), + attribute.Key(podNameKey).String(podName), + attribute.Key(podNamespaceKey).String(podNamespace), + attribute.Key(spcKey).String(spc), ) r.nodePublishTotal.Add(ctx, 1, opt) } @@ -96,11 +92,14 @@ func (r *reporter) ReportNodeUnPublishCtMetric(ctx context.Context) { r.nodeUnPublishTotal.Add(ctx, 1, opt) } -func (r *reporter) ReportNodePublishErrorCtMetric(ctx context.Context, provider, errType string) { +func (r *reporter) ReportNodePublishErrorCtMetric(ctx context.Context, provider, podName, podNamespace, spc, errType string) { opt := metric.WithAttributes( attribute.Key(providerKey).String(provider), attribute.Key(errorKey).String(errType), attribute.Key(osTypeKey).String(runtimeOS), + attribute.Key(podNameKey).String(podName), + attribute.Key(podNamespaceKey).String(podNamespace), + attribute.Key(spcKey).String(spc), ) r.nodePublishErrorTotal.Add(ctx, 1, opt) } @@ -111,18 +110,3 @@ func (r *reporter) ReportNodeUnPublishErrorCtMetric(ctx context.Context) { ) r.nodeUnPublishErrorTotal.Add(ctx, 1, opt) } - -func (r *reporter) ReportSyncK8SecretCtMetric(ctx context.Context, provider string, count int) { - opt := metric.WithAttributes( - attribute.Key(providerKey).String(provider), - attribute.Key(osTypeKey).String(runtimeOS), - ) - r.syncK8sSecretTotal.Add(ctx, int64(count), opt) -} - -func (r *reporter) ReportSyncK8SecretDuration(ctx context.Context, duration float64) { - opt := metric.WithAttributes( - attribute.Key(osTypeKey).String(runtimeOS), - ) - r.syncK8sSecretDuration.Record(ctx, duration, opt) -} diff --git a/test/bats/e2e-provider.bats b/test/bats/e2e-provider.bats index f3e45c650..1f95a13ea 100644 --- a/test/bats/e2e-provider.bats +++ b/test/bats/e2e-provider.bats @@ -428,6 +428,7 @@ export VALIDATE_TOKENS_AUDIENCE=$(get_token_requests_audience) assert_match "node_publish_total" "${output}" assert_match "node_unpublish_total" "${output}" assert_match "rotation_reconcile_total" "${output}" + assert_match "sync_k8s_secret_total" "${output}" done }