From ea98aa7a93d4d32a633a9151e7507cca3a07a6c9 Mon Sep 17 00:00:00 2001 From: xiaozhuang <61875985+xiaozhuang-a@users.noreply.github.com> Date: Sat, 21 Dec 2024 13:31:22 +0800 Subject: [PATCH] feat(cluster): support redis-cluster topologySpreadConstraints(# 868) (#1177) * feat: v1beta2 redis-cluster support TopologySpreadConstraints Signed-off-by: xiaozhuang * chore: topologySpreadConstraints example Signed-off-by: xiaozhuang * chore: lint Signed-off-by: xiaozhuang * chore: lint Signed-off-by: xiaozhuang --------- Signed-off-by: xiaozhuang Co-authored-by: xiaozhuang --- api/common_types.go | 34 +- api/v1beta1/rediscluster_conversion.go | 12 + api/zz_generated.deepcopy.go | 14 + ...is.redis.opstreelabs.in_redisclusters.yaml | 720 ++++++++++++++++++ .../redis-cluster.yaml | 71 ++ pkg/k8sutils/redis-cluster.go | 10 +- pkg/k8sutils/redis-cluster_test.go | 2 + pkg/k8sutils/statefulset.go | 2 + 8 files changed, 846 insertions(+), 19 deletions(-) create mode 100644 example/v1beta2/topology_spread_constraints/redis-cluster.yaml diff --git a/api/common_types.go b/api/common_types.go index 378bda1bc..883c39ddb 100644 --- a/api/common_types.go +++ b/api/common_types.go @@ -105,27 +105,29 @@ type Sidecar struct { // RedisLeader interface will have the redis leader configuration // +k8s:deepcopy-gen=true type RedisLeader struct { - Replicas *int32 `json:"replicas,omitempty"` - RedisConfig *RedisConfig `json:"redisConfig,omitempty"` - Affinity *corev1.Affinity `json:"affinity,omitempty"` - PodDisruptionBudget *RedisPodDisruptionBudget `json:"pdb,omitempty"` - ReadinessProbe *corev1.Probe `json:"readinessProbe,omitempty" protobuf:"bytes,11,opt,name=readinessProbe"` - LivenessProbe *corev1.Probe `json:"livenessProbe,omitempty" protobuf:"bytes,12,opt,name=livenessProbe"` - Tolerations *[]corev1.Toleration `json:"tolerations,omitempty"` - NodeSelector map[string]string `json:"nodeSelector,omitempty"` + Replicas *int32 `json:"replicas,omitempty"` + RedisConfig *RedisConfig `json:"redisConfig,omitempty"` + Affinity *corev1.Affinity `json:"affinity,omitempty"` + PodDisruptionBudget *RedisPodDisruptionBudget `json:"pdb,omitempty"` + ReadinessProbe *corev1.Probe `json:"readinessProbe,omitempty" protobuf:"bytes,11,opt,name=readinessProbe"` + LivenessProbe *corev1.Probe `json:"livenessProbe,omitempty" protobuf:"bytes,12,opt,name=livenessProbe"` + Tolerations *[]corev1.Toleration `json:"tolerations,omitempty"` + NodeSelector map[string]string `json:"nodeSelector,omitempty"` + TopologySpreadConstraints []corev1.TopologySpreadConstraint `json:"topologySpreadConstraints,omitempty"` } // RedisFollower interface will have the redis follower configuration // +k8s:deepcopy-gen=true type RedisFollower struct { - Replicas *int32 `json:"replicas,omitempty"` - RedisConfig *RedisConfig `json:"redisConfig,omitempty"` - Affinity *corev1.Affinity `json:"affinity,omitempty"` - PodDisruptionBudget *RedisPodDisruptionBudget `json:"pdb,omitempty"` - ReadinessProbe *corev1.Probe `json:"readinessProbe,omitempty" protobuf:"bytes,11,opt,name=readinessProbe"` - LivenessProbe *corev1.Probe `json:"livenessProbe,omitempty" protobuf:"bytes,12,opt,name=livenessProbe"` - Tolerations *[]corev1.Toleration `json:"tolerations,omitempty"` - NodeSelector map[string]string `json:"nodeSelector,omitempty"` + Replicas *int32 `json:"replicas,omitempty"` + RedisConfig *RedisConfig `json:"redisConfig,omitempty"` + Affinity *corev1.Affinity `json:"affinity,omitempty"` + PodDisruptionBudget *RedisPodDisruptionBudget `json:"pdb,omitempty"` + ReadinessProbe *corev1.Probe `json:"readinessProbe,omitempty" protobuf:"bytes,11,opt,name=readinessProbe"` + LivenessProbe *corev1.Probe `json:"livenessProbe,omitempty" protobuf:"bytes,12,opt,name=livenessProbe"` + Tolerations *[]corev1.Toleration `json:"tolerations,omitempty"` + NodeSelector map[string]string `json:"nodeSelector,omitempty"` + TopologySpreadConstraints []corev1.TopologySpreadConstraint `json:"topologySpreadConstraints,omitempty"` } // RedisPodDisruptionBudget configure a PodDisruptionBudget on the resource (leader/follower) diff --git a/api/v1beta1/rediscluster_conversion.go b/api/v1beta1/rediscluster_conversion.go index 0ee4f3018..02064cda8 100644 --- a/api/v1beta1/rediscluster_conversion.go +++ b/api/v1beta1/rediscluster_conversion.go @@ -48,6 +48,9 @@ func (src *RedisCluster) ConvertTo(dstRaw conversion.Hub) error { if src.Spec.RedisLeader.NodeSelector != nil { dst.Spec.RedisLeader.NodeSelector = src.Spec.RedisLeader.NodeSelector } + if src.Spec.RedisLeader.TopologySpreadConstraints != nil { + dst.Spec.RedisLeader.TopologySpreadConstraints = src.Spec.RedisLeader.TopologySpreadConstraints + } // RedisFollower dst.Spec.RedisFollower = redisv1beta2.RedisFollower{} @@ -75,6 +78,9 @@ func (src *RedisCluster) ConvertTo(dstRaw conversion.Hub) error { if src.Spec.RedisFollower.NodeSelector != nil { dst.Spec.RedisFollower.NodeSelector = src.Spec.RedisFollower.NodeSelector } + if src.Spec.RedisFollower.TopologySpreadConstraints != nil { + dst.Spec.RedisFollower.TopologySpreadConstraints = src.Spec.RedisFollower.TopologySpreadConstraints + } // RedisExporter if src.Spec.RedisExporter != nil { dst.Spec.RedisExporter = &redisv1beta2.RedisExporter{} @@ -168,6 +174,9 @@ func (dst *RedisCluster) ConvertFrom(srcRaw conversion.Hub) error { if src.Spec.RedisLeader.NodeSelector != nil { dst.Spec.RedisLeader.NodeSelector = src.Spec.RedisLeader.NodeSelector } + if src.Spec.RedisLeader.TopologySpreadConstraints != nil { + dst.Spec.RedisLeader.TopologySpreadConstraints = src.Spec.RedisLeader.TopologySpreadConstraints + } // RedisFollower dst.Spec.RedisFollower = RedisFollower{} @@ -195,6 +204,9 @@ func (dst *RedisCluster) ConvertFrom(srcRaw conversion.Hub) error { if src.Spec.RedisFollower.NodeSelector != nil { dst.Spec.RedisFollower.NodeSelector = src.Spec.RedisFollower.NodeSelector } + if src.Spec.RedisFollower.TopologySpreadConstraints != nil { + dst.Spec.RedisFollower.TopologySpreadConstraints = src.Spec.RedisFollower.TopologySpreadConstraints + } // RedisExporter if src.Spec.RedisExporter != nil { dst.Spec.RedisExporter = &RedisExporter{} diff --git a/api/zz_generated.deepcopy.go b/api/zz_generated.deepcopy.go index 004200305..176a705b1 100644 --- a/api/zz_generated.deepcopy.go +++ b/api/zz_generated.deepcopy.go @@ -240,6 +240,13 @@ func (in *RedisFollower) DeepCopyInto(out *RedisFollower) { (*out)[key] = val } } + if in.TopologySpreadConstraints != nil { + in, out := &in.TopologySpreadConstraints, &out.TopologySpreadConstraints + *out = make([]v1.TopologySpreadConstraint, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RedisFollower. @@ -303,6 +310,13 @@ func (in *RedisLeader) DeepCopyInto(out *RedisLeader) { (*out)[key] = val } } + if in.TopologySpreadConstraints != nil { + in, out := &in.TopologySpreadConstraints, &out.TopologySpreadConstraints + *out = make([]v1.TopologySpreadConstraint, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RedisLeader. diff --git a/config/crd/bases/redis.redis.opstreelabs.in_redisclusters.yaml b/config/crd/bases/redis.redis.opstreelabs.in_redisclusters.yaml index 9da67022d..4d018e89b 100644 --- a/config/crd/bases/redis.redis.opstreelabs.in_redisclusters.yaml +++ b/config/crd/bases/redis.redis.opstreelabs.in_redisclusters.yaml @@ -1915,6 +1915,186 @@ spec: type: string type: object type: array + topologySpreadConstraints: + items: + description: TopologySpreadConstraint specifies how to spread + matching pods among the given topology. + properties: + labelSelector: + description: |- + LabelSelector is used to find matching pods. + Pods that match this label selector are counted to determine the number of pods + in their corresponding topology domain. + properties: + matchExpressions: + description: matchExpressions is a list of label selector + requirements. The requirements are ANDed. + items: + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector + applies to. + type: string + operator: + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic + merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + matchLabelKeys: + description: |- + MatchLabelKeys is a set of pod label keys to select the pods over which + spreading will be calculated. The keys are used to lookup values from the + incoming pod labels, those key-value labels are ANDed with labelSelector + to select the group of existing pods over which spreading will be calculated + for the incoming pod. The same key is forbidden to exist in both MatchLabelKeys and LabelSelector. + MatchLabelKeys cannot be set when LabelSelector isn't set. + Keys that don't exist in the incoming pod labels will + be ignored. A null or empty list means only match against labelSelector. + + + This is a beta field and requires the MatchLabelKeysInPodTopologySpread feature gate to be enabled (enabled by default). + items: + type: string + type: array + x-kubernetes-list-type: atomic + maxSkew: + description: |- + MaxSkew describes the degree to which pods may be unevenly distributed. + When `whenUnsatisfiable=DoNotSchedule`, it is the maximum permitted difference + between the number of matching pods in the target topology and the global minimum. + The global minimum is the minimum number of matching pods in an eligible domain + or zero if the number of eligible domains is less than MinDomains. + For example, in a 3-zone cluster, MaxSkew is set to 1, and pods with the same + labelSelector spread as 2/2/1: + In this case, the global minimum is 1. + | zone1 | zone2 | zone3 | + | P P | P P | P | + - if MaxSkew is 1, incoming pod can only be scheduled to zone3 to become 2/2/2; + scheduling it onto zone1(zone2) would make the ActualSkew(3-1) on zone1(zone2) + violate MaxSkew(1). + - if MaxSkew is 2, incoming pod can be scheduled onto any zone. + When `whenUnsatisfiable=ScheduleAnyway`, it is used to give higher precedence + to topologies that satisfy it. + It's a required field. Default value is 1 and 0 is not allowed. + format: int32 + type: integer + minDomains: + description: |- + MinDomains indicates a minimum number of eligible domains. + When the number of eligible domains with matching topology keys is less than minDomains, + Pod Topology Spread treats "global minimum" as 0, and then the calculation of Skew is performed. + And when the number of eligible domains with matching topology keys equals or greater than minDomains, + this value has no effect on scheduling. + As a result, when the number of eligible domains is less than minDomains, + scheduler won't schedule more than maxSkew Pods to those domains. + If value is nil, the constraint behaves as if MinDomains is equal to 1. + Valid values are integers greater than 0. + When value is not nil, WhenUnsatisfiable must be DoNotSchedule. + + + For example, in a 3-zone cluster, MaxSkew is set to 2, MinDomains is set to 5 and pods with the same + labelSelector spread as 2/2/2: + | zone1 | zone2 | zone3 | + | P P | P P | P P | + The number of domains is less than 5(MinDomains), so "global minimum" is treated as 0. + In this situation, new pod with the same labelSelector cannot be scheduled, + because computed skew will be 3(3 - 0) if new Pod is scheduled to any of the three zones, + it will violate MaxSkew. + + + This is a beta field and requires the MinDomainsInPodTopologySpread feature gate to be enabled (enabled by default). + format: int32 + type: integer + nodeAffinityPolicy: + description: |- + NodeAffinityPolicy indicates how we will treat Pod's nodeAffinity/nodeSelector + when calculating pod topology spread skew. Options are: + - Honor: only nodes matching nodeAffinity/nodeSelector are included in the calculations. + - Ignore: nodeAffinity/nodeSelector are ignored. All nodes are included in the calculations. + + + If this value is nil, the behavior is equivalent to the Honor policy. + This is a beta-level feature default enabled by the NodeInclusionPolicyInPodTopologySpread feature flag. + type: string + nodeTaintsPolicy: + description: |- + NodeTaintsPolicy indicates how we will treat node taints when calculating + pod topology spread skew. Options are: + - Honor: nodes without taints, along with tainted nodes for which the incoming pod + has a toleration, are included. + - Ignore: node taints are ignored. All nodes are included. + + + If this value is nil, the behavior is equivalent to the Ignore policy. + This is a beta-level feature default enabled by the NodeInclusionPolicyInPodTopologySpread feature flag. + type: string + topologyKey: + description: |- + TopologyKey is the key of node labels. Nodes that have a label with this key + and identical values are considered to be in the same topology. + We consider each as a "bucket", and try to put balanced number + of pods into each bucket. + We define a domain as a particular instance of a topology. + Also, we define an eligible domain as a domain whose nodes meet the requirements of + nodeAffinityPolicy and nodeTaintsPolicy. + e.g. If TopologyKey is "kubernetes.io/hostname", each Node is a domain of that topology. + And, if TopologyKey is "topology.kubernetes.io/zone", each zone is a domain of that topology. + It's a required field. + type: string + whenUnsatisfiable: + description: |- + WhenUnsatisfiable indicates how to deal with a pod if it doesn't satisfy + the spread constraint. + - DoNotSchedule (default) tells the scheduler not to schedule it. + - ScheduleAnyway tells the scheduler to schedule the pod in any location, + but giving higher precedence to topologies that would help reduce the + skew. + A constraint is considered "Unsatisfiable" for an incoming pod + if and only if every possible node assignment for that pod would violate + "MaxSkew" on some topology. + For example, in a 3-zone cluster, MaxSkew is set to 1, and pods with the same + labelSelector spread as 3/1/1: + | zone1 | zone2 | zone3 | + | P P P | P | P | + If WhenUnsatisfiable is set to DoNotSchedule, incoming pod can only be scheduled + to zone2(zone3) to become 3/2/1(3/1/2) as ActualSkew(2-1) on zone2(zone3) satisfies + MaxSkew(1). In other words, the cluster can still be imbalanced, but scheduler + won't make it *more* imbalanced. + It's a required field. + type: string + required: + - maxSkew + - topologyKey + - whenUnsatisfiable + type: object + type: array type: object redisLeader: description: RedisLeader interface will have the redis leader configuration @@ -3181,6 +3361,186 @@ spec: type: string type: object type: array + topologySpreadConstraints: + items: + description: TopologySpreadConstraint specifies how to spread + matching pods among the given topology. + properties: + labelSelector: + description: |- + LabelSelector is used to find matching pods. + Pods that match this label selector are counted to determine the number of pods + in their corresponding topology domain. + properties: + matchExpressions: + description: matchExpressions is a list of label selector + requirements. The requirements are ANDed. + items: + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector + applies to. + type: string + operator: + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic + merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + matchLabelKeys: + description: |- + MatchLabelKeys is a set of pod label keys to select the pods over which + spreading will be calculated. The keys are used to lookup values from the + incoming pod labels, those key-value labels are ANDed with labelSelector + to select the group of existing pods over which spreading will be calculated + for the incoming pod. The same key is forbidden to exist in both MatchLabelKeys and LabelSelector. + MatchLabelKeys cannot be set when LabelSelector isn't set. + Keys that don't exist in the incoming pod labels will + be ignored. A null or empty list means only match against labelSelector. + + + This is a beta field and requires the MatchLabelKeysInPodTopologySpread feature gate to be enabled (enabled by default). + items: + type: string + type: array + x-kubernetes-list-type: atomic + maxSkew: + description: |- + MaxSkew describes the degree to which pods may be unevenly distributed. + When `whenUnsatisfiable=DoNotSchedule`, it is the maximum permitted difference + between the number of matching pods in the target topology and the global minimum. + The global minimum is the minimum number of matching pods in an eligible domain + or zero if the number of eligible domains is less than MinDomains. + For example, in a 3-zone cluster, MaxSkew is set to 1, and pods with the same + labelSelector spread as 2/2/1: + In this case, the global minimum is 1. + | zone1 | zone2 | zone3 | + | P P | P P | P | + - if MaxSkew is 1, incoming pod can only be scheduled to zone3 to become 2/2/2; + scheduling it onto zone1(zone2) would make the ActualSkew(3-1) on zone1(zone2) + violate MaxSkew(1). + - if MaxSkew is 2, incoming pod can be scheduled onto any zone. + When `whenUnsatisfiable=ScheduleAnyway`, it is used to give higher precedence + to topologies that satisfy it. + It's a required field. Default value is 1 and 0 is not allowed. + format: int32 + type: integer + minDomains: + description: |- + MinDomains indicates a minimum number of eligible domains. + When the number of eligible domains with matching topology keys is less than minDomains, + Pod Topology Spread treats "global minimum" as 0, and then the calculation of Skew is performed. + And when the number of eligible domains with matching topology keys equals or greater than minDomains, + this value has no effect on scheduling. + As a result, when the number of eligible domains is less than minDomains, + scheduler won't schedule more than maxSkew Pods to those domains. + If value is nil, the constraint behaves as if MinDomains is equal to 1. + Valid values are integers greater than 0. + When value is not nil, WhenUnsatisfiable must be DoNotSchedule. + + + For example, in a 3-zone cluster, MaxSkew is set to 2, MinDomains is set to 5 and pods with the same + labelSelector spread as 2/2/2: + | zone1 | zone2 | zone3 | + | P P | P P | P P | + The number of domains is less than 5(MinDomains), so "global minimum" is treated as 0. + In this situation, new pod with the same labelSelector cannot be scheduled, + because computed skew will be 3(3 - 0) if new Pod is scheduled to any of the three zones, + it will violate MaxSkew. + + + This is a beta field and requires the MinDomainsInPodTopologySpread feature gate to be enabled (enabled by default). + format: int32 + type: integer + nodeAffinityPolicy: + description: |- + NodeAffinityPolicy indicates how we will treat Pod's nodeAffinity/nodeSelector + when calculating pod topology spread skew. Options are: + - Honor: only nodes matching nodeAffinity/nodeSelector are included in the calculations. + - Ignore: nodeAffinity/nodeSelector are ignored. All nodes are included in the calculations. + + + If this value is nil, the behavior is equivalent to the Honor policy. + This is a beta-level feature default enabled by the NodeInclusionPolicyInPodTopologySpread feature flag. + type: string + nodeTaintsPolicy: + description: |- + NodeTaintsPolicy indicates how we will treat node taints when calculating + pod topology spread skew. Options are: + - Honor: nodes without taints, along with tainted nodes for which the incoming pod + has a toleration, are included. + - Ignore: node taints are ignored. All nodes are included. + + + If this value is nil, the behavior is equivalent to the Ignore policy. + This is a beta-level feature default enabled by the NodeInclusionPolicyInPodTopologySpread feature flag. + type: string + topologyKey: + description: |- + TopologyKey is the key of node labels. Nodes that have a label with this key + and identical values are considered to be in the same topology. + We consider each as a "bucket", and try to put balanced number + of pods into each bucket. + We define a domain as a particular instance of a topology. + Also, we define an eligible domain as a domain whose nodes meet the requirements of + nodeAffinityPolicy and nodeTaintsPolicy. + e.g. If TopologyKey is "kubernetes.io/hostname", each Node is a domain of that topology. + And, if TopologyKey is "topology.kubernetes.io/zone", each zone is a domain of that topology. + It's a required field. + type: string + whenUnsatisfiable: + description: |- + WhenUnsatisfiable indicates how to deal with a pod if it doesn't satisfy + the spread constraint. + - DoNotSchedule (default) tells the scheduler not to schedule it. + - ScheduleAnyway tells the scheduler to schedule the pod in any location, + but giving higher precedence to topologies that would help reduce the + skew. + A constraint is considered "Unsatisfiable" for an incoming pod + if and only if every possible node assignment for that pod would violate + "MaxSkew" on some topology. + For example, in a 3-zone cluster, MaxSkew is set to 1, and pods with the same + labelSelector spread as 3/1/1: + | zone1 | zone2 | zone3 | + | P P P | P | P | + If WhenUnsatisfiable is set to DoNotSchedule, incoming pod can only be scheduled + to zone2(zone3) to become 3/2/1(3/1/2) as ActualSkew(2-1) on zone2(zone3) satisfies + MaxSkew(1). In other words, the cluster can still be imbalanced, but scheduler + won't make it *more* imbalanced. + It's a required field. + type: string + required: + - maxSkew + - topologyKey + - whenUnsatisfiable + type: object + type: array type: object resources: description: ResourceRequirements describes the compute resource requirements. @@ -8578,6 +8938,186 @@ spec: type: string type: object type: array + topologySpreadConstraints: + items: + description: TopologySpreadConstraint specifies how to spread + matching pods among the given topology. + properties: + labelSelector: + description: |- + LabelSelector is used to find matching pods. + Pods that match this label selector are counted to determine the number of pods + in their corresponding topology domain. + properties: + matchExpressions: + description: matchExpressions is a list of label selector + requirements. The requirements are ANDed. + items: + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector + applies to. + type: string + operator: + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic + merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + matchLabelKeys: + description: |- + MatchLabelKeys is a set of pod label keys to select the pods over which + spreading will be calculated. The keys are used to lookup values from the + incoming pod labels, those key-value labels are ANDed with labelSelector + to select the group of existing pods over which spreading will be calculated + for the incoming pod. The same key is forbidden to exist in both MatchLabelKeys and LabelSelector. + MatchLabelKeys cannot be set when LabelSelector isn't set. + Keys that don't exist in the incoming pod labels will + be ignored. A null or empty list means only match against labelSelector. + + + This is a beta field and requires the MatchLabelKeysInPodTopologySpread feature gate to be enabled (enabled by default). + items: + type: string + type: array + x-kubernetes-list-type: atomic + maxSkew: + description: |- + MaxSkew describes the degree to which pods may be unevenly distributed. + When `whenUnsatisfiable=DoNotSchedule`, it is the maximum permitted difference + between the number of matching pods in the target topology and the global minimum. + The global minimum is the minimum number of matching pods in an eligible domain + or zero if the number of eligible domains is less than MinDomains. + For example, in a 3-zone cluster, MaxSkew is set to 1, and pods with the same + labelSelector spread as 2/2/1: + In this case, the global minimum is 1. + | zone1 | zone2 | zone3 | + | P P | P P | P | + - if MaxSkew is 1, incoming pod can only be scheduled to zone3 to become 2/2/2; + scheduling it onto zone1(zone2) would make the ActualSkew(3-1) on zone1(zone2) + violate MaxSkew(1). + - if MaxSkew is 2, incoming pod can be scheduled onto any zone. + When `whenUnsatisfiable=ScheduleAnyway`, it is used to give higher precedence + to topologies that satisfy it. + It's a required field. Default value is 1 and 0 is not allowed. + format: int32 + type: integer + minDomains: + description: |- + MinDomains indicates a minimum number of eligible domains. + When the number of eligible domains with matching topology keys is less than minDomains, + Pod Topology Spread treats "global minimum" as 0, and then the calculation of Skew is performed. + And when the number of eligible domains with matching topology keys equals or greater than minDomains, + this value has no effect on scheduling. + As a result, when the number of eligible domains is less than minDomains, + scheduler won't schedule more than maxSkew Pods to those domains. + If value is nil, the constraint behaves as if MinDomains is equal to 1. + Valid values are integers greater than 0. + When value is not nil, WhenUnsatisfiable must be DoNotSchedule. + + + For example, in a 3-zone cluster, MaxSkew is set to 2, MinDomains is set to 5 and pods with the same + labelSelector spread as 2/2/2: + | zone1 | zone2 | zone3 | + | P P | P P | P P | + The number of domains is less than 5(MinDomains), so "global minimum" is treated as 0. + In this situation, new pod with the same labelSelector cannot be scheduled, + because computed skew will be 3(3 - 0) if new Pod is scheduled to any of the three zones, + it will violate MaxSkew. + + + This is a beta field and requires the MinDomainsInPodTopologySpread feature gate to be enabled (enabled by default). + format: int32 + type: integer + nodeAffinityPolicy: + description: |- + NodeAffinityPolicy indicates how we will treat Pod's nodeAffinity/nodeSelector + when calculating pod topology spread skew. Options are: + - Honor: only nodes matching nodeAffinity/nodeSelector are included in the calculations. + - Ignore: nodeAffinity/nodeSelector are ignored. All nodes are included in the calculations. + + + If this value is nil, the behavior is equivalent to the Honor policy. + This is a beta-level feature default enabled by the NodeInclusionPolicyInPodTopologySpread feature flag. + type: string + nodeTaintsPolicy: + description: |- + NodeTaintsPolicy indicates how we will treat node taints when calculating + pod topology spread skew. Options are: + - Honor: nodes without taints, along with tainted nodes for which the incoming pod + has a toleration, are included. + - Ignore: node taints are ignored. All nodes are included. + + + If this value is nil, the behavior is equivalent to the Ignore policy. + This is a beta-level feature default enabled by the NodeInclusionPolicyInPodTopologySpread feature flag. + type: string + topologyKey: + description: |- + TopologyKey is the key of node labels. Nodes that have a label with this key + and identical values are considered to be in the same topology. + We consider each as a "bucket", and try to put balanced number + of pods into each bucket. + We define a domain as a particular instance of a topology. + Also, we define an eligible domain as a domain whose nodes meet the requirements of + nodeAffinityPolicy and nodeTaintsPolicy. + e.g. If TopologyKey is "kubernetes.io/hostname", each Node is a domain of that topology. + And, if TopologyKey is "topology.kubernetes.io/zone", each zone is a domain of that topology. + It's a required field. + type: string + whenUnsatisfiable: + description: |- + WhenUnsatisfiable indicates how to deal with a pod if it doesn't satisfy + the spread constraint. + - DoNotSchedule (default) tells the scheduler not to schedule it. + - ScheduleAnyway tells the scheduler to schedule the pod in any location, + but giving higher precedence to topologies that would help reduce the + skew. + A constraint is considered "Unsatisfiable" for an incoming pod + if and only if every possible node assignment for that pod would violate + "MaxSkew" on some topology. + For example, in a 3-zone cluster, MaxSkew is set to 1, and pods with the same + labelSelector spread as 3/1/1: + | zone1 | zone2 | zone3 | + | P P P | P | P | + If WhenUnsatisfiable is set to DoNotSchedule, incoming pod can only be scheduled + to zone2(zone3) to become 3/2/1(3/1/2) as ActualSkew(2-1) on zone2(zone3) satisfies + MaxSkew(1). In other words, the cluster can still be imbalanced, but scheduler + won't make it *more* imbalanced. + It's a required field. + type: string + required: + - maxSkew + - topologyKey + - whenUnsatisfiable + type: object + type: array type: object redisLeader: description: RedisLeader interface will have the redis leader configuration @@ -10017,6 +10557,186 @@ spec: type: string type: object type: array + topologySpreadConstraints: + items: + description: TopologySpreadConstraint specifies how to spread + matching pods among the given topology. + properties: + labelSelector: + description: |- + LabelSelector is used to find matching pods. + Pods that match this label selector are counted to determine the number of pods + in their corresponding topology domain. + properties: + matchExpressions: + description: matchExpressions is a list of label selector + requirements. The requirements are ANDed. + items: + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector + applies to. + type: string + operator: + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic + merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + matchLabelKeys: + description: |- + MatchLabelKeys is a set of pod label keys to select the pods over which + spreading will be calculated. The keys are used to lookup values from the + incoming pod labels, those key-value labels are ANDed with labelSelector + to select the group of existing pods over which spreading will be calculated + for the incoming pod. The same key is forbidden to exist in both MatchLabelKeys and LabelSelector. + MatchLabelKeys cannot be set when LabelSelector isn't set. + Keys that don't exist in the incoming pod labels will + be ignored. A null or empty list means only match against labelSelector. + + + This is a beta field and requires the MatchLabelKeysInPodTopologySpread feature gate to be enabled (enabled by default). + items: + type: string + type: array + x-kubernetes-list-type: atomic + maxSkew: + description: |- + MaxSkew describes the degree to which pods may be unevenly distributed. + When `whenUnsatisfiable=DoNotSchedule`, it is the maximum permitted difference + between the number of matching pods in the target topology and the global minimum. + The global minimum is the minimum number of matching pods in an eligible domain + or zero if the number of eligible domains is less than MinDomains. + For example, in a 3-zone cluster, MaxSkew is set to 1, and pods with the same + labelSelector spread as 2/2/1: + In this case, the global minimum is 1. + | zone1 | zone2 | zone3 | + | P P | P P | P | + - if MaxSkew is 1, incoming pod can only be scheduled to zone3 to become 2/2/2; + scheduling it onto zone1(zone2) would make the ActualSkew(3-1) on zone1(zone2) + violate MaxSkew(1). + - if MaxSkew is 2, incoming pod can be scheduled onto any zone. + When `whenUnsatisfiable=ScheduleAnyway`, it is used to give higher precedence + to topologies that satisfy it. + It's a required field. Default value is 1 and 0 is not allowed. + format: int32 + type: integer + minDomains: + description: |- + MinDomains indicates a minimum number of eligible domains. + When the number of eligible domains with matching topology keys is less than minDomains, + Pod Topology Spread treats "global minimum" as 0, and then the calculation of Skew is performed. + And when the number of eligible domains with matching topology keys equals or greater than minDomains, + this value has no effect on scheduling. + As a result, when the number of eligible domains is less than minDomains, + scheduler won't schedule more than maxSkew Pods to those domains. + If value is nil, the constraint behaves as if MinDomains is equal to 1. + Valid values are integers greater than 0. + When value is not nil, WhenUnsatisfiable must be DoNotSchedule. + + + For example, in a 3-zone cluster, MaxSkew is set to 2, MinDomains is set to 5 and pods with the same + labelSelector spread as 2/2/2: + | zone1 | zone2 | zone3 | + | P P | P P | P P | + The number of domains is less than 5(MinDomains), so "global minimum" is treated as 0. + In this situation, new pod with the same labelSelector cannot be scheduled, + because computed skew will be 3(3 - 0) if new Pod is scheduled to any of the three zones, + it will violate MaxSkew. + + + This is a beta field and requires the MinDomainsInPodTopologySpread feature gate to be enabled (enabled by default). + format: int32 + type: integer + nodeAffinityPolicy: + description: |- + NodeAffinityPolicy indicates how we will treat Pod's nodeAffinity/nodeSelector + when calculating pod topology spread skew. Options are: + - Honor: only nodes matching nodeAffinity/nodeSelector are included in the calculations. + - Ignore: nodeAffinity/nodeSelector are ignored. All nodes are included in the calculations. + + + If this value is nil, the behavior is equivalent to the Honor policy. + This is a beta-level feature default enabled by the NodeInclusionPolicyInPodTopologySpread feature flag. + type: string + nodeTaintsPolicy: + description: |- + NodeTaintsPolicy indicates how we will treat node taints when calculating + pod topology spread skew. Options are: + - Honor: nodes without taints, along with tainted nodes for which the incoming pod + has a toleration, are included. + - Ignore: node taints are ignored. All nodes are included. + + + If this value is nil, the behavior is equivalent to the Ignore policy. + This is a beta-level feature default enabled by the NodeInclusionPolicyInPodTopologySpread feature flag. + type: string + topologyKey: + description: |- + TopologyKey is the key of node labels. Nodes that have a label with this key + and identical values are considered to be in the same topology. + We consider each as a "bucket", and try to put balanced number + of pods into each bucket. + We define a domain as a particular instance of a topology. + Also, we define an eligible domain as a domain whose nodes meet the requirements of + nodeAffinityPolicy and nodeTaintsPolicy. + e.g. If TopologyKey is "kubernetes.io/hostname", each Node is a domain of that topology. + And, if TopologyKey is "topology.kubernetes.io/zone", each zone is a domain of that topology. + It's a required field. + type: string + whenUnsatisfiable: + description: |- + WhenUnsatisfiable indicates how to deal with a pod if it doesn't satisfy + the spread constraint. + - DoNotSchedule (default) tells the scheduler not to schedule it. + - ScheduleAnyway tells the scheduler to schedule the pod in any location, + but giving higher precedence to topologies that would help reduce the + skew. + A constraint is considered "Unsatisfiable" for an incoming pod + if and only if every possible node assignment for that pod would violate + "MaxSkew" on some topology. + For example, in a 3-zone cluster, MaxSkew is set to 1, and pods with the same + labelSelector spread as 3/1/1: + | zone1 | zone2 | zone3 | + | P P P | P | P | + If WhenUnsatisfiable is set to DoNotSchedule, incoming pod can only be scheduled + to zone2(zone3) to become 3/2/1(3/1/2) as ActualSkew(2-1) on zone2(zone3) satisfies + MaxSkew(1). In other words, the cluster can still be imbalanced, but scheduler + won't make it *more* imbalanced. + It's a required field. + type: string + required: + - maxSkew + - topologyKey + - whenUnsatisfiable + type: object + type: array type: object resources: description: ResourceRequirements describes the compute resource requirements. diff --git a/example/v1beta2/topology_spread_constraints/redis-cluster.yaml b/example/v1beta2/topology_spread_constraints/redis-cluster.yaml new file mode 100644 index 000000000..58c53b562 --- /dev/null +++ b/example/v1beta2/topology_spread_constraints/redis-cluster.yaml @@ -0,0 +1,71 @@ +--- +apiVersion: redis.redis.opstreelabs.in/v1beta2 +kind: RedisCluster +metadata: + name: redis-cluster + labels: + clusterId: redis-cluster +spec: + clusterSize: 3 + clusterVersion: v7 + persistenceEnabled: true + podSecurityContext: + runAsUser: 1000 + fsGroup: 1000 + kubernetesConfig: + image: quay.io/opstree/redis:v7.0.12 + imagePullPolicy: IfNotPresent + resources: + requests: + cpu: 101m + memory: 128Mi + limits: + cpu: 101m + memory: 128Mi + # redisSecret: + # name: redis-secret + # key: password + redisLeader: + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + role: leader + clusterId: redis-cluster + redisFollower: + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + role: follower + clusterId: redis-cluster + redisExporter: + enabled: false + image: quay.io/opstree/redis-exporter:v1.44.0 + imagePullPolicy: Always + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 100m + memory: 128Mi + storage: + volumeClaimTemplate: + spec: + # storageClassName: standard + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: 1Gi + nodeConfVolume: true + nodeConfVolumeClaimTemplate: + spec: + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: 1Gi diff --git a/pkg/k8sutils/redis-cluster.go b/pkg/k8sutils/redis-cluster.go index beb6ac9b7..626f20e1b 100644 --- a/pkg/k8sutils/redis-cluster.go +++ b/pkg/k8sutils/redis-cluster.go @@ -24,6 +24,7 @@ type RedisClusterSTS struct { ReadinessProbe *corev1.Probe LivenessProbe *corev1.Probe NodeSelector map[string]string + TopologySpreadConstraints []corev1.TopologySpreadConstraint Tolerations *[]corev1.Toleration } @@ -43,6 +44,7 @@ func generateRedisClusterParams(ctx context.Context, cr *redisv1beta2.RedisClust ClusterMode: true, ClusterVersion: cr.Spec.ClusterVersion, NodeSelector: params.NodeSelector, + TopologySpreadConstraints: params.TopologySpreadConstraints, PodSecurityContext: cr.Spec.PodSecurityContext, PriorityClassName: cr.Spec.PriorityClassName, Affinity: params.Affinity, @@ -219,9 +221,11 @@ func CreateRedisLeader(ctx context.Context, cr *redisv1beta2.RedisCluster, cl ku Affinity: cr.Spec.RedisLeader.Affinity, TerminationGracePeriodSeconds: cr.Spec.RedisLeader.TerminationGracePeriodSeconds, NodeSelector: cr.Spec.RedisLeader.NodeSelector, - Tolerations: cr.Spec.RedisLeader.Tolerations, - ReadinessProbe: cr.Spec.RedisLeader.ReadinessProbe, - LivenessProbe: cr.Spec.RedisLeader.LivenessProbe, + TopologySpreadConstraints: cr.Spec.RedisLeader.TopologySpreadConstraints, + + Tolerations: cr.Spec.RedisLeader.Tolerations, + ReadinessProbe: cr.Spec.RedisLeader.ReadinessProbe, + LivenessProbe: cr.Spec.RedisLeader.LivenessProbe, } if cr.Spec.RedisLeader.RedisConfig != nil { prop.ExternalConfig = cr.Spec.RedisLeader.RedisConfig.AdditionalRedisConfig diff --git a/pkg/k8sutils/redis-cluster_test.go b/pkg/k8sutils/redis-cluster_test.go index 90993475d..b8881fee9 100644 --- a/pkg/k8sutils/redis-cluster_test.go +++ b/pkg/k8sutils/redis-cluster_test.go @@ -174,6 +174,7 @@ func Test_generateRedisClusterParams(t *testing.T) { ReadinessProbe: input.Spec.RedisLeader.ReadinessProbe, LivenessProbe: input.Spec.RedisLeader.LivenessProbe, NodeSelector: input.Spec.RedisLeader.NodeSelector, + TopologySpreadConstraints: input.Spec.RedisLeader.TopologySpreadConstraints, Tolerations: input.Spec.RedisLeader.Tolerations, }) assert.EqualValues(t, expectedLeaderSTS, actualLeaderSTS, "Expected %+v, got %+v", expectedLeaderSTS, actualLeaderSTS) @@ -187,6 +188,7 @@ func Test_generateRedisClusterParams(t *testing.T) { ReadinessProbe: input.Spec.RedisFollower.ReadinessProbe, LivenessProbe: input.Spec.RedisFollower.LivenessProbe, NodeSelector: input.Spec.RedisFollower.NodeSelector, + TopologySpreadConstraints: input.Spec.RedisFollower.TopologySpreadConstraints, Tolerations: input.Spec.RedisFollower.Tolerations, }) assert.EqualValues(t, expectedFollowerSTS, actualFollowerSTS, "Expected %+v, got %+v", expectedFollowerSTS, actualFollowerSTS) diff --git a/pkg/k8sutils/statefulset.go b/pkg/k8sutils/statefulset.go index 63060e1fb..b593b8b95 100644 --- a/pkg/k8sutils/statefulset.go +++ b/pkg/k8sutils/statefulset.go @@ -87,6 +87,7 @@ type statefulSetParameters struct { ClusterVersion *string NodeConfVolume bool NodeSelector map[string]string + TopologySpreadConstraints []corev1.TopologySpreadConstraint PodSecurityContext *corev1.PodSecurityContext PriorityClassName string Affinity *corev1.Affinity @@ -297,6 +298,7 @@ func generateStatefulSetsDef(stsMeta metav1.ObjectMeta, params statefulSetParame sidecars, ), NodeSelector: params.NodeSelector, + TopologySpreadConstraints: params.TopologySpreadConstraints, SecurityContext: params.PodSecurityContext, PriorityClassName: params.PriorityClassName, Affinity: params.Affinity,