
Commit 416da0b

Add and remove exclude-from-external-load-balancers label (#582)
* add and remove exclude-from-external-load-balancers label
* move dry run after checking the label's value
* update helm and helm README
* wording nit
1 parent 35ced6c commit 416da0b

7 files changed: +89 -5 lines changed

config/helm/aws-node-termination-handler/README.md

Lines changed: 1 addition & 0 deletions
@@ -76,6 +76,7 @@ The configuration in this table applies to all AWS Node Termination Handler mode
 | `dryRun` | If `true`, only log if a node would be drained. | `false` |
 | `cordonOnly` | If `true`, nodes will be cordoned but not drained when an interruption event occurs. | `false` |
 | `taintNode` | If `true`, nodes will be tainted when an interruption event occurs. Currently used taint keys are `aws-node-termination-handler/scheduled-maintenance`, `aws-node-termination-handler/spot-itn`, `aws-node-termination-handler/asg-lifecycle-termination` and `aws-node-termination-handler/rebalance-recommendation`. | `false` |
+| `excludeFromLoadBalancers` | If `true`, nodes will be marked for exclusion from load balancers before they are cordoned. This applies the `node.kubernetes.io/exclude-from-external-load-balancers` label to enable the ServiceNodeExclusion feature gate. The label will not be modified or removed for nodes that already have it. | `false` |
 | `deleteLocalData` | If `true`, continue even if there are pods using local data that will be deleted when the node is drained. | `true` |
 | `ignoreDaemonSets` | If `true`, skip terminating daemon set managed pods. | `true` |
 | `podTerminationGracePeriod` | The time in seconds given to each pod to terminate gracefully. If negative, the default value specified in the pod will be used, which defaults to 30 seconds if not specified for the pod. | `-1` |
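Not part of this commit, but for a concrete picture of what the option does to a node, here is a minimal client-go sketch that checks whether a node already carries the exclusion label. It assumes a local kubeconfig and uses a placeholder node name; when the label is already present, NTH leaves it untouched, as the table entry above notes.

```go
// Minimal sketch (not part of the chart): inspect a node for the exclusion label.
package main

import (
	"context"
	"fmt"
	"os"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
)

const excludeLabel = "node.kubernetes.io/exclude-from-external-load-balancers"

func main() {
	// Assumes KUBECONFIG points at a valid kubeconfig file.
	cfg, err := clientcmd.BuildConfigFromFlags("", os.Getenv("KUBECONFIG"))
	if err != nil {
		panic(err)
	}
	client, err := kubernetes.NewForConfig(cfg)
	if err != nil {
		panic(err)
	}
	// "my-node" is a placeholder; substitute a real node name.
	node, err := client.CoreV1().Nodes().Get(context.TODO(), "my-node", metav1.GetOptions{})
	if err != nil {
		panic(err)
	}
	if val, ok := node.Labels[excludeLabel]; ok {
		fmt.Printf("node is excluded from external load balancers (label value %q)\n", val)
	} else {
		fmt.Println("node is not excluded from external load balancers")
	}
}
```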

config/helm/aws-node-termination-handler/templates/daemonset.linux.yaml

Lines changed: 2 additions & 0 deletions
@@ -97,6 +97,8 @@ spec:
   value: {{ .Values.cordonOnly | quote }}
 - name: TAINT_NODE
   value: {{ .Values.taintNode | quote }}
+- name: EXCLUDE_FROM_LOAD_BALANCERS
+  value: {{ .Values.excludeFromLoadBalancers | quote }}
 - name: DELETE_LOCAL_DATA
   value: {{ .Values.deleteLocalData | quote }}
 - name: IGNORE_DAEMON_SETS

config/helm/aws-node-termination-handler/templates/daemonset.windows.yaml

Lines changed: 2 additions & 0 deletions
@@ -97,6 +97,8 @@ spec:
   value: {{ .Values.cordonOnly | quote }}
 - name: TAINT_NODE
   value: {{ .Values.taintNode | quote }}
+- name: EXCLUDE_FROM_LOAD_BALANCERS
+  value: {{ .Values.excludeFromLoadBalancers | quote }}
 - name: DELETE_LOCAL_DATA
   value: {{ .Values.deleteLocalData | quote }}
 - name: IGNORE_DAEMON_SETS

config/helm/aws-node-termination-handler/templates/deployment.yaml

Lines changed: 2 additions & 0 deletions
@@ -94,6 +94,8 @@ spec:
   value: {{ .Values.cordonOnly | quote }}
 - name: TAINT_NODE
   value: {{ .Values.taintNode | quote }}
+- name: EXCLUDE_FROM_LOAD_BALANCERS
+  value: {{ .Values.excludeFromLoadBalancers | quote }}
 - name: DELETE_LOCAL_DATA
   value: {{ .Values.deleteLocalData | quote }}
 - name: IGNORE_DAEMON_SETS

config/helm/aws-node-termination-handler/values.yaml

Lines changed: 3 additions & 0 deletions
@@ -81,6 +81,9 @@ cordonOnly: false
 # Taint node upon spot interruption termination notice.
 taintNode: false
 
+# Exclude node from load balancer before cordoning via the ServiceNodeExclusion feature gate.
+excludeFromLoadBalancers: false
+
 # deleteLocalData tells kubectl to continue even if there are pods using
 # emptyDir (local data that will be deleted when the node is drained).
 deleteLocalData: true

pkg/config/config.go

Lines changed: 6 additions & 0 deletions
@@ -69,6 +69,7 @@ const (
     taintNode = "TAINT_NODE"
     taintEffectDefault = "NoSchedule"
     taintEffect = "TAINT_EFFECT"
+    excludeFromLoadBalancers = "EXCLUDE_FROM_LOAD_BALANCERS"
     jsonLoggingConfigKey = "JSON_LOGGING"
     jsonLoggingDefault = false
     logLevelConfigKey = "LOG_LEVEL"

@@ -126,6 +127,7 @@ type Config struct {
     CordonOnly bool
     TaintNode bool
     TaintEffect string
+    ExcludeFromLoadBalancers bool
     JsonLogging bool
     LogLevel string
     UptimeFromFile string

@@ -179,6 +181,7 @@ func ParseCliArgs() (config Config, err error) {
     flag.BoolVar(&config.CordonOnly, "cordon-only", getBoolEnv(cordonOnly, false), "If true, nodes will be cordoned but not drained when an interruption event occurs.")
     flag.BoolVar(&config.TaintNode, "taint-node", getBoolEnv(taintNode, false), "If true, nodes will be tainted when an interruption event occurs.")
     flag.StringVar(&config.TaintEffect, "taint-effect", getEnv(taintEffect, taintEffectDefault), "Sets the effect when a node is tainted.")
+    flag.BoolVar(&config.ExcludeFromLoadBalancers, "exclude-from-load-balancers", getBoolEnv(excludeFromLoadBalancers, false), "If true, nodes will be marked for exclusion from load balancers when an interruption event occurs.")
     flag.BoolVar(&config.JsonLogging, "json-logging", getBoolEnv(jsonLoggingConfigKey, jsonLoggingDefault), "If true, use JSON-formatted logs instead of human readable logs.")
     flag.StringVar(&config.LogLevel, "log-level", getEnv(logLevelConfigKey, logLevelDefault), "Sets the log level (INFO, DEBUG, or ERROR)")
     flag.StringVar(&config.UptimeFromFile, "uptime-from-file", getEnv(uptimeFromFileConfigKey, uptimeFromFileDefault), "If specified, read system uptime from the file path (useful for testing).")

@@ -254,6 +257,7 @@ func (c Config) PrintJsonConfigArgs() {
         Bool("cordon_only", c.CordonOnly).
         Bool("taint_node", c.TaintNode).
         Str("taint_effect", c.TaintEffect).
+        Bool("exclude_from_load_balancers", c.ExcludeFromLoadBalancers).
         Bool("json_logging", c.JsonLogging).
         Str("log_level", c.LogLevel).
         Str("webhook_proxy", c.WebhookProxy).

@@ -298,6 +302,7 @@ func (c Config) PrintHumanConfigArgs() {
         "\tcordon-only: %t,\n"+
         "\ttaint-node: %t,\n"+
         "\ttaint-effect: %s,\n"+
+        "\texclude-from-load-balancers: %t,\n"+
         "\tjson-logging: %t,\n"+
         "\tlog-level: %s,\n"+
         "\twebhook-proxy: %s,\n"+

@@ -333,6 +338,7 @@ func (c Config) PrintHumanConfigArgs() {
     c.CordonOnly,
     c.TaintNode,
     c.TaintEffect,
+    c.ExcludeFromLoadBalancers,
     c.JsonLogging,
     c.LogLevel,
     c.WebhookProxy,
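`getBoolEnv` and `getEnv` are existing helpers in `pkg/config` that are not shown in this diff. As a rough sketch only, an env-var-backed boolean default such as `EXCLUDE_FROM_LOAD_BALANCERS` can be wired into `flag.BoolVar` as below; the real helper may differ, for example in how it reports values that fail to parse.

```go
// Rough sketch of an env-var-backed boolean flag default; getBoolEnv's real
// implementation lives elsewhere in pkg/config and may behave differently on bad input.
package main

import (
	"flag"
	"fmt"
	"os"
	"strconv"
)

// getBoolEnv returns the boolean value of the named environment variable,
// or def when the variable is unset or cannot be parsed.
func getBoolEnv(key string, def bool) bool {
	raw, ok := os.LookupEnv(key)
	if !ok {
		return def
	}
	val, err := strconv.ParseBool(raw)
	if err != nil {
		return def
	}
	return val
}

func main() {
	var excludeFromLoadBalancers bool
	// The environment variable seeds the default; the CLI flag can still override it.
	flag.BoolVar(&excludeFromLoadBalancers, "exclude-from-load-balancers",
		getBoolEnv("EXCLUDE_FROM_LOAD_BALANCERS", false),
		"If true, nodes will be marked for exclusion from load balancers when an interruption event occurs.")
	flag.Parse()
	fmt.Println("exclude-from-load-balancers:", excludeFromLoadBalancers)
}
```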

pkg/node/node.go

Lines changed: 73 additions & 5 deletions
@@ -44,6 +44,11 @@ const (
     ActionLabelTimeKey = "aws-node-termination-handler/action-time"
     // EventIDLabelKey is a k8s label key whose value is the drainable event id
     EventIDLabelKey = "aws-node-termination-handler/event-id"
+    // Apply this label to enable the ServiceNodeExclusion feature gate for excluding nodes from load balancers
+    ExcludeFromLoadBalancersLabelKey = "node.kubernetes.io/exclude-from-external-load-balancers"
+    // The value associated with this label is irrelevant for enabling the feature gate
+    // By defining a unique value it is possible to check if the label was applied by us before removing it
+    ExcludeFromLoadBalancersLabelValue = "aws-node-termination-handler"
 )
 
 const (

@@ -95,7 +100,11 @@ func (n Node) CordonAndDrain(nodeName string, reason string) error {
         log.Info().Str("node_name", nodeName).Str("reason", reason).Msg("Node would have been cordoned and drained, but dry-run flag was set.")
         return nil
     }
-    err := n.Cordon(nodeName, reason)
+    err := n.MaybeMarkForExclusionFromLoadBalancers(nodeName)
+    if err != nil {
+        return err
+    }
+    err = n.Cordon(nodeName, reason)
     if err != nil {
         return err
     }

@@ -161,13 +170,26 @@ func (n Node) IsUnschedulable(nodeName string) (bool, error) {
 
 // MarkWithEventID will add the drain event ID to the node to be properly ignored after a system restart event
 func (n Node) MarkWithEventID(nodeName string, eventID string) error {
-    err := n.addLabel(nodeName, EventIDLabelKey, eventID)
+    err := n.addLabel(nodeName, EventIDLabelKey, eventID, false)
     if err != nil {
         return fmt.Errorf("Unable to label node with event ID %s=%s: %w", EventIDLabelKey, eventID, err)
     }
     return nil
 }
 
+// MaybeMarkForExclusionFromLoadBalancers will activate the ServiceNodeExclusion feature flag to indicate that the node should be removed from load balancers
+func (n Node) MaybeMarkForExclusionFromLoadBalancers(nodeName string) error {
+    if !n.nthConfig.ExcludeFromLoadBalancers {
+        log.Debug().Msg("Not marking for exclusion from load balancers because the configuration flag is not set")
+        return nil
+    }
+    err := n.addLabel(nodeName, ExcludeFromLoadBalancersLabelKey, ExcludeFromLoadBalancersLabelValue, true)
+    if err != nil {
+        return fmt.Errorf("Unable to label node for exclusion from load balancers: %w", err)
+    }
+    return nil
+}
+
 // RemoveNTHLabels will remove all the custom NTH labels added to the node
 func (n Node) RemoveNTHLabels(nodeName string) error {
     for _, label := range []string{EventIDLabelKey, ActionLabelKey, ActionLabelTimeKey} {

@@ -176,6 +198,10 @@ func (n Node) RemoveNTHLabels(nodeName string) error {
             return fmt.Errorf("Unable to remove %s from node: %w", label, err)
         }
     }
+    err := n.removeLabelIfValueMatches(nodeName, ExcludeFromLoadBalancersLabelKey, ExcludeFromLoadBalancersLabelValue)
+    if err != nil {
+        return fmt.Errorf("Unable to remove %s from node: %w", ExcludeFromLoadBalancersLabelKey, err)
+    }
     return nil
 }
 

@@ -199,12 +225,12 @@ func (n Node) GetEventID(nodeName string) (string, error) {
 // MarkForUncordonAfterReboot adds labels to the kubernetes node which NTH will read upon reboot
 func (n Node) MarkForUncordonAfterReboot(nodeName string) error {
     // adds label to node so that the system will uncordon the node after the scheduled reboot has taken place
-    err := n.addLabel(nodeName, ActionLabelKey, UncordonAfterRebootLabelVal)
+    err := n.addLabel(nodeName, ActionLabelKey, UncordonAfterRebootLabelVal, false)
     if err != nil {
         return fmt.Errorf("Unable to label node with action to uncordon after system-reboot: %w", err)
     }
     // adds label with the current time which is checked against the uptime of the node when processing labels on startup
-    err = n.addLabel(nodeName, ActionLabelTimeKey, strconv.FormatInt(time.Now().Unix(), 10))
+    err = n.addLabel(nodeName, ActionLabelTimeKey, strconv.FormatInt(time.Now().Unix(), 10), false)
     if err != nil {
         // if time can't be recorded, rollback the action label
         err := n.removeLabel(nodeName, ActionLabelKey)

@@ -218,7 +244,8 @@ func (n Node) MarkForUncordonAfterReboot(nodeName string) error {
 }
 
 // addLabel will add a label to the node given a label key and value
-func (n Node) addLabel(nodeName string, key string, value string) error {
+// Specifying true for the skipExisting parameter will skip adding the label if it already exists
+func (n Node) addLabel(nodeName string, key string, value string, skipExisting bool) error {
     type metadata struct {
         Labels map[string]string `json:"labels"`
     }

@@ -240,6 +267,12 @@ func (n Node) addLabel(nodeName string, key string, value string) error {
     if err != nil {
         return err
     }
+    if skipExisting {
+        _, ok := node.ObjectMeta.Labels[key]
+        if ok {
+            return nil
+        }
+    }
     if n.nthConfig.DryRun {
         log.Info().Msgf("Would have added label (%s=%s) to node %s, but dry-run flag was set", key, value, nodeName)
         return nil

@@ -282,6 +315,41 @@ func (n Node) removeLabel(nodeName string, key string) error {
     return nil
 }
 
+// removeLabelIfValueMatches will remove a node label given a label key provided the label's value equals matchValue
+func (n Node) removeLabelIfValueMatches(nodeName string, key string, matchValue string) error {
+    type patchRequest struct {
+        Op   string `json:"op"`
+        Path string `json:"path"`
+    }
+
+    var patchReqs []interface{}
+    patchRemove := patchRequest{
+        Op:   "remove",
+        Path: fmt.Sprintf("/metadata/labels/%s", jsonPatchEscape(key)),
+    }
+    payload, err := json.Marshal(append(patchReqs, patchRemove))
+    if err != nil {
+        return fmt.Errorf("An error occurred while marshalling the json to remove a label from the node: %w", err)
+    }
+    node, err := n.fetchKubernetesNode(nodeName)
+    if err != nil {
+        return err
+    }
+    val, ok := node.ObjectMeta.Labels[key]
+    if !ok || val == matchValue {
+        return nil
+    }
+    if n.nthConfig.DryRun {
+        log.Info().Msgf("Would have removed label with key %s from node %s, but dry-run flag was set", key, nodeName)
+        return nil
+    }
+    _, err = n.drainHelper.Client.CoreV1().Nodes().Patch(context.TODO(), node.Name, types.JSONPatchType, payload, metav1.PatchOptions{})
+    if err != nil {
+        return fmt.Errorf("%v node Patch failed when removing a label from the node: %w", node.Name, err)
+    }
+    return nil
+}
+
 // GetNodeLabels will fetch node labels for a given nodeName
 func (n Node) GetNodeLabels(nodeName string) (map[string]string, error) {
     if n.nthConfig.DryRun {
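To experiment with the patch mechanics outside the handler, here is a self-contained sketch that applies and then conditionally removes the exclusion label against the client-go fake clientset, so it runs without a cluster. The add step uses a JSON Patch `add` op purely for symmetry with the remove step; the committed `addLabel` builds its own metadata payload, and the real calls go through `n.drainHelper.Client`.

```go
// Standalone sketch of the label add/remove patch mechanics; not the handler's code path.
package main

import (
	"context"
	"encoding/json"
	"fmt"
	"strings"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/client-go/kubernetes/fake"
)

const (
	labelKey   = "node.kubernetes.io/exclude-from-external-load-balancers"
	labelValue = "aws-node-termination-handler"
)

// jsonPatchEscape encodes "~" and "/" per RFC 6901 so the label key fits in a JSON Pointer path.
func jsonPatchEscape(s string) string {
	return strings.ReplaceAll(strings.ReplaceAll(s, "~", "~0"), "/", "~1")
}

func main() {
	client := fake.NewSimpleClientset(&corev1.Node{
		ObjectMeta: metav1.ObjectMeta{
			Name:   "node-1",
			Labels: map[string]string{"kubernetes.io/hostname": "node-1"},
		},
	})
	nodes := client.CoreV1().Nodes()

	// Mark the node for exclusion by adding the label.
	addPatch, _ := json.Marshal([]map[string]string{{
		"op":    "add",
		"path":  "/metadata/labels/" + jsonPatchEscape(labelKey),
		"value": labelValue,
	}})
	if _, err := nodes.Patch(context.TODO(), "node-1", types.JSONPatchType, addPatch, metav1.PatchOptions{}); err != nil {
		panic(err)
	}

	// Remove the label only if it still holds the value written above.
	node, err := nodes.Get(context.TODO(), "node-1", metav1.GetOptions{})
	if err != nil {
		panic(err)
	}
	if val, ok := node.Labels[labelKey]; ok && val == labelValue {
		removePatch, _ := json.Marshal([]map[string]string{{
			"op":   "remove",
			"path": "/metadata/labels/" + jsonPatchEscape(labelKey),
		}})
		if _, err := nodes.Patch(context.TODO(), "node-1", types.JSONPatchType, removePatch, metav1.PatchOptions{}); err != nil {
			panic(err)
		}
	}

	node, _ = nodes.Get(context.TODO(), "node-1", metav1.GetOptions{})
	fmt.Println("labels after add/remove round trip:", node.Labels)
}
```

Writing a distinct value (`aws-node-termination-handler`) is what lets the removal step tell a label NTH applied apart from one set by another controller, which is the rationale given in the constant's comment above.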
