Skip to content

Commit 9c79dd3

Browse files
authored
Use accessconfig to determine clusterstate (#892)
1 parent 7dbda44 commit 9c79dd3

File tree

3 files changed

+71
-50
lines changed

3 files changed

+71
-50
lines changed

pkg/lib/aws/cloudformation.go

+3 -4
Original file line number | Diff line number | Diff line change
@@ -22,10 +22,9 @@ import (
2222
"github.com/cortexlabs/cortex/pkg/lib/sets/strset"
2323
)
2424

25-
func (c *Client) ListEKSStacks(controlPlaneStack string, nodegroupStacks ...string) ([]*cloudformation.StackSummary, error) {
25+
func (c *Client) ListEKSStacks(controlPlaneStackName string, nodegroupStackNames strset.Set) ([]*cloudformation.StackSummary, error) {
2626
var stackSummaries []*cloudformation.StackSummary
27-
stackSet := strset.New(nodegroupStacks...)
28-
stackSet.Add(controlPlaneStack)
27+
stackSet := strset.Union(nodegroupStackNames, strset.New(controlPlaneStackName))
2928
err := c.CloudFormation().ListStacksPages(
3029
&cloudformation.ListStacksInput{},
3130
func(listStackOutput *cloudformation.ListStacksOutput, lastPage bool) bool {
@@ -34,7 +33,7 @@ func (c *Client) ListEKSStacks(controlPlaneStack string, nodegroupStacks ...stri
3433
stackSummaries = append(stackSummaries, stackSummary)
3534
}
3635

37-
if *stackSummary.StackName == controlPlaneStack {
36+
if *stackSummary.StackName == controlPlaneStackName {
3837
return false
3938
}
4039
}

pkg/types/clusterstate/clusterstate.go

+60 -39
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,7 @@ package clusterstate
1818

1919
import (
2020
"fmt"
21+
"time"
2122

2223
"github.com/aws/aws-sdk-go/service/cloudformation"
2324
"github.com/cortexlabs/cortex/pkg/lib/aws"
@@ -42,8 +43,16 @@ type ClusterState struct {
4243
Status Status
4344
}
4445

45-
func any(statuses []string, allowedStatuses ...string) bool {
46+
func is(status string, allowedStatus string, allowedStatuses ...string) bool {
4647
statusSet := strset.New(allowedStatuses...)
48+
statusSet.Add(allowedStatus)
49+
50+
return statusSet.Has(status)
51+
}
52+
53+
func any(statuses []string, allowedStatus string, allowedStatuses ...string) bool {
54+
statusSet := strset.New(allowedStatuses...)
55+
statusSet.Add(allowedStatus)
4756
for _, stackStatus := range statuses {
4857
if statusSet.Has(stackStatus) {
4958
return true
@@ -53,8 +62,9 @@ func any(statuses []string, allowedStatuses ...string) bool {
5362
return false
5463
}
5564

56-
func all(statuses []string, allowedStatuses ...string) bool {
65+
func all(statuses []string, allowedStatus string, allowedStatuses ...string) bool {
5766
statusSet := strset.New(allowedStatuses...)
67+
statusSet.Add(allowedStatus)
5868
for _, stackStatus := range statuses {
5969
if !statusSet.Has(stackStatus) {
6070
return false
@@ -76,9 +86,8 @@ func (cs ClusterState) TableString() string {
7686

7787
func getStatus(statusMap map[string]string, controlPlane string) (Status, error) {
7888
// the order matters
79-
8089
allStatuses := []string{}
81-
controlPlaneStatus := []string{statusMap[controlPlane]}
90+
controlPlaneStatus := statusMap[controlPlane]
8291
nodeGroupStatuses := []string{}
8392

8493
for stackName, status := range statusMap {
@@ -88,6 +97,19 @@ func getStatus(statusMap map[string]string, controlPlane string) (Status, error)
8897
}
8998
}
9099

100+
if any(allStatuses, string(StatusCreateFailedTimedOut)) {
101+
return StatusNotFound, ErrorUnexpectedCloudFormationStatus(s.ObjFlat(statusMap))
102+
}
103+
104+
if len(nodeGroupStatuses) == 0 && controlPlaneStatus == string(StatusNotFound) {
105+
return StatusNotFound, nil
106+
}
107+
108+
// controlplane stack may be created while nodegroup stacks aren't listed in cloudformation stacks during cluster spin up
109+
if len(nodeGroupStatuses) == 0 && is(controlPlaneStatus, cloudformation.StackStatusCreateComplete, cloudformation.StackStatusCreateInProgress) {
110+
return StatusCreateInProgress, nil
111+
}
112+
91113
if any(allStatuses, cloudformation.StackStatusCreateFailed) {
92114
return StatusCreateFailed, nil
93115
}
@@ -96,8 +118,8 @@ func getStatus(statusMap map[string]string, controlPlane string) (Status, error)
96118
return StatusDeleteFailed, nil
97119
}
98120

99-
if all(allStatuses, string(StatusNotFound)) {
100-
return StatusCreateComplete, nil
121+
if any(allStatuses, cloudformation.StackStatusDeleteInProgress) {
122+
return StatusDeleteInProgress, nil
101123
}
102124

103125
if all(allStatuses, cloudformation.StackStatusCreateComplete) {
@@ -108,45 +130,54 @@ func getStatus(statusMap map[string]string, controlPlane string) (Status, error)
108130
return StatusDeleteComplete, nil
109131
}
110132

111-
if any(allStatuses, cloudformation.StackStatusDeleteInProgress) {
133+
// nodegroup stacks are deleted first while control plane stack is still in create complete state
134+
if controlPlaneStatus == cloudformation.StackStatusCreateComplete &&
135+
all(nodeGroupStatuses, cloudformation.StackStatusDeleteInProgress, cloudformation.StackStatusDeleteComplete) {
112136
return StatusDeleteInProgress, nil
113137
}
114138

115-
// controlplane stack may be in complete state while nodegroup stacks are still in status not found
116-
if all(controlPlaneStatus, cloudformation.StackStatusCreateComplete, cloudformation.StackStatusCreateInProgress) &&
117-
all(nodeGroupStatuses, cloudformation.StackStatusCreateInProgress, string(StatusNotFound), cloudformation.StackStatusCreateComplete) {
139+
// controlplane stack may be in complete state while nodegroup stacks are still in creating or one nodegroup finishes before the other
140+
if controlPlaneStatus == cloudformation.StackStatusCreateComplete &&
141+
all(nodeGroupStatuses, cloudformation.StackStatusCreateInProgress, cloudformation.StackStatusCreateComplete) {
118142
return StatusCreateInProgress, nil
119143
}
120144

121145
return StatusNotFound, ErrorUnexpectedCloudFormationStatus(s.ObjFlat(statusMap))
122146
}
123147

124-
func GetClusterState(awsClient *aws.Client, clusterConfig *clusterconfig.Config) (*ClusterState, error) {
125-
controlPlaneStackName := fmt.Sprintf(controlPlaneTemplate, clusterConfig.ClusterName)
126-
operatorStackName := fmt.Sprintf(operatorTemplate, clusterConfig.ClusterName)
127-
spotStackName := fmt.Sprintf(spotTemplate, clusterConfig.ClusterName)
128-
onDemandStackName := fmt.Sprintf(onDemandTemplate, clusterConfig.ClusterName)
129-
130-
nodeGroupStackNames := []string{operatorStackName}
131-
if clusterConfig.Spot != nil && *clusterConfig.Spot {
132-
nodeGroupStackNames = append(nodeGroupStackNames, spotStackName)
133-
if clusterConfig.SpotConfig != nil && clusterConfig.SpotConfig.OnDemandBackup != nil && *clusterConfig.SpotConfig.OnDemandBackup {
134-
nodeGroupStackNames = append(nodeGroupStackNames, onDemandStackName)
135-
}
136-
} else {
137-
nodeGroupStackNames = append(nodeGroupStackNames, onDemandStackName)
138-
}
148+
func GetClusterState(awsClient *aws.Client, accessConfig *clusterconfig.AccessConfig) (*ClusterState, error) {
149+
controlPlaneStackName := fmt.Sprintf(controlPlaneTemplate, *accessConfig.ClusterName)
150+
operatorStackName := fmt.Sprintf(operatorTemplate, *accessConfig.ClusterName)
151+
spotStackName := fmt.Sprintf(spotTemplate, *accessConfig.ClusterName)
152+
onDemandStackName := fmt.Sprintf(onDemandTemplate, *accessConfig.ClusterName)
139153

140-
stackSummaries, err := awsClient.ListEKSStacks(controlPlaneStackName, nodeGroupStackNames...)
154+
nodeGroupStackNamesSet := strset.New(operatorStackName, spotStackName, onDemandStackName)
155+
156+
stackSummaries, err := awsClient.ListEKSStacks(controlPlaneStackName, nodeGroupStackNamesSet)
141157
if err != nil {
142158
return nil, errors.Wrap(err, "unable to get cluster state from cloudformation")
143159
}
144160

145161
statusMap := map[string]string{}
146-
statusMap[controlPlaneStackName] = getStatusFromSummaries(stackSummaries, controlPlaneStackName)
162+
nodeGroupStackNames := []string{}
163+
var controlPlaneCreationTime time.Time
164+
165+
for _, stackSummary := range stackSummaries {
166+
statusMap[*stackSummary.StackName] = *stackSummary.StackStatus
167+
if *stackSummary.StackName == controlPlaneStackName {
168+
controlPlaneCreationTime = *stackSummary.CreationTime
169+
} else {
170+
nodeGroupStackNames = append(nodeGroupStackNames, *stackSummary.StackName)
171+
}
172+
}
147173

148-
for _, nodeGroupName := range nodeGroupStackNames {
149-
statusMap[nodeGroupName] = getStatusFromSummaries(stackSummaries, nodeGroupName)
174+
if _, ok := statusMap[controlPlaneStackName]; !ok {
175+
statusMap[controlPlaneStackName] = string(StatusNotFound)
176+
}
177+
178+
// add a timeout for situations where the control plane is listed in the cloudformation stacks but not the nodegroup stacks
179+
if !is(statusMap[controlPlaneStackName], string(StatusNotFound), cloudformation.StackStatusDeleteComplete) && len(nodeGroupStackNames) == 0 && time.Now().After(controlPlaneCreationTime.Add(30*time.Minute)) {
180+
statusMap[operatorStackName] = string(StatusCreateFailedTimedOut)
150181
}
151182

152183
status, err := getStatus(statusMap, controlPlaneStackName)
@@ -161,13 +192,3 @@ func GetClusterState(awsClient *aws.Client, clusterConfig *clusterconfig.Config)
161192
Status: status,
162193
}, nil
163194
}
164-
165-
func getStatusFromSummaries(stackSummaries []*cloudformation.StackSummary, stackName string) string {
166-
for _, stackSummary := range stackSummaries {
167-
if *stackSummary.StackName == stackName {
168-
return *stackSummary.StackStatus
169-
}
170-
}
171-
172-
return string(StatusNotFound)
173-
}

pkg/types/clusterstate/status.go

+8 -7
Original file line number | Diff line number | Diff line change
@@ -19,11 +19,12 @@ package clusterstate
1919
type Status string
2020

2121
const (
22-
StatusNotFound Status = "not_found"
23-
StatusCreateInProgress Status = "create_in_progress"
24-
StatusCreateFailed Status = "create_failed"
25-
StatusCreateComplete Status = "create_complete"
26-
StatusDeleteInProgress Status = "delete_in_progress"
27-
StatusDeleteComplete Status = "delete_complete"
28-
StatusDeleteFailed Status = "delete_failed"
22+
StatusNotFound Status = "not_found"
23+
StatusCreateInProgress Status = "create_in_progress"
24+
StatusCreateFailed Status = "create_failed"
25+
StatusCreateComplete Status = "create_complete"
26+
StatusDeleteInProgress Status = "delete_in_progress"
27+
StatusDeleteComplete Status = "delete_complete"
28+
StatusDeleteFailed Status = "delete_failed"
29+
StatusCreateFailedTimedOut Status = "create_failed_timed_out"
2930
)

0 commit comments

Comments
 (0)