Skip to content

Commit b17c225

Browse files
authored
Delete API Gateway if cluster up fails (#1172)
1 parent 1f2390e commit b17c225

File tree

6 files changed

+165
-28
lines changed

6 files changed

+165
-28
lines changed

cli/cmd/cluster.go

+48-10
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,7 @@ var _upCmd = &cobra.Command{
138138
exit.Error(err)
139139
}
140140

141-
err = CreateBucketIfNotFound(awsClient, clusterConfig.Bucket)
141+
err = createBucketIfNotFound(awsClient, clusterConfig.Bucket)
142142
if err != nil {
143143
exit.Error(err)
144144
}
@@ -147,7 +147,7 @@ var _upCmd = &cobra.Command{
147147
exit.Error(err)
148148
}
149149

150-
err = CreateLogGroupIfNotFound(awsClient, clusterConfig.LogGroup)
150+
err = createLogGroupIfNotFound(awsClient, clusterConfig.LogGroup)
151151
if err != nil {
152152
exit.Error(err)
153153
}
@@ -156,16 +156,25 @@ var _upCmd = &cobra.Command{
156156
exit.Error(err)
157157
}
158158

159-
err = createDashboard(awsClient, clusterConfig.ClusterName)
159+
err = createOrClearDashboard(awsClient, clusterConfig.ClusterName)
160+
if err != nil {
161+
exit.Error(err)
162+
}
163+
164+
err = createOrReplaceAPIGateway(awsClient, clusterConfig.ClusterName, clusterConfig.Tags)
160165
if err != nil {
161166
exit.Error(err)
162167
}
163168

164169
out, exitCode, err := runManagerUpdateCommand("/root/install.sh", clusterConfig, awsCreds, _flagClusterEnv)
165170
if err != nil {
171+
awsClient.DeleteAPIGatewayByTag(clusterconfig.ClusterNameTag, clusterConfig.ClusterName) // best effort deletion
172+
awsClient.DeleteVPCLinkByTag(clusterconfig.ClusterNameTag, clusterConfig.ClusterName) // best effort deletion
166173
exit.Error(err)
167174
}
168175
if exitCode == nil || *exitCode != 0 {
176+
awsClient.DeleteAPIGatewayByTag(clusterconfig.ClusterNameTag, clusterConfig.ClusterName) // best effort deletion
177+
awsClient.DeleteVPCLinkByTag(clusterconfig.ClusterNameTag, clusterConfig.ClusterName) // best effort deletion
169178
helpStr := "\nDebugging tips (may or may not apply to this error):"
170179
helpStr += fmt.Sprintf("\n* if your cluster started spinning up but was unable to provision instances, additional error information may be found in the activity history of your cluster's autoscaling groups (select each autoscaling group and click the \"Activity History\" tab): https://console.aws.amazon.com/ec2/autoscaling/home?region=%s#AutoScalingGroups:", *clusterConfig.Region)
171180
helpStr += fmt.Sprintf("\n* if your cluster started spinning up, please ensure that your CloudFormation stacks for this cluster have been fully deleted before trying to spin up this cluster again (you can delete your CloudFormation stacks from the AWS console: %s)", getCloudFormationURL(clusterConfig.ClusterName, *clusterConfig.Region))
@@ -323,11 +332,11 @@ var _downCmd = &cobra.Command{
323332
_, errAPIGateway := awsClient.DeleteAPIGatewayByTag(clusterconfig.ClusterNameTag, *accessConfig.ClusterName)
324333
_, errVPCLink := awsClient.DeleteVPCLinkByTag(clusterconfig.ClusterNameTag, *accessConfig.ClusterName)
325334
if errAPIGateway != nil {
326-
fmt.Print("\n\nunable to delete cortex's api gateway (see error below); if it still exists after the cluster has been deleted, please delete it manually via the api gateway console: https://console.aws.amazon.com/apigateway/main/apis\n")
335+
fmt.Printf("\n\nunable to delete cortex's api gateway (see error below); if it still exists after the cluster has been deleted, please delete it via the api gateway console: https://%s.console.aws.amazon.com/apigateway/main/apis\n", *accessConfig.Region)
327336
errors.PrintError(errAPIGateway)
328337
}
329338
if errVPCLink != nil {
330-
fmt.Print("\n\nunable to delete cortex's vpc link (see error below); if it still exists after the cluster has been deleted, please delete it manually via the api gateway console: https://console.aws.amazon.com/apigateway/main/vpc-links\n")
339+
fmt.Printf("\n\nunable to delete cortex's vpc link (see error below); if it still exists after the cluster has been deleted, please delete it via the api gateway console: https://%s.console.aws.amazon.com/apigateway/main/vpc-links\n", *accessConfig.Region)
331340
errors.PrintError(errVPCLink)
332341
}
333342
if errAPIGateway == nil && errVPCLink == nil {
@@ -339,7 +348,7 @@ var _downCmd = &cobra.Command{
339348
fmt.Print("○ deleting dashboard ")
340349
err = awsClient.DeleteDashboard(*accessConfig.ClusterName)
341350
if err != nil {
342-
fmt.Print("\n\nunable to delete cortex's api dashboard (see error below); if it still exists after the cluster has been deleted, please delete it manually via the cloudwatch console: https://console.aws.amazon.com/cloudwatch/home#dashboards:\n")
351+
fmt.Printf("\n\nunable to delete cortex's api dashboard (see error below); if it still exists after the cluster has been deleted, please delete it via the cloudwatch console: https://%s.console.aws.amazon.com/cloudwatch/home#dashboards:\n", *accessConfig.Region)
343352
errors.PrintError(err)
344353
fmt.Println()
345354
} else {
@@ -716,7 +725,7 @@ func getCloudFormationURLWithAccessConfig(accessConfig *clusterconfig.AccessConf
716725
return getCloudFormationURL(*accessConfig.ClusterName, *accessConfig.Region)
717726
}
718727

719-
func CreateBucketIfNotFound(awsClient *aws.Client, bucket string) error {
728+
func createBucketIfNotFound(awsClient *aws.Client, bucket string) error {
720729
bucketFound, err := awsClient.DoesBucketExist(bucket)
721730
if err != nil {
722731
return err
@@ -725,6 +734,7 @@ func CreateBucketIfNotFound(awsClient *aws.Client, bucket string) error {
725734
fmt.Print("○ creating a new s3 bucket: ", bucket)
726735
err = awsClient.CreateBucket(bucket)
727736
if err != nil {
737+
fmt.Print("\n\n")
728738
return err
729739
}
730740
fmt.Println(" ✓")
@@ -734,7 +744,7 @@ func CreateBucketIfNotFound(awsClient *aws.Client, bucket string) error {
734744
return nil
735745
}
736746

737-
func CreateLogGroupIfNotFound(awsClient *aws.Client, logGroup string) error {
747+
func createLogGroupIfNotFound(awsClient *aws.Client, logGroup string) error {
738748
logGroupFound, err := awsClient.DoesLogGroupExist(logGroup)
739749
if err != nil {
740750
return err
@@ -743,6 +753,7 @@ func CreateLogGroupIfNotFound(awsClient *aws.Client, logGroup string) error {
743753
fmt.Print("○ creating a new cloudwatch log group: ", logGroup)
744754
err = awsClient.CreateLogGroup(logGroup)
745755
if err != nil {
756+
fmt.Print("\n\n")
746757
return err
747758
}
748759
fmt.Println(" ✓")
@@ -753,8 +764,8 @@ func CreateLogGroupIfNotFound(awsClient *aws.Client, logGroup string) error {
753764
return nil
754765
}
755766

756-
// createDashboard creates a new dashboard (or clears an existing one if it already exists)
757-
func createDashboard(awsClient *aws.Client, dashboardName string) error {
767+
// createOrClearDashboard creates a new dashboard (or clears an existing one if it already exists)
768+
func createOrClearDashboard(awsClient *aws.Client, dashboardName string) error {
758769
dashboardFound, err := awsClient.DoesDashboardExist(dashboardName)
759770
if err != nil {
760771
return err
@@ -768,10 +779,37 @@ func createDashboard(awsClient *aws.Client, dashboardName string) error {
768779

769780
err = awsClient.CreateDashboard(dashboardName, consts.DashboardTitle)
770781
if err != nil {
782+
fmt.Print("\n\n")
771783
return err
772784
}
773785

774786
fmt.Println(" ✓")
775787

776788
return nil
777789
}
790+
791+
// createOrReplaceAPIGateway creates an API gateway for the cluster (first deleting any existing VPC link and API gateway tagged with the cluster name)
792+
func createOrReplaceAPIGateway(awsClient *aws.Client, clusterName string, tags map[string]string) error {
793+
fmt.Print("○ creating api gateway: ", clusterName)
794+
795+
_, err := awsClient.DeleteVPCLinkByTag(clusterconfig.ClusterNameTag, clusterName)
796+
if err != nil {
797+
fmt.Print("\n\n")
798+
return errors.Append(err, fmt.Sprintf("\n\nunable to delete existing vpc link with tag %s=%s; please delete it via the api gateway console: https://%s.console.aws.amazon.com/apigateway/main/vpc-links", clusterconfig.ClusterNameTag, clusterName, awsClient.Region))
799+
}
800+
801+
_, err = awsClient.DeleteAPIGatewayByTag(clusterconfig.ClusterNameTag, clusterName)
802+
if err != nil {
803+
fmt.Print("\n\n")
804+
return errors.Append(err, fmt.Sprintf("\n\nunable to delete existing api gateway with tag %s=%s; please delete it via the api gateway console: https://%s.console.aws.amazon.com/apigateway/main/apis", clusterconfig.ClusterNameTag, clusterName, awsClient.Region))
805+
}
806+
807+
_, err = awsClient.CreateAPIGateway(clusterName, tags)
808+
if err != nil {
809+
fmt.Print("\n\n")
810+
return err
811+
}
812+
813+
fmt.Println(" ✓")
814+
return nil
815+
}

manager/get_api_gateway_endpoint.py

+36
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# Copyright 2020 Cortex Labs, Inc.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import boto3
16+
import os
17+
18+
19+
def get_api_gateway_endpoint():
20+
cluster_name = os.environ["CORTEX_CLUSTER_NAME"]
21+
region = os.environ["CORTEX_REGION"]
22+
client_apigateway = boto3.client("apigatewayv2", region_name=region)
23+
24+
paginator = client_apigateway.get_paginator("get_apis")
25+
for api_gateway_page in paginator.paginate():
26+
for api_gateway in api_gateway_page["Items"]:
27+
if api_gateway["Tags"].get("cortex.dev/cluster-name") == cluster_name:
28+
return api_gateway["ApiEndpoint"]
29+
30+
raise Exception(
31+
f"your cluster's api gateway (in {region} with tag cortex.dev/cluster-name={cluster_name}) does not exist"
32+
)
33+
34+
35+
if __name__ == "__main__":
36+
print(get_api_gateway_endpoint(), end="")

manager/get_api_gateway_id.py

+36
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# Copyright 2020 Cortex Labs, Inc.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import boto3
16+
import os
17+
18+
19+
def get_api_gateway_id():
20+
cluster_name = os.environ["CORTEX_CLUSTER_NAME"]
21+
region = os.environ["CORTEX_REGION"]
22+
client_apigateway = boto3.client("apigatewayv2", region_name=region)
23+
24+
paginator = client_apigateway.get_paginator("get_apis")
25+
for api_gateway_page in paginator.paginate():
26+
for api_gateway in api_gateway_page["Items"]:
27+
if api_gateway["Tags"].get("cortex.dev/cluster-name") == cluster_name:
28+
return api_gateway["ApiId"]
29+
30+
raise Exception(
31+
f"your cluster's api gateway (in {region} with tag cortex.dev/cluster-name={cluster_name}) does not exist"
32+
)
33+
34+
35+
if __name__ == "__main__":
36+
print(get_api_gateway_id(), end="")

manager/info.sh

+1-1
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ function get_api_load_balancer_endpoint() {
2525
}
2626

2727
function get_api_gateway_endpoint() {
28-
aws apigatewayv2 get-apis --region $CORTEX_REGION | jq ".Items[] | select(.Name == \"${CORTEX_CLUSTER_NAME}\") | .ApiEndpoint" | tr -d '"'
28+
python get_api_gateway_endpoint.py
2929
}
3030

3131
if ! eksctl utils describe-stacks --cluster=$CORTEX_CLUSTER_NAME --region=$CORTEX_REGION >/dev/null 2>&1; then

manager/install.sh

+1-12
Original file line numberDiff line numberDiff line change
@@ -163,18 +163,6 @@ function ensure_eks() {
163163
function main() {
164164
mkdir -p $CORTEX_CLUSTER_WORKSPACE
165165

166-
# create API Gateway
167-
if [ "$arg1" != "--update" ]; then
168-
create_api_output=$(aws apigatewayv2 create-api --tags $CORTEX_TAGS --region $CORTEX_REGION --name $CORTEX_CLUSTER_NAME --protocol-type HTTP)
169-
api_id=$(echo $create_api_output | jq .ApiId | tr -d '"')
170-
if [ "$api_id" = "" ] || [ "$api_id" = "null" ]; then
171-
echo -e "unable to extract api gateway ID from create-api output:\n$create_api_output"
172-
exit 1
173-
fi
174-
# create default stage; ignore error because default stage is supposed to be already created, but currently it isn't because of a possible bug in create-api
175-
aws apigatewayv2 create-stage --region $CORTEX_REGION --tags $CORTEX_TAGS --api-id $api_id --auto-deploy --stage-name \$default &>/dev/null || true
176-
fi
177-
178166
# create cluster (if it doesn't already exist)
179167
ensure_eks
180168

@@ -261,6 +249,7 @@ function main() {
261249
# add VPC Link integration to API Gateway
262250
if [ "$arg1" != "--update" ] && [ "$CORTEX_API_LOAD_BALANCER_SCHEME" == "internal" ]; then
263251
echo -n "○ creating api gateway vpc link integration "
252+
api_id=$(python get_api_gateway_id.py)
264253
python create_gateway_integration.py $api_id $vpc_link_id
265254
echo ""
266255
echo -n "○ waiting for api gateway vpc link integration "

pkg/lib/aws/apigateway.go

+43-5
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,34 @@ import (
2525
"github.com/cortexlabs/cortex/pkg/lib/errors"
2626
)
2727

28+
// CreateAPIGateway creates a new HTTP API Gateway with an auto-deploy $default stage and returns the new API's ID
29+
func (c *Client) CreateAPIGateway(name string, tags map[string]string) (string, error) {
30+
createAPIResponse, err := c.APIGatewayV2().CreateApi(&apigatewayv2.CreateApiInput{
31+
Name: aws.String(name),
32+
ProtocolType: aws.String(apigatewayv2.ProtocolTypeHttp),
33+
Tags: aws.StringMap(tags),
34+
})
35+
if err != nil {
36+
return "", errors.Wrap(err, "failed to create api gateway")
37+
}
38+
if createAPIResponse.ApiId == nil {
39+
return "", errors.ErrorUnexpected("failed to create api gateway")
40+
}
41+
42+
_, err = c.APIGatewayV2().CreateStage(&apigatewayv2.CreateStageInput{
43+
ApiId: createAPIResponse.ApiId,
44+
AutoDeploy: aws.Bool(true),
45+
StageName: aws.String("$default"),
46+
Tags: aws.StringMap(tags),
47+
})
48+
if err != nil {
49+
c.DeleteAPIGateway(*createAPIResponse.ApiId) // best effort cleanup
50+
return "", errors.Wrap(err, "failed to create $default api gateway stage")
51+
}
52+
53+
return *createAPIResponse.ApiId, nil
54+
}
55+
2856
// GetVPCLinkByTag Gets a VPC Link by tag (returns nil if there are no matches)
2957
func (c *Client) GetVPCLinkByTag(tagName string, tagValue string) (*apigatewayv2.VpcLink, error) {
3058
var nextToken *string
@@ -111,20 +139,30 @@ func (c *Client) DeleteAPIGatewayByTag(tagName string, tagValue string) (*apigat
111139
return nil, nil
112140
}
113141

114-
// Delete mappings in case user added a custom domain name (otherwise this will block API Gateway deletion)
115-
err = c.DeleteAPIGatewayMappings(*apiGateway.ApiId)
142+
err = c.DeleteAPIGateway(*apiGateway.ApiId)
116143
if err != nil {
117144
return nil, err
118145
}
119146

147+
return apiGateway, nil
148+
}
149+
150+
// DeleteAPIGateway deletes an API Gateway by ID, removing its API mappings first (returns an error if the API Gateway does not exist)
151+
func (c *Client) DeleteAPIGateway(apiGatewayID string) error {
152+
// Delete mappings in case user added a custom domain name (otherwise this will block API Gateway deletion)
153+
err := c.DeleteAPIGatewayMappings(apiGatewayID)
154+
if err != nil {
155+
return err
156+
}
157+
120158
_, err = c.APIGatewayV2().DeleteApi(&apigatewayv2.DeleteApiInput{
121-
ApiId: apiGateway.ApiId,
159+
ApiId: aws.String(apiGatewayID),
122160
})
123161
if err != nil {
124-
return nil, errors.Wrap(err, "failed to delete api gateway "+*apiGateway.ApiId)
162+
return errors.Wrap(err, "failed to delete api gateway "+apiGatewayID)
125163
}
126164

127-
return apiGateway, nil
165+
return nil
128166
}
129167

130168
// DeleteAPIGatewayMappingsForDomainName deletes all API mappings that point to the provided api gateway from the provided domain name

0 commit comments

Comments
 (0)