Skip to content

Commit b8dd267

Browse files
authored
Add support for Classic load balancer (ELB) for the API load balancer (#2413)
1 parent 241565b commit b8dd267

22 files changed

+1306
-87
lines changed

cli/cmd/cluster.go

+55-13
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ import (
2929
"github.com/aws/aws-sdk-go/service/autoscaling"
3030
"github.com/aws/aws-sdk-go/service/ec2"
3131
"github.com/aws/aws-sdk-go/service/eks"
32+
"github.com/aws/aws-sdk-go/service/elb"
3233
"github.com/aws/aws-sdk-go/service/elbv2"
3334
"github.com/aws/aws-sdk-go/service/s3"
3435
"github.com/cortexlabs/cortex/cli/cluster"
@@ -302,7 +303,7 @@ var _clusterUpCmd = &cobra.Command{
302303
exit.Error(ErrorClusterUp(out + helpStr))
303304
}
304305

305-
loadBalancer, err := getLoadBalancer(clusterConfig.ClusterName, OperatorLoadBalancer, awsClient)
306+
loadBalancer, err := getNLBLoadBalancer(clusterConfig.ClusterName, OperatorLoadBalancer, awsClient)
306307
if err != nil {
307308
exit.Error(errors.Append(err, fmt.Sprintf("\n\nyou can attempt to resolve this issue and configure your cli environment by running `cortex cluster info --configure-env %s`", envName)))
308309
}
@@ -522,7 +523,7 @@ var _clusterDownCmd = &cobra.Command{
522523
}
523524

524525
// updating CLI env is best-effort, so ignore errors
525-
loadBalancer, _ := getLoadBalancer(accessConfig.ClusterName, OperatorLoadBalancer, awsClient)
526+
loadBalancer, _ := getNLBLoadBalancer(accessConfig.ClusterName, OperatorLoadBalancer, awsClient)
526527

527528
fmt.Print("○ deleting sqs queues ... ")
528529
numDeleted, err := awsClient.DeleteQueuesWithPrefix(clusterconfig.SQSNamePrefix(accessConfig.ClusterName))
@@ -735,7 +736,7 @@ var _clusterExportCmd = &cobra.Command{
735736
exit.Error(err)
736737
}
737738

738-
loadBalancer, err := getLoadBalancer(accessConfig.ClusterName, OperatorLoadBalancer, awsClient)
739+
loadBalancer, err := getNLBLoadBalancer(accessConfig.ClusterName, OperatorLoadBalancer, awsClient)
739740
if err != nil {
740741
exit.Error(err)
741742
}
@@ -881,17 +882,27 @@ func cmdPrintConfig(awsClient *awslib.Client, accessConfig *clusterconfig.Access
881882
func cmdInfo(awsClient *awslib.Client, accessConfig *clusterconfig.AccessConfig, stacks clusterstate.ClusterStacks, outputType flags.OutputType, disallowPrompt bool) {
882883
clusterConfig := refreshCachedClusterConfig(awsClient, accessConfig, outputType == flags.PrettyOutputType)
883884

884-
operatorLoadBalancer, err := getLoadBalancer(accessConfig.ClusterName, OperatorLoadBalancer, awsClient)
885+
operatorLoadBalancer, err := getNLBLoadBalancer(accessConfig.ClusterName, OperatorLoadBalancer, awsClient)
885886
if err != nil {
886887
exit.Error(err)
887888
}
888-
apiLoadBalancer, err := getLoadBalancer(accessConfig.ClusterName, APILoadBalancer, awsClient)
889-
if err != nil {
890-
exit.Error(err)
891-
}
892-
893889
operatorEndpoint := s.EnsurePrefix(*operatorLoadBalancer.DNSName, "https://")
894-
apiEndpoint := *apiLoadBalancer.DNSName
890+
891+
var apiEndpoint string
892+
if clusterConfig.APILoadBalancerType == clusterconfig.NLBLoadBalancerType {
893+
apiLoadBalancer, err := getNLBLoadBalancer(accessConfig.ClusterName, APILoadBalancer, awsClient)
894+
if err != nil {
895+
exit.Error(err)
896+
}
897+
apiEndpoint = *apiLoadBalancer.DNSName
898+
}
899+
if clusterConfig.APILoadBalancerType == clusterconfig.ELBLoadBalancerType {
900+
apiLoadBalancer, err := getELBLoadBalancer(accessConfig.ClusterName, APILoadBalancer, awsClient)
901+
if err != nil {
902+
exit.Error(err)
903+
}
904+
apiEndpoint = *apiLoadBalancer.DNSName
905+
}
895906

896907
if outputType == flags.JSONOutputType || outputType == flags.YAMLOutputType {
897908
infoResponse, err := getInfoOperatorResponse(operatorEndpoint)
@@ -983,8 +994,17 @@ func printInfoPricing(infoResponse *schema.InfoResponse, clusterConfig clusterco
983994
prometheusEBSPrice := awslib.EBSMetadatas[clusterConfig.Region]["gp3"].PriceGB * 20 / 30 / 24
984995
metricsEBSPrice := awslib.EBSMetadatas[clusterConfig.Region]["gp2"].PriceGB * (40 + 2) / 30 / 24
985996
nlbPrice := awslib.NLBMetadatas[clusterConfig.Region].Price
997+
elbPrice := awslib.ELBMetadatas[clusterConfig.Region].Price
986998
natUnitPrice := awslib.NATMetadatas[clusterConfig.Region].Price
987999

1000+
var loadBalancersPrice float64
1001+
usesELBForAPILoadBalancer := clusterConfig.APILoadBalancerType == clusterconfig.ELBLoadBalancerType
1002+
if usesELBForAPILoadBalancer {
1003+
loadBalancersPrice = nlbPrice + elbPrice
1004+
} else {
1005+
loadBalancersPrice = 2 * nlbPrice
1006+
}
1007+
9881008
headers := []table.Header{
9891009
{Title: "aws resource"},
9901010
{Title: "cost per hour"},
@@ -1033,12 +1053,17 @@ func printInfoPricing(infoResponse *schema.InfoResponse, clusterConfig clusterco
10331053
} else if clusterConfig.NATGateway == clusterconfig.HighlyAvailableNATGateway {
10341054
natTotalPrice = natUnitPrice * float64(len(clusterConfig.AvailabilityZones))
10351055
}
1036-
totalPrice := eksPrice + totalNodeGroupsPrice + operatorNodeGroupPrice + prometheusNodeGroupPrice + nlbPrice*2 + natTotalPrice
1056+
totalPrice := eksPrice + totalNodeGroupsPrice + operatorNodeGroupPrice + prometheusNodeGroupPrice + loadBalancersPrice + natTotalPrice
10371057
fmt.Printf(console.Bold("\nyour cluster currently costs %s per hour\n\n"), s.DollarsAndCents(totalPrice))
10381058

10391059
rows = append(rows, []interface{}{fmt.Sprintf("%d t3.medium %s (cortex system)", len(infoResponse.OperatorNodeInfos), s.PluralS("instance", len(infoResponse.OperatorNodeInfos))), s.DollarsAndTenthsOfCents(operatorNodeGroupPrice) + " total"})
10401060
rows = append(rows, []interface{}{fmt.Sprintf("1 %s instance (prometheus)", clusterConfig.PrometheusInstanceType), s.DollarsAndTenthsOfCents(prometheusNodeGroupPrice)})
1041-
rows = append(rows, []interface{}{"2 network load balancers", s.DollarsMaxPrecision(nlbPrice*2) + " total"})
1061+
if usesELBForAPILoadBalancer {
1062+
rows = append(rows, []interface{}{"1 network load balancer", s.DollarsMaxPrecision(nlbPrice)})
1063+
rows = append(rows, []interface{}{"1 classic load balancer", s.DollarsMaxPrecision(elbPrice)})
1064+
} else {
1065+
rows = append(rows, []interface{}{"2 network load balancers", s.DollarsMaxPrecision(loadBalancersPrice) + " total"})
1066+
}
10421067

10431068
if clusterConfig.NATGateway == clusterconfig.SingleNATGateway {
10441069
rows = append(rows, []interface{}{"1 nat gateway", s.DollarsMaxPrecision(natUnitPrice)})
@@ -1366,7 +1391,24 @@ func (lb LoadBalancer) String() string {
13661391
}
13671392

13681393
// Will return error if the load balancer can't be found
1369-
func getLoadBalancer(clusterName string, whichLB LoadBalancer, awsClient *awslib.Client) (*elbv2.LoadBalancer, error) {
1394+
func getNLBLoadBalancer(clusterName string, whichLB LoadBalancer, awsClient *awslib.Client) (*elbv2.LoadBalancer, error) {
1395+
loadBalancer, err := awsClient.FindLoadBalancerV2(map[string]string{
1396+
clusterconfig.ClusterNameTag: clusterName,
1397+
"cortex.dev/load-balancer": whichLB.String(),
1398+
})
1399+
if err != nil {
1400+
return nil, errors.Wrap(err, fmt.Sprintf("unable to locate %s load balancer", whichLB.String()))
1401+
}
1402+
1403+
if loadBalancer == nil {
1404+
return nil, ErrorNoOperatorLoadBalancer(whichLB.String())
1405+
}
1406+
1407+
return loadBalancer, nil
1408+
}
1409+
1410+
// Will return error if the load balancer can't be found
1411+
func getELBLoadBalancer(clusterName string, whichLB LoadBalancer, awsClient *awslib.Client) (*elb.LoadBalancerDescription, error) {
13701412
loadBalancer, err := awsClient.FindLoadBalancer(map[string]string{
13711413
clusterconfig.ClusterNameTag: clusterName,
13721414
"cortex.dev/load-balancer": whichLB.String(),

cli/cmd/lib_cluster_config.go

+16-2
Original file line numberDiff line numberDiff line change
@@ -169,8 +169,17 @@ func confirmInstallClusterConfig(clusterConfig *clusterconfig.Config, awsClient
169169
prometheusEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp3"].PriceGB * 20 / 30 / 24
170170
metricsEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp2"].PriceGB * (40 + 2) / 30 / 24
171171
nlbPrice := aws.NLBMetadatas[clusterConfig.Region].Price
172+
elbPrice := aws.ELBMetadatas[clusterConfig.Region].Price
172173
natUnitPrice := aws.NATMetadatas[clusterConfig.Region].Price
173174

175+
var loadBalancersPrice float64
176+
usesELBForAPILoadBalancer := clusterConfig.APILoadBalancerType == clusterconfig.ELBLoadBalancerType
177+
if usesELBForAPILoadBalancer {
178+
loadBalancersPrice = nlbPrice + elbPrice
179+
} else {
180+
loadBalancersPrice = 2 * nlbPrice
181+
}
182+
174183
var natTotalPrice float64
175184
if clusterConfig.NATGateway == clusterconfig.SingleNATGateway {
176185
natTotalPrice = natUnitPrice
@@ -187,7 +196,7 @@ func confirmInstallClusterConfig(clusterConfig *clusterconfig.Config, awsClient
187196
rows = append(rows, []interface{}{"1 eks cluster", s.DollarsMaxPrecision(eksPrice)})
188197

189198
ngNameToSpotInstancesUsed := map[string]int{}
190-
fixedPrice := eksPrice + 2*(operatorInstancePrice+operatorEBSPrice) + prometheusInstancePrice + prometheusEBSPrice + metricsEBSPrice + 2*nlbPrice + natTotalPrice
199+
fixedPrice := eksPrice + 2*(operatorInstancePrice+operatorEBSPrice) + prometheusInstancePrice + prometheusEBSPrice + metricsEBSPrice + loadBalancersPrice + natTotalPrice
191200
totalMinPrice := fixedPrice
192201
totalMaxPrice := fixedPrice
193202
for _, ng := range clusterConfig.NodeGroups {
@@ -236,7 +245,12 @@ func confirmInstallClusterConfig(clusterConfig *clusterconfig.Config, awsClient
236245
prometheusNodeGroupPrice := prometheusInstancePrice + prometheusEBSPrice + metricsEBSPrice
237246
rows = append(rows, []interface{}{"2 t3.medium instances (cortex system)", s.DollarsAndTenthsOfCents(operatorNodeGroupPrice) + " total"})
238247
rows = append(rows, []interface{}{fmt.Sprintf("1 %s instance (prometheus)", clusterConfig.PrometheusInstanceType), s.DollarsAndTenthsOfCents(prometheusNodeGroupPrice)})
239-
rows = append(rows, []interface{}{"2 network load balancers", s.DollarsMaxPrecision(nlbPrice) + " each"})
248+
if usesELBForAPILoadBalancer {
249+
rows = append(rows, []interface{}{"1 network load balancer", s.DollarsMaxPrecision(nlbPrice)})
250+
rows = append(rows, []interface{}{"1 classic load balancer", s.DollarsMaxPrecision(elbPrice)})
251+
} else {
252+
rows = append(rows, []interface{}{"2 network load balancers", s.DollarsMaxPrecision(loadBalancersPrice) + " total"})
253+
}
240254

241255
if clusterConfig.NATGateway == clusterconfig.SingleNATGateway {
242256
rows = append(rows, []interface{}{"1 nat gateway", s.DollarsMaxPrecision(natUnitPrice)})

docs/clusters/management/create.md

+3
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,9 @@ subnet_visibility: public
5858
# NAT gateway (required when using private subnets) [none | single | highly_available (a NAT gateway per availability zone)]
5959
nat_gateway: none
6060

61+
# API load balancer type [nlb | elb]
62+
api_load_balancer_type: nlb
63+
6164
# API load balancer scheme [internet-facing | internal]
6265
api_load_balancer_scheme: internet-facing
6366

docs/clusters/management/production.md

+4
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,10 @@ operator_load_balancer_cidr_white_list: [0.0.0.0/0]
4545

4646
See [here](../networking/load-balancers.md) for more information about the load balancers.
4747

48+
### Workload load-balancing
49+
50+
Depending on your application's requirements, you may need a different type of API load balancer for your cluster. By default, the API load balancer is a [Network Load Balancer](https://docs.aws.amazon.com/elasticloadbalancing/latest/network/introduction.html) (NLB). In some situations, a [Classic Load Balancer](https://docs.aws.amazon.com/elasticloadbalancing/latest/classic/introduction.html) (ELB) may be preferred; it can be selected by setting `api_load_balancer_type: elb` in your cluster configuration file. This setting can only be chosen before creating your cluster.
51+
4852
### Ensure node provisioning
4953

5054
You can take advantage of the cost savings of spot instances and the reliability of on-demand instances by utilizing the `priority` field in node groups. You can deploy two node groups, one that is spot and another that is on-demand. Set the priority of the spot node group to be higher than the priority of the on-demand node group. This encourages the cluster-autoscaler to try to spin up instances from the spot node group first. If there are no more spot instances available, the on-demand node group will be used instead.

docs/clusters/networking/load-balancers.md

+2
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,5 @@ All APIs share a single API load balancer. By default, the API load balancer is
77
The SSL certificate on the API load balancer is autogenerated during installation using `localhost` as the Common Name (CN). Therefore, clients will need to skip certificate verification when making HTTPS requests to your APIs (e.g. `curl -k https://***`), or make HTTP requests instead (e.g. `curl http://***`). Alternatively, you can enable HTTPS by using a [custom domain](custom-domain.md) and setting up [https](https.md) or by [creating an API Gateway](api-gateway.md) to forward requests to your API load balancer.
88

99
There is a separate load balancer for the Cortex operator. By default, the operator load balancer is public. You can configure your operator load balancer to be private by setting `operator_load_balancer_scheme: internal` in your cluster configuration file (before creating your cluster). You can use [VPC Peering](vpc-peering.md) to enable your Cortex CLI to connect to your cluster operator from another VPC. You can enforce that incoming requests to the Cortex operator must originate from specific ip address ranges by specifying `operator_load_balancer_cidr_white_list: [<CIDR list>]` in your cluster configuration.
10+
11+
By default, the API load balancer and the operator load balancer are both [Network Load Balancers](https://docs.aws.amazon.com/elasticloadbalancing/latest/network/introduction.html) (NLB). The API load balancer can instead be configured as a [Classic Load Balancer](https://docs.aws.amazon.com/elasticloadbalancing/latest/classic/introduction.html) (ELB) by setting `api_load_balancer_type: elb` in your cluster configuration file. The API load balancer type must be specified before creating your cluster.

docs/overview.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ Cortex uses the Kubernetes Cluster Autoscaler to scale the appropriate node grou
1414

1515
By default, a new dedicated VPC is created for the cluster during installation.
1616

17-
Two network load balancers (NLBs) are created to route traffic to the cluster. One load balancer is dedicated for traffic to your APIs, and the other load balancer is dedicated for API management requests to Cortex from your CLI or Python client. Traffic to the load balancers can be secured and restricted based on your cluster configuration.
17+
Two AWS load balancers are created to route traffic to the cluster. One load balancer is dedicated for traffic to your APIs, and the other load balancer is dedicated for API management requests to Cortex from your CLI or Python client. Traffic to the load balancers can be secured and restricted based on your cluster configuration.
1818

1919
### Observability
2020

manager/get_api_load_balancer_state.py

+11-6
Original file line numberDiff line numberDiff line change
@@ -15,17 +15,22 @@
1515
import boto3
1616
import os
1717

18-
from helpers import get_api_load_balancer
18+
from helpers import get_api_load_balancer_v2, get_api_load_balancer, get_api_load_balancer_health
1919

2020

2121
def get_api_load_balancer_state():
2222
cluster_name = os.environ["CORTEX_CLUSTER_NAME"]
2323
region = os.environ["CORTEX_REGION"]
24-
25-
client_elbv2 = boto3.client("elbv2", region_name=region)
26-
27-
load_balancer = get_api_load_balancer(cluster_name, client_elbv2)
28-
return load_balancer["State"]["Code"]
24+
load_balancer_type = os.environ["CORTEX_API_LOAD_BALANCER_TYPE"]
25+
26+
if load_balancer_type == "nlb":
27+
client_elbv2 = boto3.client("elbv2", region_name=region)
28+
load_balancer = get_api_load_balancer_v2(cluster_name, client_elbv2)
29+
return load_balancer["State"]["Code"]
30+
else:
31+
client_elb = boto3.client("elb", region_name=region)
32+
load_balancer = get_api_load_balancer(cluster_name, client_elb)
33+
return get_api_load_balancer_health(load_balancer["LoadBalancerName"], client_elb)
2934

3035

3136
if __name__ == "__main__":

manager/get_operator_load_balancer_state.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
import boto3
1616
import os
1717

18-
from helpers import get_operator_load_balancer
18+
from helpers import get_operator_load_balancer_v2
1919

2020

2121
def get_operator_load_balancer_state():
@@ -24,7 +24,7 @@ def get_operator_load_balancer_state():
2424

2525
client_elbv2 = boto3.client("elbv2", region_name=region)
2626

27-
load_balancer = get_operator_load_balancer(cluster_name, client_elbv2)
27+
load_balancer = get_operator_load_balancer_v2(cluster_name, client_elbv2)
2828
return load_balancer["State"]["Code"]
2929

3030

manager/get_operator_target_group_status.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
import os
1717
import json
1818

19-
from helpers import get_operator_load_balancer
19+
from helpers import get_operator_load_balancer_v2
2020

2121

2222
def get_operator_target_group_status():
@@ -25,7 +25,7 @@ def get_operator_target_group_status():
2525

2626
client_elbv2 = boto3.client("elbv2", region_name=region)
2727

28-
load_balancer_arn = get_operator_load_balancer(cluster_name, client_elbv2)["LoadBalancerArn"]
28+
load_balancer_arn = get_operator_load_balancer_v2(cluster_name, client_elbv2)["LoadBalancerArn"]
2929
target_group_arn = get_load_balancer_https_target_group_arn(load_balancer_arn, client_elbv2)
3030
return get_target_health(target_group_arn, client_elbv2)
3131

manager/helpers.py

+43-5
Original file line numberDiff line numberDiff line change
@@ -13,15 +13,29 @@
1313
# limitations under the License.
1414

1515

16-
def get_operator_load_balancer(cluster_name, client_elbv2):
17-
return _get_load_balancer("operator", cluster_name, client_elbv2)
16+
def get_operator_load_balancer_v2(cluster_name, client_elbv2):
    """Return the cluster's load balancer whose load-balancer tag is "operator".

    Delegates to ``_get_load_balancer_v2``; ``client_elbv2`` is passed through unchanged.
    """
    return _get_load_balancer_v2(
        load_balancer_tag="operator",
        cluster_name=cluster_name,
        client_elbv2=client_elbv2,
    )
1818

1919

20-
def get_api_load_balancer(cluster_name, client_elbv2):
21-
return _get_load_balancer("api", cluster_name, client_elbv2)
20+
def get_api_load_balancer_v2(cluster_name, client_elbv2):
    """Return the cluster's load balancer whose load-balancer tag is "api".

    Delegates to ``_get_load_balancer_v2``; ``client_elbv2`` is passed through unchanged.
    """
    return _get_load_balancer_v2(
        load_balancer_tag="api",
        cluster_name=cluster_name,
        client_elbv2=client_elbv2,
    )
2222

2323

24-
def _get_load_balancer(load_balancer_tag, cluster_name, client_elbv2):
24+
def get_api_load_balancer(cluster_name, client_elb):
    """Return the cluster's Classic load balancer whose load-balancer tag is "api".

    Delegates to ``_get_load_balancer``; ``client_elb`` is passed through unchanged.
    """
    return _get_load_balancer(
        load_balancer_tag="api",
        cluster_name=cluster_name,
        client_elb=client_elb,
    )
26+
27+
28+
def get_api_load_balancer_health(load_balancer_name, client_elb):
    """Summarize the instance health of a Classic load balancer as a single state string.

    Args:
        load_balancer_name: name passed as ``LoadBalancerName`` to ``describe_instance_health``.
        client_elb: client exposing ``describe_instance_health`` (callers pass a boto3 "elb" client).

    Returns:
        "active" when at least one instance is reported and every reported instance is
        "InService"; "inactive" otherwise.
    """
    instance_health = client_elb.describe_instance_health(
        LoadBalancerName=load_balancer_name,
    )
    instance_states = instance_health["InstanceStates"]

    # A load balancer with no reported instances cannot serve traffic yet, so it must
    # not be reported as active just because no instance is out of service.
    if not instance_states:
        return "inactive"

    if all(instance_state["State"] == "InService" for instance_state in instance_states):
        return "active"
    return "inactive"
36+
37+
38+
def _get_load_balancer_v2(load_balancer_tag, cluster_name, client_elbv2):
2539
paginator = client_elbv2.get_paginator("describe_load_balancers")
2640
for load_balancer_page in paginator.paginate(PaginationConfig={"PageSize": 20}):
2741
load_balancers = {
@@ -43,3 +57,27 @@ def _get_load_balancer(load_balancer_tag, cluster_name, client_elbv2):
4357
return load_balancers[tag_description["ResourceArn"]]
4458

4559
raise Exception(f"unable to find {load_balancer_tag} load balancer")
60+
61+
62+
def _get_load_balancer(load_balancer_tag, cluster_name, client_elb):
    """Find the Classic load balancer tagged for the given cluster and role.

    Pages through ``describe_load_balancers`` 20 entries at a time, fetches the tags for
    each page with ``describe_tags``, and returns the first load balancer that carries both
    ``cortex.dev/cluster-name == cluster_name`` and
    ``cortex.dev/load-balancer == load_balancer_tag``.

    Args:
        load_balancer_tag: expected value of the ``cortex.dev/load-balancer`` tag.
        cluster_name: expected value of the ``cortex.dev/cluster-name`` tag.
        client_elb: client exposing ``get_paginator`` and ``describe_tags``
            (callers pass a boto3 "elb" client).

    Returns:
        The matching entry of ``LoadBalancerDescriptions``.

    Raises:
        Exception: if no load balancer carries both tags.
    """
    paginator = client_elb.get_paginator("describe_load_balancers")
    for load_balancer_page in paginator.paginate(PaginationConfig={"PageSize": 20}):
        load_balancers = {
            load_balancer["LoadBalancerName"]: load_balancer
            for load_balancer in load_balancer_page["LoadBalancerDescriptions"]
        }
        if not load_balancers:
            # describe_tags requires at least one load balancer name; calling it with an
            # empty list would fail and hide the "unable to find" error raised below.
            continue

        tag_descriptions = client_elb.describe_tags(LoadBalancerNames=list(load_balancers.keys()))[
            "TagDescriptions"
        ]
        for tag_description in tag_descriptions:
            found_cluster_name_tag = False
            found_load_balancer_tag = False
            for tag in tag_description["Tags"]:
                if tag["Key"] == "cortex.dev/cluster-name" and tag["Value"] == cluster_name:
                    found_cluster_name_tag = True
                if tag["Key"] == "cortex.dev/load-balancer" and tag["Value"] == load_balancer_tag:
                    found_load_balancer_tag = True
            if found_cluster_name_tag and found_load_balancer_tag:
                return load_balancers[tag_description["LoadBalancerName"]]

    raise Exception(f"unable to find {load_balancer_tag} load balancer")

manager/manifests/istio.yaml.j2

+1-1
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ spec:
109109
istio: ingressgateway-apis
110110
k8s:
111111
serviceAnnotations:
112-
service.beta.kubernetes.io/aws-load-balancer-type: "nlb"
112+
service.beta.kubernetes.io/aws-load-balancer-type: "{{ env['CORTEX_API_LOAD_BALANCER_TYPE'] }}"
113113
service.beta.kubernetes.io/aws-load-balancer-cross-zone-load-balancing-enabled: "true"
114114
service.beta.kubernetes.io/aws-load-balancer-additional-resource-tags: "{{ env['CORTEX_API_LOAD_BALANCER_TAGS'] }}"
115115
service.beta.kubernetes.io/aws-load-balancer-backend-protocol: "tcp"

0 commit comments

Comments
 (0)