Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for Classic load balancer (ELB) for the API load balancer #2413

Merged
merged 11 commits into from
Jan 7, 2022
68 changes: 55 additions & 13 deletions cli/cmd/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import (
"github.com/aws/aws-sdk-go/service/autoscaling"
"github.com/aws/aws-sdk-go/service/ec2"
"github.com/aws/aws-sdk-go/service/eks"
"github.com/aws/aws-sdk-go/service/elb"
"github.com/aws/aws-sdk-go/service/elbv2"
"github.com/aws/aws-sdk-go/service/s3"
"github.com/cortexlabs/cortex/cli/cluster"
Expand Down Expand Up @@ -302,7 +303,7 @@ var _clusterUpCmd = &cobra.Command{
exit.Error(ErrorClusterUp(out + helpStr))
}

loadBalancer, err := getLoadBalancer(clusterConfig.ClusterName, OperatorLoadBalancer, awsClient)
loadBalancer, err := getNLBLoadBalancer(clusterConfig.ClusterName, OperatorLoadBalancer, awsClient)
if err != nil {
exit.Error(errors.Append(err, fmt.Sprintf("\n\nyou can attempt to resolve this issue and configure your cli environment by running `cortex cluster info --configure-env %s`", envName)))
}
Expand Down Expand Up @@ -522,7 +523,7 @@ var _clusterDownCmd = &cobra.Command{
}

// updating CLI env is best-effort, so ignore errors
loadBalancer, _ := getLoadBalancer(accessConfig.ClusterName, OperatorLoadBalancer, awsClient)
loadBalancer, _ := getNLBLoadBalancer(accessConfig.ClusterName, OperatorLoadBalancer, awsClient)

fmt.Print("○ deleting sqs queues ... ")
numDeleted, err := awsClient.DeleteQueuesWithPrefix(clusterconfig.SQSNamePrefix(accessConfig.ClusterName))
Expand Down Expand Up @@ -735,7 +736,7 @@ var _clusterExportCmd = &cobra.Command{
exit.Error(err)
}

loadBalancer, err := getLoadBalancer(accessConfig.ClusterName, OperatorLoadBalancer, awsClient)
loadBalancer, err := getNLBLoadBalancer(accessConfig.ClusterName, OperatorLoadBalancer, awsClient)
if err != nil {
exit.Error(err)
}
Expand Down Expand Up @@ -881,17 +882,27 @@ func cmdPrintConfig(awsClient *awslib.Client, accessConfig *clusterconfig.Access
func cmdInfo(awsClient *awslib.Client, accessConfig *clusterconfig.AccessConfig, stacks clusterstate.ClusterStacks, outputType flags.OutputType, disallowPrompt bool) {
clusterConfig := refreshCachedClusterConfig(awsClient, accessConfig, outputType == flags.PrettyOutputType)

operatorLoadBalancer, err := getLoadBalancer(accessConfig.ClusterName, OperatorLoadBalancer, awsClient)
operatorLoadBalancer, err := getNLBLoadBalancer(accessConfig.ClusterName, OperatorLoadBalancer, awsClient)
if err != nil {
exit.Error(err)
}
apiLoadBalancer, err := getLoadBalancer(accessConfig.ClusterName, APILoadBalancer, awsClient)
if err != nil {
exit.Error(err)
}

operatorEndpoint := s.EnsurePrefix(*operatorLoadBalancer.DNSName, "https://")
apiEndpoint := *apiLoadBalancer.DNSName

var apiEndpoint string
if clusterConfig.APILoadBalancerType == clusterconfig.NLBLoadBalancerType {
apiLoadBalancer, err := getNLBLoadBalancer(accessConfig.ClusterName, APILoadBalancer, awsClient)
if err != nil {
exit.Error(err)
}
apiEndpoint = *apiLoadBalancer.DNSName
}
if clusterConfig.APILoadBalancerType == clusterconfig.ELBLoadBalancerType {
apiLoadBalancer, err := getELBLoadBalancer(accessConfig.ClusterName, APILoadBalancer, awsClient)
if err != nil {
exit.Error(err)
}
apiEndpoint = *apiLoadBalancer.DNSName
}

if outputType == flags.JSONOutputType || outputType == flags.YAMLOutputType {
infoResponse, err := getInfoOperatorResponse(operatorEndpoint)
Expand Down Expand Up @@ -983,8 +994,17 @@ func printInfoPricing(infoResponse *schema.InfoResponse, clusterConfig clusterco
prometheusEBSPrice := awslib.EBSMetadatas[clusterConfig.Region]["gp3"].PriceGB * 20 / 30 / 24
metricsEBSPrice := awslib.EBSMetadatas[clusterConfig.Region]["gp2"].PriceGB * (40 + 2) / 30 / 24
nlbPrice := awslib.NLBMetadatas[clusterConfig.Region].Price
elbPrice := awslib.ELBMetadatas[clusterConfig.Region].Price
natUnitPrice := awslib.NATMetadatas[clusterConfig.Region].Price

var loadBalancersPrice float64
usesELBForAPILoadBalancer := clusterConfig.APILoadBalancerType == clusterconfig.ELBLoadBalancerType
if usesELBForAPILoadBalancer {
loadBalancersPrice = nlbPrice + elbPrice
} else {
loadBalancersPrice = 2 * nlbPrice
}

headers := []table.Header{
{Title: "aws resource"},
{Title: "cost per hour"},
Expand Down Expand Up @@ -1033,12 +1053,17 @@ func printInfoPricing(infoResponse *schema.InfoResponse, clusterConfig clusterco
} else if clusterConfig.NATGateway == clusterconfig.HighlyAvailableNATGateway {
natTotalPrice = natUnitPrice * float64(len(clusterConfig.AvailabilityZones))
}
totalPrice := eksPrice + totalNodeGroupsPrice + operatorNodeGroupPrice + prometheusNodeGroupPrice + nlbPrice*2 + natTotalPrice
totalPrice := eksPrice + totalNodeGroupsPrice + operatorNodeGroupPrice + prometheusNodeGroupPrice + loadBalancersPrice + natTotalPrice
fmt.Printf(console.Bold("\nyour cluster currently costs %s per hour\n\n"), s.DollarsAndCents(totalPrice))

rows = append(rows, []interface{}{fmt.Sprintf("%d t3.medium %s (cortex system)", len(infoResponse.OperatorNodeInfos), s.PluralS("instance", len(infoResponse.OperatorNodeInfos))), s.DollarsAndTenthsOfCents(operatorNodeGroupPrice) + " total"})
rows = append(rows, []interface{}{fmt.Sprintf("1 %s instance (prometheus)", clusterConfig.PrometheusInstanceType), s.DollarsAndTenthsOfCents(prometheusNodeGroupPrice)})
rows = append(rows, []interface{}{"2 network load balancers", s.DollarsMaxPrecision(nlbPrice*2) + " total"})
if usesELBForAPILoadBalancer {
rows = append(rows, []interface{}{"1 network load balancer", s.DollarsMaxPrecision(nlbPrice)})
rows = append(rows, []interface{}{"1 classic load balancer", s.DollarsMaxPrecision(elbPrice)})
} else {
rows = append(rows, []interface{}{"2 network load balancers", s.DollarsMaxPrecision(loadBalancersPrice) + " total"})
}

if clusterConfig.NATGateway == clusterconfig.SingleNATGateway {
rows = append(rows, []interface{}{"1 nat gateway", s.DollarsMaxPrecision(natUnitPrice)})
Expand Down Expand Up @@ -1366,7 +1391,24 @@ func (lb LoadBalancer) String() string {
}

// Will return error if the load balancer can't be found
func getLoadBalancer(clusterName string, whichLB LoadBalancer, awsClient *awslib.Client) (*elbv2.LoadBalancer, error) {
// getNLBLoadBalancer looks up, through awsClient.FindLoadBalancerV2, the load
// balancer tagged with the given cluster name and the whichLB name.
// A failed lookup is returned wrapped with the load balancer name; a lookup
// that succeeds but finds nothing is reported as an error as well.
func getNLBLoadBalancer(clusterName string, whichLB LoadBalancer, awsClient *awslib.Client) (*elbv2.LoadBalancer, error) {
	lbName := whichLB.String()
	tags := map[string]string{
		clusterconfig.ClusterNameTag: clusterName,
		"cortex.dev/load-balancer":   lbName,
	}

	found, err := awsClient.FindLoadBalancerV2(tags)
	switch {
	case err != nil:
		return nil, errors.Wrap(err, fmt.Sprintf("unable to locate %s load balancer", lbName))
	case found == nil:
		return nil, ErrorNoOperatorLoadBalancer(lbName)
	}

	return found, nil
}

// Will return error if the load balancer can't be found
func getELBLoadBalancer(clusterName string, whichLB LoadBalancer, awsClient *awslib.Client) (*elb.LoadBalancerDescription, error) {
loadBalancer, err := awsClient.FindLoadBalancer(map[string]string{
clusterconfig.ClusterNameTag: clusterName,
"cortex.dev/load-balancer": whichLB.String(),
Expand Down
18 changes: 16 additions & 2 deletions cli/cmd/lib_cluster_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -169,8 +169,17 @@ func confirmInstallClusterConfig(clusterConfig *clusterconfig.Config, awsClient
prometheusEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp3"].PriceGB * 20 / 30 / 24
metricsEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp2"].PriceGB * (40 + 2) / 30 / 24
nlbPrice := aws.NLBMetadatas[clusterConfig.Region].Price
elbPrice := aws.ELBMetadatas[clusterConfig.Region].Price
natUnitPrice := aws.NATMetadatas[clusterConfig.Region].Price

var loadBalancersPrice float64
usesELBForAPILoadBalancer := clusterConfig.APILoadBalancerType == clusterconfig.ELBLoadBalancerType
if usesELBForAPILoadBalancer {
loadBalancersPrice = nlbPrice + elbPrice
} else {
loadBalancersPrice = 2 * nlbPrice
}

var natTotalPrice float64
if clusterConfig.NATGateway == clusterconfig.SingleNATGateway {
natTotalPrice = natUnitPrice
Expand All @@ -187,7 +196,7 @@ func confirmInstallClusterConfig(clusterConfig *clusterconfig.Config, awsClient
rows = append(rows, []interface{}{"1 eks cluster", s.DollarsMaxPrecision(eksPrice)})

ngNameToSpotInstancesUsed := map[string]int{}
fixedPrice := eksPrice + 2*(operatorInstancePrice+operatorEBSPrice) + prometheusInstancePrice + prometheusEBSPrice + metricsEBSPrice + 2*nlbPrice + natTotalPrice
fixedPrice := eksPrice + 2*(operatorInstancePrice+operatorEBSPrice) + prometheusInstancePrice + prometheusEBSPrice + metricsEBSPrice + loadBalancersPrice + natTotalPrice
totalMinPrice := fixedPrice
totalMaxPrice := fixedPrice
for _, ng := range clusterConfig.NodeGroups {
Expand Down Expand Up @@ -236,7 +245,12 @@ func confirmInstallClusterConfig(clusterConfig *clusterconfig.Config, awsClient
prometheusNodeGroupPrice := prometheusInstancePrice + prometheusEBSPrice + metricsEBSPrice
rows = append(rows, []interface{}{"2 t3.medium instances (cortex system)", s.DollarsAndTenthsOfCents(operatorNodeGroupPrice) + " total"})
rows = append(rows, []interface{}{fmt.Sprintf("1 %s instance (prometheus)", clusterConfig.PrometheusInstanceType), s.DollarsAndTenthsOfCents(prometheusNodeGroupPrice)})
rows = append(rows, []interface{}{"2 network load balancers", s.DollarsMaxPrecision(nlbPrice) + " each"})
if usesELBForAPILoadBalancer {
rows = append(rows, []interface{}{"1 network load balancer", s.DollarsMaxPrecision(nlbPrice)})
rows = append(rows, []interface{}{"1 classic load balancer", s.DollarsMaxPrecision(elbPrice)})
} else {
rows = append(rows, []interface{}{"2 network load balancers", s.DollarsMaxPrecision(loadBalancersPrice) + " total"})
}

if clusterConfig.NATGateway == clusterconfig.SingleNATGateway {
rows = append(rows, []interface{}{"1 nat gateway", s.DollarsMaxPrecision(natUnitPrice)})
Expand Down
3 changes: 3 additions & 0 deletions docs/clusters/management/create.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,9 @@ subnet_visibility: public
# NAT gateway (required when using private subnets) [none | single | highly_available (a NAT gateway per availability zone)]
nat_gateway: none

# API load balancer type [nlb | elb]
api_load_balancer_type: nlb

# API load balancer scheme [internet-facing | internal]
api_load_balancer_scheme: internet-facing

Expand Down
4 changes: 4 additions & 0 deletions docs/clusters/management/production.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,10 @@ operator_load_balancer_cidr_white_list: [0.0.0.0/0]

See [here](../networking/load-balancers.md) for more information about the load balancers.

### Workload load-balancing

Depending on your application's requirements, you might have different needs from the cluster's API load balancer. By default, the API load balancer is a [Network load balancer](https://docs.aws.amazon.com/elasticloadbalancing/latest/network/introduction.html) (NLB). In some situations, a [Classic load balancer](https://docs.aws.amazon.com/elasticloadbalancing/latest/classic/introduction.html) (ELB) may be preferred, and can be selected in your cluster config by setting `api_load_balancer_type: elb`. This selection can only be made before creating your cluster.

### Ensure node provisioning

You can take advantage of the cost savings of spot instances and the reliability of on-demand instances by utilizing the `priority` field in node groups. You can deploy two node groups, one that is spot and another that is on-demand. Set the priority of the spot node group to be higher than the priority of the on-demand node group. This encourages the cluster-autoscaler to try to spin up instances from the spot node group first. If there are no more spot instances available, the on-demand node group will be used instead.
Expand Down
2 changes: 2 additions & 0 deletions docs/clusters/networking/load-balancers.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,5 @@ All APIs share a single API load balancer. By default, the API load balancer is
The SSL certificate on the API load balancer is autogenerated during installation using `localhost` as the Common Name (CN). Therefore, clients will need to skip certificate verification when making HTTPS requests to your APIs (e.g. `curl -k https://***`), or make HTTP requests instead (e.g. `curl http://***`). Alternatively, you can enable HTTPS by using a [custom domain](custom-domain.md) and setting up [https](https.md) or by [creating an API Gateway](api-gateway.md) to forward requests to your API load balancer.

There is a separate load balancer for the Cortex operator. By default, the operator load balancer is public. You can configure your operator load balancer to be private by setting `operator_load_balancer_scheme: internal` in your cluster configuration file (before creating your cluster). You can use [VPC Peering](vpc-peering.md) to enable your Cortex CLI to connect to your cluster operator from another VPC. You can enforce that incoming requests to the Cortex operator must originate from specific ip address ranges by specifying `operator_load_balancer_cidr_white_list: [<CIDR list>]` in your cluster configuration.

By default, the API load balancer and the operator load balancer are both [Network load balancers](https://docs.aws.amazon.com/elasticloadbalancing/latest/network/introduction.html) (NLB). The API load balancer can instead be configured as a [Classic load balancer](https://docs.aws.amazon.com/elasticloadbalancing/latest/classic/introduction.html) (ELB) by setting `api_load_balancer_type: elb` in your cluster configuration file. The API load balancer type must be specified before creating your cluster.
2 changes: 1 addition & 1 deletion docs/overview.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ Cortex uses the Kubernetes Cluster Autoscaler to scale the appropriate node grou

By default, a new dedicated VPC is created for the cluster during installation.

Two network load balancers (NLBs) are created to route traffic to the cluster. One load balancer is dedicated for traffic to your APIs, and the other load balancer is dedicated for API management requests to Cortex from your CLI or Python client. Traffic to the load balancers can be secured and restricted based on your cluster configuration.
Two AWS load balancers are created to route traffic to the cluster. One load balancer is dedicated for traffic to your APIs, and the other load balancer is dedicated for API management requests to Cortex from your CLI or Python client. Traffic to the load balancers can be secured and restricted based on your cluster configuration.

### Observability

Expand Down
17 changes: 11 additions & 6 deletions manager/get_api_load_balancer_state.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,17 +15,22 @@
import boto3
import os

from helpers import get_api_load_balancer
from helpers import get_api_load_balancer_v2, get_api_load_balancer, get_api_load_balancer_health


def get_api_load_balancer_state():
cluster_name = os.environ["CORTEX_CLUSTER_NAME"]
region = os.environ["CORTEX_REGION"]

client_elbv2 = boto3.client("elbv2", region_name=region)

load_balancer = get_api_load_balancer(cluster_name, client_elbv2)
return load_balancer["State"]["Code"]
load_balancer_type = os.environ["CORTEX_API_LOAD_BALANCER_TYPE"]

if load_balancer_type == "nlb":
client_elbv2 = boto3.client("elbv2", region_name=region)
load_balancer = get_api_load_balancer_v2(cluster_name, client_elbv2)
return load_balancer["State"]["Code"]
else:
client_elb = boto3.client("elb", region_name=region)
load_balancer = get_api_load_balancer(cluster_name, client_elb)
return get_api_load_balancer_health(load_balancer["LoadBalancerName"], client_elb)


if __name__ == "__main__":
Expand Down
4 changes: 2 additions & 2 deletions manager/get_operator_load_balancer_state.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import boto3
import os

from helpers import get_operator_load_balancer
from helpers import get_operator_load_balancer_v2


def get_operator_load_balancer_state():
Expand All @@ -24,7 +24,7 @@ def get_operator_load_balancer_state():

client_elbv2 = boto3.client("elbv2", region_name=region)

load_balancer = get_operator_load_balancer(cluster_name, client_elbv2)
load_balancer = get_operator_load_balancer_v2(cluster_name, client_elbv2)
return load_balancer["State"]["Code"]


Expand Down
4 changes: 2 additions & 2 deletions manager/get_operator_target_group_status.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import os
import json

from helpers import get_operator_load_balancer
from helpers import get_operator_load_balancer_v2


def get_operator_target_group_status():
Expand All @@ -25,7 +25,7 @@ def get_operator_target_group_status():

client_elbv2 = boto3.client("elbv2", region_name=region)

load_balancer_arn = get_operator_load_balancer(cluster_name, client_elbv2)["LoadBalancerArn"]
load_balancer_arn = get_operator_load_balancer_v2(cluster_name, client_elbv2)["LoadBalancerArn"]
target_group_arn = get_load_balancer_https_target_group_arn(load_balancer_arn, client_elbv2)
return get_target_health(target_group_arn, client_elbv2)

Expand Down
48 changes: 43 additions & 5 deletions manager/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,29 @@
# limitations under the License.


def get_operator_load_balancer(cluster_name, client_elbv2):
return _get_load_balancer("operator", cluster_name, client_elbv2)
def get_operator_load_balancer_v2(cluster_name, client_elbv2):
    """Delegate to _get_load_balancer_v2 using the "operator" load balancer tag."""
    return _get_load_balancer_v2(
        load_balancer_tag="operator",
        cluster_name=cluster_name,
        client_elbv2=client_elbv2,
    )


def get_api_load_balancer(cluster_name, client_elbv2):
return _get_load_balancer("api", cluster_name, client_elbv2)
def get_api_load_balancer_v2(cluster_name, client_elbv2):
    """Delegate to _get_load_balancer_v2 using the "api" load balancer tag."""
    return _get_load_balancer_v2(
        load_balancer_tag="api",
        cluster_name=cluster_name,
        client_elbv2=client_elbv2,
    )


def _get_load_balancer(load_balancer_tag, cluster_name, client_elbv2):
def get_api_load_balancer(cluster_name, client_elb):
    """Delegate to _get_load_balancer using the "api" load balancer tag."""
    return _get_load_balancer(
        load_balancer_tag="api",
        cluster_name=cluster_name,
        client_elb=client_elb,
    )


def get_api_load_balancer_health(load_balancer_name, client_elb):
    """Summarize the instance health reported for a load balancer.

    Calls client_elb.describe_instance_health for load_balancer_name and
    returns "active" when every entry of "InstanceStates" has the state
    "InService" (this includes an empty list), otherwise "inactive".
    """
    response = client_elb.describe_instance_health(LoadBalancerName=load_balancer_name)
    all_in_service = all(
        entry["State"] == "InService" for entry in response["InstanceStates"]
    )
    return "active" if all_in_service else "inactive"


def _get_load_balancer_v2(load_balancer_tag, cluster_name, client_elbv2):
paginator = client_elbv2.get_paginator("describe_load_balancers")
for load_balancer_page in paginator.paginate(PaginationConfig={"PageSize": 20}):
load_balancers = {
Expand All @@ -43,3 +57,27 @@ def _get_load_balancer(load_balancer_tag, cluster_name, client_elbv2):
return load_balancers[tag_description["ResourceArn"]]

raise Exception(f"unable to find {load_balancer_tag} load balancer")


def _get_load_balancer(load_balancer_tag, cluster_name, client_elb):
paginator = client_elb.get_paginator("describe_load_balancers")
for load_balancer_page in paginator.paginate(PaginationConfig={"PageSize": 20}):
load_balancers = {
load_balancer["LoadBalancerName"]: load_balancer
for load_balancer in load_balancer_page["LoadBalancerDescriptions"]
}
tag_descriptions = client_elb.describe_tags(LoadBalancerNames=list(load_balancers.keys()))[
"TagDescriptions"
]
for tag_description in tag_descriptions:
foundClusterNameTag = False
foundLoadBalancerTag = False
for tags in tag_description["Tags"]:
if tags["Key"] == "cortex.dev/cluster-name" and tags["Value"] == cluster_name:
foundClusterNameTag = True
if tags["Key"] == "cortex.dev/load-balancer" and tags["Value"] == load_balancer_tag:
foundLoadBalancerTag = True
if foundClusterNameTag and foundLoadBalancerTag:
return load_balancers[tag_description["LoadBalancerName"]]

raise Exception(f"unable to find {load_balancer_tag} load balancer")
2 changes: 1 addition & 1 deletion manager/manifests/istio.yaml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ spec:
istio: ingressgateway-apis
k8s:
serviceAnnotations:
service.beta.kubernetes.io/aws-load-balancer-type: "nlb"
service.beta.kubernetes.io/aws-load-balancer-type: "{{ env['CORTEX_API_LOAD_BALANCER_TYPE'] }}"
service.beta.kubernetes.io/aws-load-balancer-cross-zone-load-balancing-enabled: "true"
service.beta.kubernetes.io/aws-load-balancer-additional-resource-tags: "{{ env['CORTEX_API_LOAD_BALANCER_TAGS'] }}"
service.beta.kubernetes.io/aws-load-balancer-backend-protocol: "tcp"
Expand Down
Loading