Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[NET-6581] perf: Fetch services once rather than per-node on deregister #3322

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .changelog/3322.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:improvement
control-plane: reduce Consul Catalog API requests required for endpoints reconcile in large clusters
```
Original file line number Diff line number Diff line change
Expand Up @@ -933,70 +933,68 @@ func getHealthCheckStatusReason(healthCheckStatus, podName, podNamespace string)
// has addresses, it will only deregister instances not in the map.
func (r *Controller) deregisterService(apiClient *api.Client, k8sSvcName, k8sSvcNamespace string, endpointsAddressesMap map[string]bool) error {
// Get services matching metadata from Consul
nodesWithSvcs, err := r.serviceInstancesForNodes(apiClient, k8sSvcName, k8sSvcNamespace)
serviceInstances, err := r.serviceInstances(apiClient, k8sSvcName, k8sSvcNamespace)
if err != nil {
r.Log.Error(err, "failed to get service instances", "name", k8sSvcName)
return err
}

var errs error
for _, nodeSvcs := range nodesWithSvcs {
for _, svc := range nodeSvcs.Services {
for _, svc := range serviceInstances {
// We need to get services matching "k8s-service-name" and "k8s-namespace" metadata.
// If we selectively deregister, only deregister if the address is not in the map. Otherwise, deregister
// every service instance.
var serviceDeregistered bool
if endpointsAddressesMap != nil {
if _, ok := endpointsAddressesMap[svc.Address]; !ok {
if _, ok := endpointsAddressesMap[svc.ServiceAddress]; !ok {
// If the service address is not in the Endpoints addresses, deregister it.
r.Log.Info("deregistering service from consul", "svc", svc.ID)
r.Log.Info("deregistering service from consul", "svc", svc.ServiceID)
_, err := apiClient.Catalog().Deregister(&api.CatalogDeregistration{
Node: nodeSvcs.Node.Node,
ServiceID: svc.ID,
Node: svc.Node,
ServiceID: svc.ServiceID,
Namespace: svc.Namespace,
}, nil)
if err != nil {
// Do not exit right away as there might be other services that need to be deregistered.
r.Log.Error(err, "failed to deregister service instance", "id", svc.ID)
r.Log.Error(err, "failed to deregister service instance", "id", svc.ServiceID)
errs = multierror.Append(errs, err)
} else {
serviceDeregistered = true
}
}
} else {
r.Log.Info("deregistering service from consul", "svc", svc.ID)
r.Log.Info("deregistering service from consul", "svc", svc.ServiceID)
_, err := apiClient.Catalog().Deregister(&api.CatalogDeregistration{
Node: nodeSvcs.Node.Node,
ServiceID: svc.ID,
Node: svc.Node,
ServiceID: svc.ServiceID,
Namespace: svc.Namespace,
}, nil)
if err != nil {
// Do not exit right away as there might be other services that need to be deregistered.
r.Log.Error(err, "failed to deregister service instance", "id", svc.ID)
r.Log.Error(err, "failed to deregister service instance", "id", svc.ServiceID)
errs = multierror.Append(errs, err)
} else {
serviceDeregistered = true
}
}

if r.AuthMethod != "" && serviceDeregistered {
r.Log.Info("reconciling ACL tokens for service", "svc", svc.Service)
err := r.deleteACLTokensForServiceInstance(apiClient, svc, k8sSvcNamespace, svc.Meta[constants.MetaKeyPodName], svc.Meta[constants.MetaKeyPodUID])
r.Log.Info("reconciling ACL tokens for service", "svc", svc.ServiceName)
err := r.deleteACLTokensForServiceInstance(apiClient, svc, k8sSvcNamespace, svc.ServiceMeta[constants.MetaKeyPodName], svc.ServiceMeta[constants.MetaKeyPodUID])
if err != nil {
r.Log.Error(err, "failed to reconcile ACL tokens for service", "svc", svc.Service)
r.Log.Error(err, "failed to reconcile ACL tokens for service", "svc", svc.ServiceName)
errs = multierror.Append(errs, err)
}
}

if serviceDeregistered {
err = r.deregisterNode(apiClient, nodeSvcs.Node.Node)
err = r.deregisterNode(apiClient, svc.Node)
if err != nil {
r.Log.Error(err, "failed to deregister node", "svc", svc.Service)
r.Log.Error(err, "failed to deregister node", "svc", svc.ServiceName)
errs = multierror.Append(errs, err)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Most of the above code is unchanged. The GH diff is easier to read with ?w=1.

The changes are:

  • Field swaps between the old and new API response objects (see conversion fn here, called in the old endpoint implementation here)
    • CatalogNodeServiceList.Node.Node -> ServiceNode.Node
    • NodeService.ID -> ServiceNode.ServiceID
    • NodeService.Service -> ServiceNode.ServiceName
    • NodeService.Meta -> ServiceNode.ServiceMeta
    • NodeService.Address -> ServiceNode.ServiceAddress
  • Raising the inner loop / removing the per-node outer loop

Here's a visual of the diff of the two models read locally from a multi-node kind cluster (the sidecar proxy is similar and carries identical NodeMeta and ServiceMeta, which is what we filter on):
image

}
}
}
}

return errs

Expand Down Expand Up @@ -1036,7 +1034,7 @@ func (r *Controller) deregisterNode(apiClient *api.Client, nodeName string) erro
// deleteACLTokensForServiceInstance finds the ACL tokens that belongs to the service instance and deletes it from Consul.
// It will only check for ACL tokens that have been created with the auth method this controller
// has been configured with and will only delete tokens for the provided podName and podUID.
func (r *Controller) deleteACLTokensForServiceInstance(apiClient *api.Client, svc *api.AgentService, k8sNS, podName, podUID string) error {
func (r *Controller) deleteACLTokensForServiceInstance(apiClient *api.Client, svc *api.CatalogService, k8sNS, podName, podUID string) error {
// Skip if podName is empty.
if podName == "" {
return nil
Expand All @@ -1049,7 +1047,7 @@ func (r *Controller) deleteACLTokensForServiceInstance(apiClient *api.Client, sv
// matches as well.
tokens, _, err := apiClient.ACL().TokenListFiltered(
api.ACLTokenFilterOptions{
ServiceName: svc.Service,
ServiceName: svc.ServiceName,
},
&api.QueryOptions{
Namespace: svc.Namespace,
Expand All @@ -1064,7 +1062,7 @@ func (r *Controller) deleteACLTokensForServiceInstance(apiClient *api.Client, sv
// * have a single service identity whose service name is the same as 'svc.Service'
if token.AuthMethod == r.AuthMethod &&
len(token.ServiceIdentities) == 1 &&
token.ServiceIdentities[0].ServiceName == svc.Service {
token.ServiceIdentities[0].ServiceName == svc.ServiceName {
tokenMeta, err := getTokenMetaFromDescription(token.Description)
if err != nil {
return fmt.Errorf("failed to parse token metadata: %s", err)
Expand Down Expand Up @@ -1162,49 +1160,100 @@ func getTokenMetaFromDescription(description string) (map[string]string, error)
return tokenMeta, nil
}

func (r *Controller) serviceInstancesForNodes(apiClient *api.Client, k8sServiceName, k8sServiceNamespace string) ([]*api.CatalogNodeServiceList, error) {
var serviceList []*api.CatalogNodeServiceList
func (r *Controller) serviceInstances(apiClient *api.Client, k8sServiceName, k8sServiceNamespace string) ([]*api.CatalogService, error) {
var (
instances []*api.CatalogService
errs error
)

// The nodelist may have changed between this point and when the event was raised
// For example, if a pod is evicted because a node has been deleted, there is no guarantee that that node will show up here
// query consul catalog for a list of nodes supporting this service
// quite a lot of results as synthetic nodes are never deregistered.
var nodes []*api.Node
filter := fmt.Sprintf(`Meta[%q] == %q `, "synthetic-node", "true")
nodes, _, err := apiClient.Catalog().Nodes(&api.QueryOptions{Filter: filter, Namespace: namespaces.WildcardNamespace})
// Get the names of services that have the provided k8sServiceName and k8sServiceNamespace in their metadata.
// This is necessary so that we can then list the service instances for each Consul service name, which may
// not match the K8s service name.
services, err := r.servicesForK8SServiceNameAndNamespace(apiClient, k8sServiceName, k8sServiceNamespace)
if err != nil {
r.Log.Error(err, "failed to get catalog services", "name", k8sServiceName)
return nil, err
}

var errs error
for _, node := range nodes {
var nodeServices *api.CatalogNodeServiceList
nodeServices, err := r.serviceInstancesForK8SServiceNameAndNamespace(apiClient, k8sServiceName, k8sServiceNamespace, node.Node)
// Query consul catalog for a list of service instances matching the given service names.
filter := fmt.Sprintf(`NodeMeta[%q] == %q and ServiceMeta[%q] == %q and ServiceMeta[%q] == %q and ServiceMeta[%q] == %q`,
zalimeni marked this conversation as resolved.
Show resolved Hide resolved
metaKeySyntheticNode, "true",
metaKeyKubeServiceName, k8sServiceName,
constants.MetaKeyKubeNS, k8sServiceNamespace,
metaKeyManagedBy, constants.ManagedByValue)
for _, service := range services {
var is []*api.CatalogService
// Always query the default NS. This ensures that we include mesh gateways, which are always registered to the default NS.
// It also ensures that during migrations that enable namespaces, we deregister old service instances in the default NS.
// An alternative approach to this dual query would be a service instance list using the wildcard NS, which would also
// include instances in Consul namespaces that are no longer in use (e.g. configuration change); this capability does
// not currently exist in Consul's catalog API (as of 1.17) and would need to first be added.
//
// This request uses the service index of the services table (does not perform a full table scan), then decorates each
// result with a single node fetched by ID index from the nodes table.
is, _, err = apiClient.Catalog().Service(service, "", &api.QueryOptions{Filter: filter})
if err != nil {
errs = multierror.Append(errs, err)
} else {
serviceList = append(serviceList, nodeServices)
instances = append(instances, is...)
}
// If namespaces are enabled a non-default NS is targeted, also query by target Consul NS.
if r.EnableConsulNamespaces {
nonDefaultNamespace := namespaces.NonDefaultConsulNamespace(r.consulNamespace(k8sServiceNamespace))
if nonDefaultNamespace != "" {
hashi-derek marked this conversation as resolved.
Show resolved Hide resolved
is, _, err = apiClient.Catalog().Service(service, "", &api.QueryOptions{Filter: filter, Namespace: nonDefaultNamespace})
if err != nil {
errs = multierror.Append(errs, err)
} else {
instances = append(instances, is...)
}
}
}
}

return serviceList, errs
return instances, errs
}

// serviceInstancesForK8SServiceNameAndNamespace calls Consul's ServicesWithFilter to get the list
// of services instances that have the provided k8sServiceName and k8sServiceNamespace in their metadata.
func (r *Controller) serviceInstancesForK8SServiceNameAndNamespace(apiClient *api.Client, k8sServiceName, k8sServiceNamespace, nodeName string) (*api.CatalogNodeServiceList, error) {
// servicesForK8SServiceNameAndNamespace calls Consul's Services to get the list
// of services that have the provided k8sServiceName and k8sServiceNamespace in their metadata.
func (r *Controller) servicesForK8SServiceNameAndNamespace(apiClient *api.Client, k8sServiceName, k8sServiceNamespace string) ([]string, error) {
var (
serviceList *api.CatalogNodeServiceList
services map[string][]string
err error
)
filter := fmt.Sprintf(`Meta[%q] == %q and Meta[%q] == %q and Meta[%q] == %q`,
filter := fmt.Sprintf(`ServiceMeta[%q] == %q and ServiceMeta[%q] == %q and ServiceMeta[%q] == %q`,
Comment on lines -1200 to +1224
Copy link
Member Author

@zalimeni zalimeni Dec 15, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same filter, different model field. I checked agent/consul/catalog_endpoint.go and Meta is not currently used in an index lookup in the previously used /nodes and /node-services/{node} Catalog API endpoints; both are a bexp post-filter. This means that we are not trading an indexed query for a non-indexed one when it comes to the synthetic node or service name filtering.

If we find the new queries are not performant enough, we have the option of adding an index for ServiceMeta or a new purpose-fit endpoint.

metaKeyKubeServiceName, k8sServiceName, constants.MetaKeyKubeNS, k8sServiceNamespace, metaKeyManagedBy, constants.ManagedByValue)
// Always query the default NS. This ensures that we cover CE->Ent upgrades where services were previously
// in the default NS, as well as mesh gateways, which are always registered to the default NS.
//
// This request performs a NS-bound scan of the services table. If needed in the future, its performance
// could be improved by adding an index on ServiceMeta to Consul's state store.
services, _, err = apiClient.Catalog().Services(&api.QueryOptions{Filter: filter})
if err != nil {
return nil, err
}
// If namespaces are enabled a non-default NS is targeted, also query by target Consul NS.
if r.EnableConsulNamespaces {
serviceList, _, err = apiClient.Catalog().NodeServiceList(nodeName, &api.QueryOptions{Filter: filter, Namespace: namespaces.WildcardNamespace})
} else {
serviceList, _, err = apiClient.Catalog().NodeServiceList(nodeName, &api.QueryOptions{Filter: filter})
nonDefaultNamespace := namespaces.NonDefaultConsulNamespace(r.consulNamespace(k8sServiceNamespace))
if nonDefaultNamespace != "" {
ss, _, err := apiClient.Catalog().Services(&api.QueryOptions{Filter: filter, Namespace: nonDefaultNamespace})
if err != nil {
return nil, err
}
// Add to existing map to deduplicate.
for s := range ss {
services[s] = nil // We don't use the tags, so just set to nil
}
}
}

// Return just the service name keys (we don't need the tags)
// https://developer.hashicorp.com/consul/api-docs/catalog#list-services
var serviceNames []string
for s := range services {
serviceNames = append(serviceNames, s)
}
return serviceList, err
return serviceNames, err
}

// processPreparedQueryUpstream processes an upstream in the format:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1954,7 +1954,7 @@ func TestReconcileDeleteGatewayWithNamespaces(t *testing.T) {
{
ID: "mesh-gateway",
Kind: api.ServiceKindMeshGateway,
Service: "mesh-gateway",
Service: consulSvcName,
Port: 80,
Address: "1.2.3.4",
Meta: map[string]string{
Expand Down Expand Up @@ -1985,7 +1985,7 @@ func TestReconcileDeleteGatewayWithNamespaces(t *testing.T) {
{
ID: "mesh-gateway",
Kind: api.ServiceKindMeshGateway,
Service: "mesh-gateway",
Service: consulSvcName,
Port: 80,
Address: "1.2.3.4",
Meta: map[string]string{
Expand Down Expand Up @@ -2016,7 +2016,7 @@ func TestReconcileDeleteGatewayWithNamespaces(t *testing.T) {
{
ID: "terminating-gateway",
Kind: api.ServiceKindTerminatingGateway,
Service: "terminating-gateway",
Service: consulSvcName,
Port: 8443,
Address: "1.2.3.4",
Meta: map[string]string{
Expand All @@ -2037,7 +2037,7 @@ func TestReconcileDeleteGatewayWithNamespaces(t *testing.T) {
{
ID: "terminating-gateway",
Kind: api.ServiceKindTerminatingGateway,
Service: "terminating-gateway",
Service: consulSvcName,
Port: 8443,
Address: "1.2.3.4",
Meta: map[string]string{
Expand Down Expand Up @@ -2089,7 +2089,7 @@ func TestReconcileDeleteGatewayWithNamespaces(t *testing.T) {
{
ID: "ingress-gateway",
Kind: api.ServiceKindIngressGateway,
Service: "ingress-gateway",
Service: consulSvcName,
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here and above: this test was specifying the wrong service name vs. the one checked in assertions that instances are deregistered. This turns the false negatives into true negatives.

Port: 80,
Address: "1.2.3.4",
Meta: map[string]string{
Expand Down Expand Up @@ -2184,6 +2184,7 @@ func TestReconcileDeleteGatewayWithNamespaces(t *testing.T) {
ReleaseName: "consul",
ReleaseNamespace: "default",
EnableConsulNamespaces: true,
ConsulDestinationNamespace: ts.ConsulNS,
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This last line addition of ConsulDestinationNamespace was needed so that the replacement of the wildcard NS search w/ a specific NS would simulate deregistration for the non-default NS in use.

}
if tt.enableACLs {
ep.AuthMethod = test.AuthMethod
Expand Down Expand Up @@ -2216,6 +2217,7 @@ func TestReconcileDeleteGatewayWithNamespaces(t *testing.T) {
}

token, _, err = consulClient.ACL().TokenRead(token.AccessorID, queryOpts)
require.Error(t, err)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Panic fix when test fails on no error

require.Contains(t, err.Error(), "ACL not found", token)
}
})
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4575,7 +4575,7 @@ func TestServiceInstancesForK8SServiceNameAndNamespace(t *testing.T) {
name string
k8sServiceNameMeta string
k8sNamespaceMeta string
expected []*api.AgentService
expected []*api.CatalogService
}{
{
"no k8s service name or namespace meta",
Expand All @@ -4599,22 +4599,21 @@ func TestServiceInstancesForK8SServiceNameAndNamespace(t *testing.T) {
"both k8s service name and namespace set",
k8sSvc,
k8sNS,
[]*api.AgentService{
[]*api.CatalogService{
{
ID: "foo1",
Service: "foo",
Meta: map[string]string{"k8s-service-name": k8sSvc, "k8s-namespace": k8sNS},
ServiceName: "foo",
ServiceMeta: map[string]string{"k8s-service-name": k8sSvc, "k8s-namespace": k8sNS},
},
{
Kind: api.ServiceKindConnectProxy,
ID: "foo1-proxy",
Service: "foo-sidecar-proxy",
Port: 20000,
Proxy: &api.AgentServiceConnectProxyConfig{
ServiceName: "foo-sidecar-proxy",
ServicePort: 20000,
ServiceProxy: &api.AgentServiceConnectProxyConfig{
DestinationServiceName: "foo",
DestinationServiceID: "foo1",
},
Meta: map[string]string{"k8s-service-name": k8sSvc, "k8s-namespace": k8sNS},
ServiceMeta: map[string]string{"k8s-service-name": k8sSvc, "k8s-namespace": k8sNS},
},
},
},
Expand Down Expand Up @@ -4683,14 +4682,14 @@ func TestServiceInstancesForK8SServiceNameAndNamespace(t *testing.T) {
}
ep := Controller{}

svcs, err := ep.serviceInstancesForK8SServiceNameAndNamespace(consulClient, k8sSvc, k8sNS, consulNodeName)
svcs, err := ep.serviceInstances(consulClient, k8sSvc, k8sNS)
require.NoError(t, err)
if len(svcs.Services) > 0 {
if len(svcs) > 0 {
require.Len(t, svcs, 2)
require.NotNil(t, c.expected[0], svcs.Services[0])
require.Equal(t, c.expected[0].Service, svcs.Services[0].Service)
require.NotNil(t, c.expected[1], svcs.Services[1])
require.Equal(t, c.expected[1].Service, svcs.Services[1].Service)
require.NotNil(t, svcs[0], c.expected[0])
require.Equal(t, c.expected[0].ServiceName, svcs[0].ServiceName)
require.NotNil(t, svcs[1], c.expected[1])
require.Equal(t, c.expected[1].ServiceName, svcs[1].ServiceName)
Comment on lines +4689 to +4692
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Swapped the nil checks here since in that call, the first argument is what should be verified, not the expected value (informational)

}
})
}
Expand Down
9 changes: 9 additions & 0 deletions control-plane/namespaces/namespaces.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,3 +93,12 @@ func ConsulNamespace(kubeNS string, enableConsulNamespaces bool, consulDestNS st

return consulDestNS
}

// NonDefaultConsulNamespace returns the given Consul namespace if it is not default or empty.
// Otherwise, it returns the empty string.
func NonDefaultConsulNamespace(consulNS string) string {
if consulNS == "" || consulNS == DefaultNamespace {
return ""
}
return consulNS
}