From be39fb20cc51caf0d739fb5514254c979231fc3e Mon Sep 17 00:00:00 2001 From: Pierre Souchay Date: Wed, 7 Mar 2018 10:01:12 +0100 Subject: [PATCH 1/7] [BUGFIX] do not break when TCP DNS answer exceeds 64k It will avoid having discovery broken when having large number of instances of a service (works with SRV and A* records). Fixes https://github.com/hashicorp/consul/issues/3850 --- agent/dns.go | 60 ++++++++++++++++++++++---------- agent/dns_test.go | 88 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 130 insertions(+), 18 deletions(-) diff --git a/agent/dns.go b/agent/dns.go index b809a2b3a318..5011fbbbf118 100644 --- a/agent/dns.go +++ b/agent/dns.go @@ -713,6 +713,32 @@ func syncExtra(index map[string]dns.RR, resp *dns.Msg) { resp.Extra = extra } +// trimTCPResponse limit the MaximumSize of messages to 64k as it is the limit +// of DNS responses +func trimTCPResponse(req, resp *dns.Msg) (trimmed bool) { + hasExtra := len(resp.Extra) > 0 + maxSize := 65535 + + // We avoid some function calls and allocations by only handling the + // extra data when necessary. + var index map[string]dns.RR + if hasExtra { + index = make(map[string]dns.RR, len(resp.Extra)) + indexRRs(resp.Extra, index) + } + truncated := false + + // This enforces the given limit on 64k, the max limit for DNS messages + for len(resp.Answer) > 0 && resp.Len() > maxSize { + truncated = true + resp.Answer = resp.Answer[:len(resp.Answer)-1] + if hasExtra { + syncExtra(index, resp) + } + } + return truncated +} + // trimUDPResponse makes sure a UDP response is not longer than allowed by RFC // 1035. Enforce an arbitrary limit that can be further ratcheted down by // config, and then make sure the response doesn't exceed 512 bytes. Any extra @@ -765,6 +791,20 @@ func trimUDPResponse(req, resp *dns.Msg, udpAnswerLimit int) (trimmed bool) { return len(resp.Answer) < numAnswers } +// trimDNSResponse will trim the response for UDP and TCP +func (d *DNSServer) trimDNSResponse(network string, req, resp *dns.Msg) (trimmed bool) { + if network != "tcp" { + trimmed = trimUDPResponse(req, resp, d.config.UDPAnswerLimit) + } else { + trimmed = trimTCPResponse(req, resp) + } + // Flag that there are more records to return in the UDP response + if trimmed && d.config.EnableTruncate { + resp.Truncated = true + } + return trimmed +} + // lookupServiceNodes returns nodes with a given service. func (d *DNSServer) lookupServiceNodes(datacenter, service, tag string) (structs.IndexedCheckServiceNodes, error) { args := structs.ServiceSpecificRequest{ @@ -840,15 +880,7 @@ func (d *DNSServer) serviceLookup(network, datacenter, service, tag string, req, d.serviceNodeRecords(datacenter, out.Nodes, req, resp, ttl) } - // If the network is not TCP, restrict the number of responses - if network != "tcp" { - wasTrimmed := trimUDPResponse(req, resp, d.config.UDPAnswerLimit) - - // Flag that there are more records to return in the UDP response - if wasTrimmed && d.config.EnableTruncate { - resp.Truncated = true - } - } + d.trimDNSResponse(network, req, resp) // If the answer is empty and the response isn't truncated, return not found if len(resp.Answer) == 0 && !resp.Truncated { @@ -950,15 +982,7 @@ RPC: d.serviceNodeRecords(out.Datacenter, out.Nodes, req, resp, ttl) } - // If the network is not TCP, restrict the number of responses. - if network != "tcp" { - wasTrimmed := trimUDPResponse(req, resp, d.config.UDPAnswerLimit) - - // Flag that there are more records to return in the UDP response - if wasTrimmed && d.config.EnableTruncate { - resp.Truncated = true - } - } + d.trimDNSResponse(network, req, resp) // If the answer is empty and the response isn't truncated, return not found if len(resp.Answer) == 0 && !resp.Truncated { diff --git a/agent/dns_test.go b/agent/dns_test.go index d42abbeb7a06..18da89439fcd 100644 --- a/agent/dns_test.go +++ b/agent/dns_test.go @@ -2740,6 +2740,94 @@ func TestDNS_ServiceLookup_Randomize(t *testing.T) { } } +func TestDNS_TCP_and_UDP_Truncate(t *testing.T) { + t.Parallel() + a := NewTestAgent(t.Name(), ` + dns_config { + enable_truncate = true + } + `) + defer a.Shutdown() + + services := []string{"normal", "truncated"} + for index, service := range services { + numServices := (index * 5000) + 2 + for i := 1; i < numServices; i++ { + args := &structs.RegisterRequest{ + Datacenter: "dc1", + Node: fmt.Sprintf("%s-%d.acme.com", service, i), + Address: fmt.Sprintf("127.%d.%d.%d", index, (i / 255), i%255), + Service: &structs.NodeService{ + Service: service, + Port: 8000, + }, + } + + var out struct{} + if err := a.RPC("Catalog.Register", args, &out); err != nil { + t.Fatalf("err: %v", err) + } + } + + // Register an equivalent prepared query. + var id string + { + args := &structs.PreparedQueryRequest{ + Datacenter: "dc1", + Op: structs.PreparedQueryCreate, + Query: &structs.PreparedQuery{ + Name: service, + Service: structs.ServiceQuery{ + Service: service, + }, + }, + } + if err := a.RPC("PreparedQuery.Apply", args, &id); err != nil { + t.Fatalf("err: %v", err) + } + } + + // Look up the service directly and via prepared query. Ensure the + // response is truncated each time. + questions := []string{ + fmt.Sprintf("%s.service.consul.", service), + id + ".query.consul.", + } + protocols := []string{ + "tcp", + "udp", + } + for _, qType := range []uint16{dns.TypeANY, dns.TypeA, dns.TypeSRV} { + for _, question := range questions { + for _, protocol := range protocols { + t.Run(fmt.Sprintf("lookup %s %s (qType:=%d)", question, protocol, qType), func(t *testing.T) { + m := new(dns.Msg) + m.SetQuestion(question, dns.TypeANY) + if protocol == "udp" { + m.SetEdns0(8192, true) + } + c := new(dns.Client) + c.Net = protocol + in, out, err := c.Exchange(m, a.DNSAddr()) + if err != nil && err != dns.ErrTruncated { + t.Fatalf("err: %v", err) + } + + // Check for the truncate bit + shouldBeTruncated := numServices > 4095 + + if shouldBeTruncated != in.Truncated { + info := fmt.Sprintf("service %s question:=%s (%s) (%d total records) in %v", + service, question, protocol, numServices, out) + t.Fatalf("Should have truncate:=%v for %s", shouldBeTruncated, info) + } + }) + } + } + } + } +} + func TestDNS_ServiceLookup_Truncate(t *testing.T) { t.Parallel() a := NewTestAgent(t.Name(), ` From b77fd5ce9df2d1d46e27d91ecf9d385d21d6fa4f Mon Sep 17 00:00:00 2001 From: Pierre Souchay Date: Wed, 7 Mar 2018 16:14:36 +0100 Subject: [PATCH 2/7] 64000 max limit to DNS messages since there is overhead Added debug log to give information about truncation. --- agent/dns.go | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/agent/dns.go b/agent/dns.go index 5011fbbbf118..21e9e67140d0 100644 --- a/agent/dns.go +++ b/agent/dns.go @@ -715,9 +715,10 @@ func syncExtra(index map[string]dns.RR, resp *dns.Msg) { // trimTCPResponse limit the MaximumSize of messages to 64k as it is the limit // of DNS responses -func trimTCPResponse(req, resp *dns.Msg) (trimmed bool) { +func (d *DNSServer) trimTCPResponse(req, resp *dns.Msg) (trimmed bool) { hasExtra := len(resp.Extra) > 0 - maxSize := 65535 + // There is some overhead, 65535 does not work + maxSize := 64000 // We avoid some function calls and allocations by only handling the // extra data when necessary. @@ -726,6 +727,8 @@ func trimTCPResponse(req, resp *dns.Msg) (trimmed bool) { index = make(map[string]dns.RR, len(resp.Extra)) indexRRs(resp.Extra, index) } + originalSize := resp.Len() + originalNumRecords := len(resp.Answer) truncated := false // This enforces the given limit on 64k, the max limit for DNS messages @@ -736,6 +739,12 @@ func trimTCPResponse(req, resp *dns.Msg) (trimmed bool) { syncExtra(index, resp) } } + if truncated { + d.logger.Printf("[DEBUG] dns: TCP answer to %v too large truncated recs:=%d/%d, size:=%d/%d", + req.Question, + len(resp.Answer), originalNumRecords, resp.Len(), originalSize) + + } return truncated } @@ -796,7 +805,7 @@ func (d *DNSServer) trimDNSResponse(network string, req, resp *dns.Msg) (trimmed if network != "tcp" { trimmed = trimUDPResponse(req, resp, d.config.UDPAnswerLimit) } else { - trimmed = trimTCPResponse(req, resp) + trimmed = d.trimTCPResponse(req, resp) } // Flag that there are more records to return in the UDP response if trimmed && d.config.EnableTruncate { From 7d59249d96e5e2d24c423024174d27618f7ad478 Mon Sep 17 00:00:00 2001 From: Pierre Souchay Date: Wed, 7 Mar 2018 23:33:41 +0100 Subject: [PATCH 3/7] Avoid issue with compression of DNS messages causing overflow --- agent/dns.go | 7 ++++++- agent/dns_test.go | 47 +++++++++++++++++++++++++---------------------- 2 files changed, 31 insertions(+), 23 deletions(-) diff --git a/agent/dns.go b/agent/dns.go index 21e9e67140d0..2750ed6b02ad 100644 --- a/agent/dns.go +++ b/agent/dns.go @@ -718,7 +718,10 @@ func syncExtra(index map[string]dns.RR, resp *dns.Msg) { func (d *DNSServer) trimTCPResponse(req, resp *dns.Msg) (trimmed bool) { hasExtra := len(resp.Extra) > 0 // There is some overhead, 65535 does not work - maxSize := 64000 + maxSize := 65533 // 64k - 2 bytes + // In order to compute properly, we have to avoid compress first + compressed := resp.Compress + resp.Compress = false // We avoid some function calls and allocations by only handling the // extra data when necessary. @@ -745,6 +748,8 @@ func (d *DNSServer) trimTCPResponse(req, resp *dns.Msg) (trimmed bool) { len(resp.Answer), originalNumRecords, resp.Len(), originalSize) } + // Restore compression if any + resp.Compress = compressed return truncated } diff --git a/agent/dns_test.go b/agent/dns_test.go index 18da89439fcd..cf9571de05cc 100644 --- a/agent/dns_test.go +++ b/agent/dns_test.go @@ -2800,28 +2800,31 @@ func TestDNS_TCP_and_UDP_Truncate(t *testing.T) { for _, qType := range []uint16{dns.TypeANY, dns.TypeA, dns.TypeSRV} { for _, question := range questions { for _, protocol := range protocols { - t.Run(fmt.Sprintf("lookup %s %s (qType:=%d)", question, protocol, qType), func(t *testing.T) { - m := new(dns.Msg) - m.SetQuestion(question, dns.TypeANY) - if protocol == "udp" { - m.SetEdns0(8192, true) - } - c := new(dns.Client) - c.Net = protocol - in, out, err := c.Exchange(m, a.DNSAddr()) - if err != nil && err != dns.ErrTruncated { - t.Fatalf("err: %v", err) - } - - // Check for the truncate bit - shouldBeTruncated := numServices > 4095 - - if shouldBeTruncated != in.Truncated { - info := fmt.Sprintf("service %s question:=%s (%s) (%d total records) in %v", - service, question, protocol, numServices, out) - t.Fatalf("Should have truncate:=%v for %s", shouldBeTruncated, info) - } - }) + for _, compress := range []bool{true, false} { + t.Run(fmt.Sprintf("lookup %s %s (qType:=%d) compressed=%b", question, protocol, qType, compress), func(t *testing.T) { + m := new(dns.Msg) + m.SetQuestion(question, dns.TypeANY) + if protocol == "udp" { + m.SetEdns0(8192, true) + } + c := new(dns.Client) + c.Net = protocol + m.Compress = compress + in, out, err := c.Exchange(m, a.DNSAddr()) + if err != nil && err != dns.ErrTruncated { + t.Fatalf("err: %v", err) + } + + // Check for the truncate bit + shouldBeTruncated := numServices > 4095 + + if shouldBeTruncated != in.Truncated { + info := fmt.Sprintf("service %s question:=%s (%s) (%d total records) in %v", + service, question, protocol, numServices, out) + t.Fatalf("Should have truncate:=%v for %s", shouldBeTruncated, info) + } + }) + } } } } From ce3f47a75d20989a87f7fd515a9a7000d95b133a Mon Sep 17 00:00:00 2001 From: Pierre Souchay Date: Thu, 8 Mar 2018 00:26:41 +0100 Subject: [PATCH 4/7] Performance optimization for services having more than 2k records --- agent/dns.go | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/agent/dns.go b/agent/dns.go index 2750ed6b02ad..73d6a30a00b6 100644 --- a/agent/dns.go +++ b/agent/dns.go @@ -726,12 +726,19 @@ func (d *DNSServer) trimTCPResponse(req, resp *dns.Msg) (trimmed bool) { // We avoid some function calls and allocations by only handling the // extra data when necessary. var index map[string]dns.RR + originalSize := resp.Len() + originalNumRecords := len(resp.Answer) + + // Beyond 2500 records, performance gets bad + // Limit the number of records at once, anyway, it won't fit in 64k + // For SRV Records, the max is around 500 records, for A, less than 2k + if len(resp.Answer) > 2048 { + resp.Answer = resp.Answer[:2048] + } if hasExtra { index = make(map[string]dns.RR, len(resp.Extra)) indexRRs(resp.Extra, index) } - originalSize := resp.Len() - originalNumRecords := len(resp.Answer) truncated := false // This enforces the given limit on 64k, the max limit for DNS messages @@ -746,7 +753,6 @@ func (d *DNSServer) trimTCPResponse(req, resp *dns.Msg) (trimmed bool) { d.logger.Printf("[DEBUG] dns: TCP answer to %v too large truncated recs:=%d/%d, size:=%d/%d", req.Question, len(resp.Answer), originalNumRecords, resp.Len(), originalSize) - } // Restore compression if any resp.Compress = compressed From d0e45f22dfc981ad3bcc61935b50c68da30b764c Mon Sep 17 00:00:00 2001 From: Pierre Souchay Date: Thu, 8 Mar 2018 00:36:17 +0100 Subject: [PATCH 5/7] Fixed wrong format of debug msg in unit test --- agent/dns_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agent/dns_test.go b/agent/dns_test.go index cf9571de05cc..06216759d2ad 100644 --- a/agent/dns_test.go +++ b/agent/dns_test.go @@ -2801,7 +2801,7 @@ func TestDNS_TCP_and_UDP_Truncate(t *testing.T) { for _, question := range questions { for _, protocol := range protocols { for _, compress := range []bool{true, false} { - t.Run(fmt.Sprintf("lookup %s %s (qType:=%d) compressed=%b", question, protocol, qType, compress), func(t *testing.T) { + t.Run(fmt.Sprintf("lookup %s %s (qType:=%d) compressed=%v", question, protocol, qType, compress), func(t *testing.T) { m := new(dns.Msg) m.SetQuestion(question, dns.TypeANY) if protocol == "udp" { From 93fa1f6f492914dac9739ded1fdf0f39bfb16d53 Mon Sep 17 00:00:00 2001 From: Pierre Souchay Date: Fri, 9 Mar 2018 18:25:29 +0100 Subject: [PATCH 6/7] Optimize size for SRV records, should improve performance a bit Stricter Unit tests that checks if truncation was OK. --- agent/dns.go | 8 ++++++-- agent/dns_test.go | 6 +++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/agent/dns.go b/agent/dns.go index 73d6a30a00b6..2b1845b54915 100644 --- a/agent/dns.go +++ b/agent/dns.go @@ -732,8 +732,12 @@ func (d *DNSServer) trimTCPResponse(req, resp *dns.Msg) (trimmed bool) { // Beyond 2500 records, performance gets bad // Limit the number of records at once, anyway, it won't fit in 64k // For SRV Records, the max is around 500 records, for A, less than 2k - if len(resp.Answer) > 2048 { - resp.Answer = resp.Answer[:2048] + truncateAt := 2048 + if req.Question[0].Qtype == dns.TypeSRV { + truncateAt = 640 + } + if len(resp.Answer) > truncateAt { + resp.Answer = resp.Answer[:truncateAt] } if hasExtra { index = make(map[string]dns.RR, len(resp.Extra)) diff --git a/agent/dns_test.go b/agent/dns_test.go index 06216759d2ad..8de23969889a 100644 --- a/agent/dns_test.go +++ b/agent/dns_test.go @@ -2818,9 +2818,9 @@ func TestDNS_TCP_and_UDP_Truncate(t *testing.T) { // Check for the truncate bit shouldBeTruncated := numServices > 4095 - if shouldBeTruncated != in.Truncated { - info := fmt.Sprintf("service %s question:=%s (%s) (%d total records) in %v", - service, question, protocol, numServices, out) + if shouldBeTruncated != in.Truncated || len(in.Answer) > 2000 || len(in.Answer) < 1 || in.Len() > 65535 { + info := fmt.Sprintf("service %s question:=%s (%s) (%d total records) sz:= %d in %v", + service, question, protocol, numServices, len(in.Answer), out) t.Fatalf("Should have truncate:=%v for %s", shouldBeTruncated, info) } }) From aebfcb6767023f0fceaa812ae36b2d7a8043acdb Mon Sep 17 00:00:00 2001 From: Pierre Souchay Date: Fri, 9 Mar 2018 18:42:13 +0100 Subject: [PATCH 7/7] Fixed minor typo (+ travis tests is unstable) --- agent/dns_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agent/dns_test.go b/agent/dns_test.go index 8de23969889a..46a5d08ec64f 100644 --- a/agent/dns_test.go +++ b/agent/dns_test.go @@ -2821,7 +2821,7 @@ func TestDNS_TCP_and_UDP_Truncate(t *testing.T) { if shouldBeTruncated != in.Truncated || len(in.Answer) > 2000 || len(in.Answer) < 1 || in.Len() > 65535 { info := fmt.Sprintf("service %s question:=%s (%s) (%d total records) sz:= %d in %v", service, question, protocol, numServices, len(in.Answer), out) - t.Fatalf("Should have truncate:=%v for %s", shouldBeTruncated, info) + t.Fatalf("Should have truncated:=%v for %s", shouldBeTruncated, info) } }) }