Skip to content

Commit

Permalink
optimize DNS client performance
Browse files Browse the repository at this point in the history
Signed-off-by: ii2day <ji.li@daocloud.io>
  • Loading branch information
ii2day committed May 23, 2023
1 parent 0a82158 commit e8c354c
Show file tree
Hide file tree
Showing 24 changed files with 338 additions and 1,157 deletions.
1 change: 1 addition & 0 deletions charts/templates/configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ data:
nethttp_defaultMaxIdleConnsPerHost: {{ .Values.feature.nethttp_defaultMaxIdleConnsPerHost }}
nethttp_defaultRequest_DurationInSecond: {{ .Values.feature.nethttp_defaultRequest_DurationInSecond }}
nethttp_defaultRequest_PerRequestTimeoutInMS: {{ .Values.feature.nethttp_defaultRequest_PerRequestTimeoutInMS }}
netdns_defaultConcurrency: {{ .Values.feature.netdns_defaultConcurrency }}
multusPodAnnotationKey: {{ .Values.feature.multusPodAnnotationKey }}
crdMaxHistory: {{ .Values.feature.crdMaxHistory }}
{{- if .Values.feature.enableIPv4 }}
Expand Down
3 changes: 3 additions & 0 deletions charts/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@ feature:
## @param feature.nethttp_defaultFail_MeanDelayInMs mean delay in ms for kind nethttp
nethttp_defaultFail_MeanDelayInMs: 2000

## @param feature.netdns_defaultConcurrency concurrency for kind netdns
netdns_defaultConcurrency: 50

## @param feature.taskPollIntervalInSecond the interval to poll the task in controller and agent pod
taskPollIntervalInSecond: 5

Expand Down
2 changes: 0 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ require (
go.opentelemetry.io/otel/metric v0.34.0
go.opentelemetry.io/otel/sdk v1.11.1
go.opentelemetry.io/otel/sdk/metric v0.33.0
go.uber.org/ratelimit v0.2.0
go.uber.org/zap v1.24.0
golang.org/x/net v0.10.0
google.golang.org/grpc v1.53.0
Expand All @@ -48,7 +47,6 @@ require (
github.com/Masterminds/goutils v1.1.1 // indirect
github.com/Masterminds/semver/v3 v3.1.1 // indirect
github.com/Masterminds/sprig/v3 v3.2.2 // indirect
github.com/andres-erbsen/clock v0.0.0-20160526145045-9e14626cd129 // indirect
github.com/asaskevich/govalidator v0.0.0-20210307081110-f21760c49a8d // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/cespare/xxhash/v2 v2.2.0 // indirect
Expand Down
4 changes: 0 additions & 4 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,6 @@ github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuy
github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0=
github.com/alecthomas/units v0.0.0-20190717042225-c3de453c63f4/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0=
github.com/alecthomas/units v0.0.0-20190924025748-f65c72e2690d/go.mod h1:rBZYJk541a8SKzHPHnH3zbiI+7dagKZ0cgpgrD7Fyho=
github.com/andres-erbsen/clock v0.0.0-20160526145045-9e14626cd129 h1:MzBOUgng9orim59UnfUTLRjMpd09C5uEVQ6RPGeCaVI=
github.com/andres-erbsen/clock v0.0.0-20160526145045-9e14626cd129/go.mod h1:rFgpPQZYZ8vdbc+48xibu8ALc3yeyd64IhHS+PU6Yyg=
github.com/antihax/optional v1.0.0/go.mod h1:uupD/76wgC+ih3iEmQUL+0Ugr19nfwCT1kdvxnR2qWY=
github.com/asaskevich/govalidator v0.0.0-20200907205600-7a23bdc65eef/go.mod h1:WaHUgvxTVq04UNunO+XhnAqY/wQc+bxr74GqbsZ/Jqw=
github.com/asaskevich/govalidator v0.0.0-20210307081110-f21760c49a8d h1:Byv0BzEl3/e6D5CLfI0j/7hiIEtvGVFPCZ7Ei2oq8iQ=
Expand Down Expand Up @@ -538,8 +536,6 @@ go.uber.org/atomic v1.10.0/go.mod h1:LUxbIzbOniOlMKjJjyPfpl4v+PKK2cNJn91OQbhoJI0
go.uber.org/goleak v1.2.0 h1:xqgm/S+aQvhWFTtR0XK3Jvg7z8kGV8P4X14IzwN3Eqk=
go.uber.org/multierr v1.8.0 h1:dg6GjLku4EH+249NNmoIciG9N/jURbDG+pFlTkhzIC8=
go.uber.org/multierr v1.8.0/go.mod h1:7EAYxJLBy9rStEaz58O2t4Uvip6FSURkq8/ppBp95ak=
go.uber.org/ratelimit v0.2.0 h1:UQE2Bgi7p2B85uP5dC2bbRtig0C+OeNRnNEafLjsLPA=
go.uber.org/ratelimit v0.2.0/go.mod h1:YYBV4e4naJvhpitQrWJu1vCpgB7CboMe0qhltKt6mUg=
go.uber.org/zap v1.24.0 h1:FiJd5l1UOLj0wCgbSE0rwwXHzEdAZS6hiiSnxJN/D60=
go.uber.org/zap v1.24.0/go.mod h1:2kMP+WWQ8aoFoedH3T2sq6iJ2yDWpHbP0f6MQbS9Gkg=
golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
Expand Down
240 changes: 32 additions & 208 deletions pkg/loadRequest/loadDns/dns.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,12 @@
package loadDns

import (
"context"
"fmt"
"github.com/miekg/dns"
"github.com/spidernet-io/spiderdoctor/pkg/lock"
"github.com/spidernet-io/spiderdoctor/pkg/utils/stats"
"go.uber.org/ratelimit"
"go.uber.org/zap"
"sync"
"time"

config "github.com/spidernet-io/spiderdoctor/pkg/types"
)

type RequestProtocol string
Expand All @@ -38,218 +35,45 @@ type DnsRequestData struct {
DurationInSecond int
}

// ------------------

type DelayMetric struct {
// Mean is the mean request latency.
Mean string `json:"mean"`
// P50 is the 50th percentile request latency.
P50 string `json:"50th"`
// P90 is the 90th percentile request latency.
P90 string `json:"90th"`
// P95 is the 95th percentile request latency.
P95 string `json:"95th"`
// P99 is the 99th percentile request latency.
P99 string `json:"99th"`
// Max is the maximum observed request latency.
Max string `json:"max"`
// Min is the minimum observed request latency.
Min string `json:"min"`
}

// final metric
type DnsMetrics struct {
StartTime time.Time
EndTime time.Time
Duration string

TargetDomain string
DnsServer string
DnsMethod string

// succeed to query the ip
SucceedCount int
// failed to get response , or not get ip in the dns response
FailedCount int
TotalCount int
SuccessRate float64

// when succeed to get response
ReplyCode map[string]int
// error to send request, such as timeout
ErrorMap map[string]int

DnsAnswer []dns.RR

// delay information for success request
DelayForSuccess DelayMetric
}

// metric for one request
type dnsMetric struct {
e error
rtt time.Duration
msg *dns.Msg
}

func executeRequestOnce(c *dns.Client, conn *dns.Conn, msg *dns.Msg) *dnsMetric {
r := dnsMetric{}
r.msg, r.rtt, r.e = c.ExchangeWithConn(msg, conn)
return &r
}

func ParseMetrics(final *DnsMetrics, validVals []float32) (*DnsMetrics, error) {
var e error
var t float32

final.SuccessRate = float64(final.SucceedCount) / float64(final.TotalCount)

// delay
if final.SucceedCount > 0 {
t, e = stats.Mean(validVals)
if e != nil {
return nil, fmt.Errorf("failed to parse mean delay, error=%v", e)
}
final.DelayForSuccess.Mean = parseTime(time.Duration(t))

t, e = stats.Max(validVals)
if e != nil {
return nil, fmt.Errorf("failed to parse max delay, error=%v", e)
}
final.DelayForSuccess.Max = parseTime(time.Duration(t))

t, e = stats.Min(validVals)
if e != nil {
return nil, fmt.Errorf("failed to parse min delay, error=%v", e)
}
final.DelayForSuccess.Min = parseTime(time.Duration(t))

t, e = stats.Percentile(validVals, 50)
if e != nil {
return nil, fmt.Errorf("failed to parse 50 Percentile, error=%v", e)
}
final.DelayForSuccess.P50 = parseTime(time.Duration(t))

t, e = stats.Percentile(validVals, 90)
if e != nil {
return nil, fmt.Errorf("failed to parse 90 Percentile, error=%v", e)
}
final.DelayForSuccess.P90 = parseTime(time.Duration(t))

t, e = stats.Percentile(validVals, 95)
if e != nil {
return nil, fmt.Errorf("failed to parse 95 Percentile, error=%v", e)
}
final.DelayForSuccess.P95 = parseTime(time.Duration(t))

t, e = stats.Percentile(validVals, 99)
if e != nil {
return nil, fmt.Errorf("failed to parse 99 Percentile, error=%v", e)
}
final.DelayForSuccess.P99 = parseTime(time.Duration(t))
}

return final, nil
}

func DnsRequest(logger *zap.Logger, req *DnsRequestData) (result *DnsMetrics, err error) {
ServerAddress := req.DnsServerAddr
l := &lock.Mutex{}
func DnsRequest(logger *zap.Logger, reqData *DnsRequestData) (result *Metrics, err error) {

logger.Sugar().Infof("dns ServerAddress=%v, request=%v, ", ServerAddress, req)
logger.Sugar().Infof("dns ServerAddress=%v, request=%v, ", reqData.DnsServerAddr, reqData)

if _, ok := dns.IsDomainName(req.TargetDomain); !ok {
return nil, fmt.Errorf("invalid domain name: %v", req.TargetDomain)
if _, ok := dns.IsDomainName(reqData.TargetDomain); !ok {
return nil, fmt.Errorf("invalid domain name: %v", reqData.TargetDomain)
}
// if not fqdn, the dns library will report error, so convert the format
if !dns.IsFqdn(req.TargetDomain) {
req.TargetDomain = dns.Fqdn(req.TargetDomain)
logger.Sugar().Debugf("convert target domain to fqdn %v", req.TargetDomain)
if !dns.IsFqdn(reqData.TargetDomain) {
reqData.TargetDomain = dns.Fqdn(reqData.TargetDomain)
logger.Sugar().Debugf("convert target domain to fqdn %v", reqData.TargetDomain)
}

rl := ratelimit.New(req.Qps)
var wg sync.WaitGroup
d := time.Duration(req.DurationInSecond) * time.Second
ctx, cancel := context.WithTimeout(context.Background(), d)
defer cancel()
var duration time.Duration
logger.Sugar().Infof("begin to request %v for duration %v ", req.TargetDomain, d.String())

// -------- send all request
start := time.Now()
counter := 0
duration := time.Duration(reqData.DurationInSecond) * time.Second

c := new(dns.Client)
c.Net = string(req.Protocol)
c.Timeout = time.Duration(req.PerRequestTimeoutInMs) * time.Millisecond
msg := new(dns.Msg).SetQuestion(req.TargetDomain, req.DnsType)
conn, _ := c.Dial(ServerAddress)
c.SingleInflight = true

final := &DnsMetrics{
ErrorMap: map[string]int{},
DnsAnswer: []dns.RR{},
ReplyCode: map[string]int{},
w := &Work{
Concurrency: config.AgentConfig.Configmap.NetdnsDefaultConcurrency,
QPS: reqData.Qps,
Timeout: reqData.PerRequestTimeoutInMs,
Msg: new(dns.Msg).SetQuestion(reqData.TargetDomain, reqData.DnsType),
Protocol: string(reqData.Protocol),
ServerAddr: reqData.DnsServerAddr,
}

validVals := []float32{}

p := func(wg *sync.WaitGroup) {
r := executeRequestOnce(c, conn, msg)
l.Lock()
final.TotalCount++
if r.e != nil {
final.FailedCount++
final.ErrorMap[r.e.Error()]++
} else {
if len(r.msg.Answer) > 0 && r.msg.Rcode == dns.RcodeSuccess {
final.SucceedCount++
validVals = append(validVals, float32(r.rtt))
} else {
final.FailedCount++
}
rcodeStr := dns.RcodeToString[r.msg.Rcode]
final.ReplyCode[rcodeStr]++
}
l.Unlock()
wg.Done()
w.Init()

// The monitoring task timed out
if duration > 0 {
go func() {
time.Sleep(duration)
w.Stop()
}()
}
logger.Sugar().Infof("begin to request %v for duration %v ", w.ServerAddr, duration.String())
w.Run()
logger.Sugar().Infof("finish all request %v for %s ", w.report.totalCount, w.ServerAddr)
// Collect metric reports
metrics := w.AggregateMetric()

LOOP:
for {
select {
case <-ctx.Done():
cancel()
duration = time.Since(start)
break LOOP

default:
rl.Take()
counter++
wg.Add(1)
go p(&wg)
}
}
wg.Wait()
end := time.Now()
logger.Sugar().Infof("finish all %v requests for %v ", counter, req.TargetDomain)
//-------- parse final metric
r, e := ParseMetrics(final, validVals)
if e != nil {
return nil, fmt.Errorf("failed to parse metric, %v", e)
}
r.StartTime = start
r.EndTime = end
r.Duration = duration.String()
r.TargetDomain = req.TargetDomain
r.DnsServer = ServerAddress
r.DnsMethod = string(req.Protocol)

logger.Sugar().Infof("result : %v ", r)
return r, nil

}
logger.Sugar().Infof("result : %v ", metrics)
return metrics, nil

func parseTime(t time.Duration) string {
return fmt.Sprintf("%.6fms", t.Seconds()*1000)
}
65 changes: 65 additions & 0 deletions pkg/loadRequest/loadDns/dns_reporter.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
// Copyright 2022 Authors of spidernet-io
// SPDX-License-Identifier: Apache-2.0

package loadDns

import (
"github.com/miekg/dns"
"time"
)

// We report for max 1M results.
const maxRes = 1000000

type report struct {
avgTotal float64
average float64
tps float64

results chan *result
done chan bool
total time.Duration
errorDist map[string]int
lats []float32
totalCount int64
successCount int64
failedCount int64
ReplyCode map[string]int
}

func newReport(results chan *result) *report {
return &report{
results: results,
done: make(chan bool, 1),
errorDist: make(map[string]int),
lats: make([]float32, 0, maxRes),
ReplyCode: make(map[string]int),
}
}

func runReporter(r *report) {
// Loop will continue until channel is closed
for res := range r.results {
r.totalCount++
if res.err != nil {
r.errorDist[res.err.Error()]++
r.failedCount++
} else {
r.avgTotal += res.duration.Seconds()
if len(r.lats) < maxRes {
r.lats = append(r.lats, float32(res.duration.Milliseconds()))
}
rcodeStr := dns.RcodeToString[res.msg.Rcode]
r.ReplyCode[rcodeStr]++
r.successCount++
}
}
// Signal reporter is done.
r.done <- true
}

func (r *report) finalize(total time.Duration) {
r.total = total
r.tps = float64(r.totalCount) / r.total.Seconds()
r.average = r.avgTotal / float64(len(r.lats))
}
Loading

0 comments on commit e8c354c

Please sign in to comment.