From 5f3561b52b981969d9e7af168b8df6bef15e9871 Mon Sep 17 00:00:00 2001 From: "Zhipeng (David) Tan" Date: Sat, 27 Sep 2025 12:56:00 +0000 Subject: [PATCH 1/2] add count for errors and error timestamps for dnspropagation measurement. It skips the "no such host" errors seen during startup and records only the real errors. --- .../probes/pkg/dnspropagation/metrics.go | 8 ++- .../probes/pkg/dnspropagation/probe.go | 62 ++++++++++++++++--- 2 files changed, 60 insertions(+), 10 deletions(-) diff --git a/util-images/probes/pkg/dnspropagation/metrics.go b/util-images/probes/pkg/dnspropagation/metrics.go index 862e7bc364..32c3aee349 100644 --- a/util-images/probes/pkg/dnspropagation/metrics.go +++ b/util-images/probes/pkg/dnspropagation/metrics.go @@ -34,8 +34,14 @@ var ( Name: "dns_propagation_count", Help: "Counter of the number of DNS propagation checks performed.", }, []string{"namespace", "service", "podName"}) + // DNSLookupErrors denotes the total number of failed DNS lookups. + DNSLookupErrors = prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: common.ProbeNamespace, + Name: "dns_lookup_errors_total", + Help: "Counter of the total number of DNS lookup errors.", + }, []string{"namespace", "service", "podName"}) ) func init() { - prometheus.MustRegister(DNSPropagationSeconds, DNSPropagationCount) + prometheus.MustRegister(DNSPropagationSeconds, DNSPropagationCount, DNSLookupErrors) } diff --git a/util-images/probes/pkg/dnspropagation/probe.go b/util-images/probes/pkg/dnspropagation/probe.go index cdf1308a10..a4f05e052f 100644 --- a/util-images/probes/pkg/dnspropagation/probe.go +++ b/util-images/probes/pkg/dnspropagation/probe.go @@ -21,9 +21,13 @@ import ( "errors" "flag" "fmt" + "log/slog" "math" "math/rand" "net" + + "os" + "strings" "sync" "time" @@ -35,14 +39,21 @@ import ( ) var ( - statefulSet = flag.String("dns-propagation-probe-stateful-set", "", "Name of the statefulSet workload") - service = flag.String("dns-propagation-probe-service", "", "Name of
the headless service that exposes the statefulSet resides") - namespace = flag.String("dns-propagation-probe-namespace", "default", "The namespace where the statefulSet resides") - clusterDomain = flag.String("dns-propagation-probe-cluster-domain", "cluster", "Name of cluster domain where the statefulSet resides") - suffix = flag.String("dns-propagation-probe-suffix", "local", "DNS label suffix") - interval = flag.Duration("dns-propagation-probe-interval", 100*time.Millisecond, "Interval between DNS lookups") - podCount = flag.Int("dns-propagation-probe-pod-count", 0, "Number of pods in the statefulSet") - sampleCount = flag.Int("dns-propagation-probe-sample-count", 0, "Number of pods to test dns propagation against in the statefulSet, defaults to min(100, Ceil(SQRT(podCount))") + statefulSet = flag.String("dns-propagation-probe-stateful-set", "", "Name of the statefulSet workload") + service = flag.String("dns-propagation-probe-service", "", "Name of the headless service that exposes the statefulSet resides") + namespace = flag.String("dns-propagation-probe-namespace", "default", "The namespace where the statefulSet resides") + clusterDomain = flag.String("dns-propagation-probe-cluster-domain", "cluster", "Name of cluster domain where the statefulSet resides") + suffix = flag.String("dns-propagation-probe-suffix", "local", "DNS label suffix") + interval = flag.Duration("dns-propagation-probe-interval", 100*time.Millisecond, "Interval between DNS lookups") + podCount = flag.Int("dns-propagation-probe-pod-count", 0, "Number of pods in the statefulSet") + sampleCount = flag.Int("dns-propagation-probe-sample-count", 0, "Number of pods to test dns propagation against in the statefulSet, defaults to min(100, Ceil(SQRT(podCount))") + enableErrorLogging = flag.Bool("enable-error-logging", false, "Enable logging for real errors and timestamps.") + enableLatencyLogging = flag.Bool("enable-latency-logging", false, "Enable logging for latencies timestamps.") +) + +var ( + 
errorLogger *slog.Logger + latencyLogger *slog.Logger ) type DNSPodPropagationResult struct { @@ -67,6 +78,12 @@ func Run() { sampleCount = &f klog.Warningf("dns-propagation-probe-sample-count not set, defaulting to min(100, Ceil(SQRT(%v))= %v", *podCount, *sampleCount) } + if *enableErrorLogging { + errorLogger = slog.New(slog.NewJSONHandler(os.Stderr, &slog.HandlerOptions{Level: slog.LevelError})) + } + if *enableLatencyLogging { + latencyLogger = slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{Level: slog.LevelInfo})) + } // creates the in-cluster config kubeConfig, err := rest.InClusterConfig() if err != nil { @@ -153,12 +170,33 @@ func runSinglePod(client kubernetes.Interface, url string, podName string, names klog.V(4).Infof("Starting dns propagation calculation for pod %s ...", url) tick := time.NewTicker(interval) defer tick.Stop() + var lookupErrorLogged = false for { select { case <-tick.C: klog.V(4).Infof("DNS lookup %s", url) if err := lookup(url); err != nil { - klog.Warningf("DNS lookup error: %v", err) + if strings.Contains(err.Error(), "no such host") { + klog.Warningf("DNS lookup error: %v", err) + continue + } + if !lookupErrorLogged { + lookupErrorLogged = true + errTimestamp := time.Now() + klog.Errorf("DNS lookup error for url %s at %v: %v", url, errTimestamp.Format(time.RFC3339), err) + if errorLogger != nil { + errorLogger.Error("DNS propagation probe failed", + "hostname", url, + "error", err.Error(), + ) + } + labels := prometheus.Labels{ + "namespace": namespace, + "service": *service, + "podName": podName, + } + DNSLookupErrors.With(labels).Inc() + } continue } endTime := time.Now() @@ -170,6 +208,12 @@ func runSinglePod(client kubernetes.Interface, url string, podName string, names } duration := endTime.Sub(timestamp) klog.V(2).Infof("Pod running time fetched for pod %s, timestamp= %v, DNS propagation duration= %v s", url, timestamp, duration) + if latencyLogger != nil { + latencyLogger.Info("DNS propagation latency 
recorded", + "hostname", url, + "timestamp", time.Now(), + "propagationLatency (s)", duration.Seconds()) + } return duration } } From 00bcfe53dc5f21e86bf8f3616396442b90f11f8e Mon Sep 17 00:00:00 2001 From: "Zhipeng (David) Tan" Date: Fri, 3 Oct 2025 09:57:50 +0000 Subject: [PATCH 2/2] use net.DNSError.IsNotFound instead of string comparison --- util-images/probes/pkg/dnspropagation/probe.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/util-images/probes/pkg/dnspropagation/probe.go b/util-images/probes/pkg/dnspropagation/probe.go index a4f05e052f..a0a8c4c91f 100644 --- a/util-images/probes/pkg/dnspropagation/probe.go +++ b/util-images/probes/pkg/dnspropagation/probe.go @@ -27,7 +27,6 @@ import ( "net" "os" - "strings" "sync" "time" @@ -176,7 +175,8 @@ func runSinglePod(client kubernetes.Interface, url string, podName string, names case <-tick.C: klog.V(4).Infof("DNS lookup %s", url) if err := lookup(url); err != nil { - if strings.Contains(err.Error(), "no such host") { + var dnsErr *net.DNSError + if errors.As(err, &dnsErr) && dnsErr.IsNotFound { klog.Warningf("DNS lookup error: %v", err) continue }