Add trace to cpuevents (#442)

Signed-off-by: Daxin Wang <daxinwang@harmonycloud.cn>
KindlingProject · Feb 13, 2023 · 3c68929 · 3c68929
1 parent e061b04
commit 3c68929
Show file tree

Hide file tree

Showing 7 changed files with 62 additions and 9 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,7 @@
 
 ## Unreleased
 ### New features
+- Add trace to cpuevents to display the payload of network flows. ([#442](https://github.com/KindlingProject/kindling/pull/442))
 - Support Attach Agent for NoAPM Java Application. ([#431](https://github.com/KindlingProject/kindling/pull/431))
 
 ### Enhancements
@@ -38,9 +39,9 @@
 - Add the missing timestamp of TCP connect data and filter the incorrect one without srcPort. ([#405](https://github.com/KindlingProject/kindling/pull/405))
 - Fix the bug that multiple events cannot be correlated when they are in one ON-CPU data. ([#395](https://github.com/KindlingProject/kindling/pull/395))
 - Add the missed latency field for `cgoEvent` to fix the bug where the `request_sent_time` in `single_net_request_metric_group` is always 0. ([#394](https://github.com/KindlingProject/kindling/pull/394))
-- Fix http-100 request is detected as NOSUPPORT([393](https://github.com/KindlingProject/kindling/pull/393))
-- Fix the wrong thread name in the trace profiling function. ([#385])(https://github.com/KindlingProject/kindling/pull/385)
-- Remove "reset" method of ScheduledTaskRoutine to fix a potential dead-lock issue. ([#369])(https://github.com/KindlingProject/kindling/pull/369)
+- Fix the bug that http-100 requests are detected as NOSUPPORT. ([#393](https://github.com/KindlingProject/kindling/pull/393))
+- Fix the wrong thread name in the trace profiling function. ([#385](https://github.com/KindlingProject/kindling/pull/385))
+- Remove "reset" method of ScheduledTaskRoutine to fix a potential dead-lock issue. ([#369](https://github.com/KindlingProject/kindling/pull/369))
 - Fix the bug where the pod metadata with persistent IP in the map is deleted incorrectly due to the deleting mechanism with a delay. ([#374](https://github.com/KindlingProject/kindling/pull/374))
 - Fix the bug that when the response is nil, the NAT IP and port are not added to the labels of the "DataGroup". ([#378](https://github.com/KindlingProject/kindling/pull/378))
 - Fix potential deadlock of exited thread delay queue. ([#373](https://github.com/KindlingProject/kindling/pull/373))
@@ -124,7 +125,7 @@
 
 ## v0.2.0 - 2022-05-07
 ### Features
-- Provide a kindling Prometheus exporter that can support integration with Prometheus easily. See kindling's metrics from the kindling website[http://kindling.harmonycloud.cn/docs/usage/prometheus-metric/]
+- Provide a kindling Prometheus exporter that can support integration with Prometheus easily. See kindling's metrics from the kindling [website](http://kindling.harmonycloud.cn/docs/usage/prometheus-metric/).
 - Support network performance, DNS performance, service network maps, and workload performance analysis.
 - Support HTTP, MySQL, and REDIS request analysis.
 - Provide a Grafana-plugin with four built-in dashboards to support basic analysis features.

diff --git a/collector/docker/kindling-collector-config.yml b/collector/docker/kindling-collector-config.yml
@@ -48,7 +48,7 @@ analyzers:
   networkanalyzer:
     connect_timeout: 100
     # How many seconds to wait until we consider a request as complete.
-    fd_reuse_timeout: 15
+    fd_reuse_timeout: 2
     # How many seconds to wait before we consider a request to have no response.
     no_response_threshold: 120
     # How many milliseconds to wait until we consider a request-response as slow.

diff --git a/collector/internal/application/application.go b/collector/internal/application/application.go
@@ -76,7 +76,7 @@ func (a *Application) Shutdown() error {
 
 func (a *Application) registerFactory() {
 	a.componentsFactory.RegisterReceiver(cgoreceiver.Cgo, cgoreceiver.NewCgoReceiver, &cgoreceiver.Config{})
-	a.componentsFactory.RegisterAnalyzer(network.Network.String(), network.NewNetworkAnalyzer, &network.Config{})
+	a.componentsFactory.RegisterAnalyzer(network.Network.String(), network.NewNetworkAnalyzer, network.NewDefaultConfig())
 	a.componentsFactory.RegisterAnalyzer(cpuanalyzer.CpuProfile.String(), cpuanalyzer.NewCpuAnalyzer, cpuanalyzer.NewDefaultConfig())
 	a.componentsFactory.RegisterProcessor(k8sprocessor.K8sMetadata, k8sprocessor.NewKubernetesProcessor, &k8sprocessor.DefaultConfig)
 	a.componentsFactory.RegisterExporter(otelexporter.Otel, otelexporter.NewExporter, &otelexporter.Config{})

diff --git a/collector/pkg/component/analyzer/cpuanalyzer/cpu_analyzer.go b/collector/pkg/component/analyzer/cpuanalyzer/cpu_analyzer.go
@@ -13,6 +13,7 @@ import (
 	"github.com/Kindling-project/kindling/collector/pkg/component/analyzer"
 	"github.com/Kindling-project/kindling/collector/pkg/component/consumer"
 	"github.com/Kindling-project/kindling/collector/pkg/model"
+	"github.com/Kindling-project/kindling/collector/pkg/model/constlabels"
 	"github.com/Kindling-project/kindling/collector/pkg/model/constnames"
 )
 
@@ -130,6 +131,17 @@ func (ca *CpuAnalyzer) ConsumeSpanEvent(event *model.KindlingEvent) {
 	ca.PutEventToSegments(event.GetPid(), event.Ctx.ThreadInfo.GetTid(), event.Ctx.ThreadInfo.Comm, ev)
 }
 
+func (ca *CpuAnalyzer) ConsumeTraces(trace SendTriggerEvent) {
+	tid := trace.OriginalData.Labels.GetIntValue(constlabels.RequestTid)
+	threadName := trace.OriginalData.Labels.GetStringValue(constlabels.Comm)
+	event := &InnerCall{
+		StartTime: trace.StartTime,
+		EndTime:   trace.StartTime + trace.SpendTime,
+		Trace:     trace.OriginalData,
+	}
+	ca.PutEventToSegments(trace.Pid, uint32(tid), threadName, event)
+}
+
 func (ca *CpuAnalyzer) ConsumeCpuEvent(event *model.KindlingEvent) {
 	ev := new(CpuEvent)
 	for i := 0; i < int(event.ParamsNumber); i++ {

diff --git a/collector/pkg/component/analyzer/cpuanalyzer/model.go b/collector/pkg/component/analyzer/cpuanalyzer/model.go
@@ -15,13 +15,15 @@ const (
 	TimedJavaFutexEventKind
 	TimedTransactionIdEventKind
 	TimedApmSpanEventKind
+	TimedInnerCallEventKind
 )
 
 const (
 	CpuEventLabel           = "cpuEvents"
 	JavaFutexEventLabel     = "javaFutexEvents"
 	TransactionIdEventLabel = "transactionIds"
 	SpanLabel               = "spans"
+	InnerCallLabel          = "innerCalls"
 )
 
 type TimedEvent interface {
@@ -49,6 +51,7 @@ type Segment struct {
 	JavaFutexEvents []TimedEvent `json:"javaFutexEvents"`
 	TransactionIds  []TimedEvent `json:"transactionIds"`
 	Spans           []TimedEvent `json:"spans"`
+	InnerCalls      []TimedEvent `json:"innerCalls"`
 	IsSend          int
 	IndexTimestamp  string `json:"indexTimestamp"`
 }
@@ -61,6 +64,7 @@ func newSegment(startTime uint64, endTime uint64) *Segment {
 		JavaFutexEvents: make([]TimedEvent, 0),
 		TransactionIds:  make([]TimedEvent, 0),
 		Spans:           make([]TimedEvent, 0),
+		InnerCalls:      make([]TimedEvent, 0),
 		IsSend:          0,
 		IndexTimestamp:  "",
 	}
@@ -76,6 +80,8 @@ func (s *Segment) putTimedEvent(event TimedEvent) {
 		s.TransactionIds = append(s.TransactionIds, event)
 	case TimedApmSpanEventKind:
 		s.Spans = append(s.Spans, event)
+	case TimedInnerCallEventKind:
+		s.InnerCalls = append(s.InnerCalls, event)
 	}
 }
 
@@ -103,6 +109,10 @@ func (s *Segment) toDataGroup(parent *TimeSegments) *model.DataGroup {
 	if err == nil {
 		labels.AddStringValue(SpanLabel, string(spanEventString))
 	}
+	innerCallString, err := json.Marshal(s.InnerCalls)
+	if err == nil {
+		labels.AddStringValue(InnerCallLabel, string(innerCallString))
+	}
 	return model.NewDataGroup(constnames.CameraEventGroupName, labels, s.StartTime)
 }
 
@@ -183,4 +193,22 @@ func (j *ApmSpanEvent) EndTimestamp() uint64 {
 
 func (j *ApmSpanEvent) Kind() TimedEventKind {
 	return TimedApmSpanEventKind
-}
+}
+
+type InnerCall struct {
+	StartTime uint64           `json:"startTime"`
+	EndTime   uint64           `json:"endTime"`
+	Trace     *model.DataGroup `json:"trace"`
+}
+
+func (c *InnerCall) StartTimestamp() uint64 {
+	return c.StartTime
+}
+
+func (c *InnerCall) EndTimestamp() uint64 {
+	return c.EndTime
+}
+
+func (c *InnerCall) Kind() TimedEventKind {
+	return TimedInnerCallEventKind
+}
diff --git a/collector/pkg/component/analyzer/cpuanalyzer/send_trigger.go b/collector/pkg/component/analyzer/cpuanalyzer/send_trigger.go
@@ -29,7 +29,7 @@ func ReceiveDataGroupAsSignal(data *model.DataGroup) {
 		})
 		return
 	}
-	if data.Labels.GetBoolValue(constlabels.IsSlow) {
+	if data.Labels.GetBoolValue(constlabels.IsSlow) || data.Labels.GetBoolValue(constlabels.IsError) {
 		duration, ok := data.GetMetric(constvalues.RequestTotalTime)
 		if !ok {
 			return
@@ -54,6 +54,18 @@ type SendTriggerEvent struct {
 func (ca *CpuAnalyzer) ReceiveSendSignal() {
 	// Break the for loop if the channel is closed
 	for sendContent := range sendChannel {
+		// CpuAnalyzer consumes all client-side traces and adds them to TimeSegments.
+		// These traces are not treated as signals, so we skip the rest of the loop here.
+		// Note that they are not passed on to the next consumers.
+		if !sendContent.OriginalData.Labels.GetBoolValue(constlabels.IsServer) {
+			ca.ConsumeTraces(sendContent)
+			continue
+		}
+		// Only send the slow traces as the signals
+		if !sendContent.OriginalData.Labels.GetBoolValue(constlabels.IsSlow) {
+			continue
+		}
+		// Store the traces first
 		for _, nexConsumer := range ca.nextConsumers {
 			_ = nexConsumer.Consume(sendContent.OriginalData)
 		}

diff --git a/deploy/agent/kindling-collector-config.yml b/deploy/agent/kindling-collector-config.yml
@@ -48,7 +48,7 @@ analyzers:
   networkanalyzer:
     connect_timeout: 100
     # How many seconds to wait until we consider a request as complete.
-    fd_reuse_timeout: 15
+    fd_reuse_timeout: 2
     # How many seconds to wait before we consider a request to have no response.
     no_response_threshold: 120
     # How many milliseconds to wait until we consider a request-response as slow.