Skip to content

Commit

Permalink
Add trace to cpuevents (#442)
Browse files Browse the repository at this point in the history
Signed-off-by: Daxin Wang <daxinwang@harmonycloud.cn>
  • Loading branch information
dxsup authored Feb 13, 2023
1 parent e061b04 commit 3c68929
Show file tree
Hide file tree
Showing 7 changed files with 62 additions and 9 deletions.
9 changes: 5 additions & 4 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

## Unreleased
### New features
- Add trace to cpuevents to display the payload of network flows. ([#442](https://github.com/KindlingProject/kindling/pull/442))
- Support Attach Agent for NoAPM Java Application. ([#431](https://github.com/KindlingProject/kindling/pull/431))

### Enhancements
Expand Down Expand Up @@ -38,9 +39,9 @@
- Add the missing timestamp of TCP connect data and filter the incorrect one without srcPort.([#405](https://github.com/KindlingProject/kindling/pull/405))
- Fix the bug that multiple events cannot be correlated when they are in one ON-CPU data. ([#395](https://github.com/KindlingProject/kindling/pull/395))
- Add the missed latency field for `cgoEvent` to fix the bug where the `request_sent_time` in `single_net_request_metric_group` is always 0. ([#394](https://github.com/KindlingProject/kindling/pull/394))
- Fix http-100 request is detected as NOSUPPORT([393](https://github.com/KindlingProject/kindling/pull/393))
- Fix the wrong thread name in the trace profiling function. ([#385])(https://github.com/KindlingProject/kindling/pull/385)
- Remove "reset" method of ScheduledTaskRoutine to fix a potential dead-lock issue. ([#369])(https://github.com/KindlingProject/kindling/pull/369)
- Fix the bug where an HTTP 100 request is detected as NOSUPPORT. ([#393](https://github.com/KindlingProject/kindling/pull/393))
- Fix the wrong thread name in the trace profiling function. ([#385](https://github.com/KindlingProject/kindling/pull/385))
- Remove "reset" method of ScheduledTaskRoutine to fix a potential dead-lock issue. ([#369](https://github.com/KindlingProject/kindling/pull/369))
- Fix the bug where the pod metadata with persistent IP in the map is deleted incorrectly due to the deleting mechanism with a delay. ([#374](https://github.com/KindlingProject/kindling/pull/374))
- Fix the bug that when the response is nil, the NAT IP and port are not added to the labels of the "DataGroup". ([#378](https://github.com/KindlingProject/kindling/pull/378))
- Fix potential deadlock of exited thread delay queue. ([#373](https://github.com/KindlingProject/kindling/pull/373))
Expand Down Expand Up @@ -124,7 +125,7 @@

## v0.2.0 - 2022-05-07
### Features
- Provide a kindling Prometheus exporter that can support integration with Prometheus easily. See kindling's metrics from the kindling website[http://kindling.harmonycloud.cn/docs/usage/prometheus-metric/]
- Provide a kindling Prometheus exporter that can support integration with Prometheus easily. See kindling's metrics from the kindling [website](http://kindling.harmonycloud.cn/docs/usage/prometheus-metric/).
- Support network performance, DNS performance, service network maps, and workload performance analysis.
- Support HTTP, MySQL, and REDIS request analysis.
- Provide a Grafana-plugin with four built-in dashboards to support basic analysis features.
Expand Down
2 changes: 1 addition & 1 deletion collector/docker/kindling-collector-config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ analyzers:
networkanalyzer:
connect_timeout: 100
# How many seconds to wait until we consider a request as complete.
fd_reuse_timeout: 15
fd_reuse_timeout: 2
# How many seconds to wait until we consider a request as no response.
no_response_threshold: 120
# How many milliseconds to wait until we consider a request-response as slow.
Expand Down
2 changes: 1 addition & 1 deletion collector/internal/application/application.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ func (a *Application) Shutdown() error {

func (a *Application) registerFactory() {
a.componentsFactory.RegisterReceiver(cgoreceiver.Cgo, cgoreceiver.NewCgoReceiver, &cgoreceiver.Config{})
a.componentsFactory.RegisterAnalyzer(network.Network.String(), network.NewNetworkAnalyzer, &network.Config{})
a.componentsFactory.RegisterAnalyzer(network.Network.String(), network.NewNetworkAnalyzer, network.NewDefaultConfig())
a.componentsFactory.RegisterAnalyzer(cpuanalyzer.CpuProfile.String(), cpuanalyzer.NewCpuAnalyzer, cpuanalyzer.NewDefaultConfig())
a.componentsFactory.RegisterProcessor(k8sprocessor.K8sMetadata, k8sprocessor.NewKubernetesProcessor, &k8sprocessor.DefaultConfig)
a.componentsFactory.RegisterExporter(otelexporter.Otel, otelexporter.NewExporter, &otelexporter.Config{})
Expand Down
12 changes: 12 additions & 0 deletions collector/pkg/component/analyzer/cpuanalyzer/cpu_analyzer.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (
"github.com/Kindling-project/kindling/collector/pkg/component/analyzer"
"github.com/Kindling-project/kindling/collector/pkg/component/consumer"
"github.com/Kindling-project/kindling/collector/pkg/model"
"github.com/Kindling-project/kindling/collector/pkg/model/constlabels"
"github.com/Kindling-project/kindling/collector/pkg/model/constnames"
)

Expand Down Expand Up @@ -130,6 +131,17 @@ func (ca *CpuAnalyzer) ConsumeSpanEvent(event *model.KindlingEvent) {
ca.PutEventToSegments(event.GetPid(), event.Ctx.ThreadInfo.GetTid(), event.Ctx.ThreadInfo.Comm, ev)
}

// ConsumeTraces converts a trace into an InnerCall event and stores it in the
// segments of the thread that the trace belongs to.
//
// The thread ID and thread name are read from the RequestTid and Comm labels
// of the trace's original data. The event covers the range from the trace's
// StartTime to StartTime + SpendTime.
func (ca *CpuAnalyzer) ConsumeTraces(trace SendTriggerEvent) {
	data := trace.OriginalData
	requestTid := data.Labels.GetIntValue(constlabels.RequestTid)
	comm := data.Labels.GetStringValue(constlabels.Comm)

	innerCall := &InnerCall{
		StartTime: trace.StartTime,
		EndTime:   trace.StartTime + trace.SpendTime,
		Trace:     data,
	}
	ca.PutEventToSegments(trace.Pid, uint32(requestTid), comm, innerCall)
}

func (ca *CpuAnalyzer) ConsumeCpuEvent(event *model.KindlingEvent) {
ev := new(CpuEvent)
for i := 0; i < int(event.ParamsNumber); i++ {
Expand Down
30 changes: 29 additions & 1 deletion collector/pkg/component/analyzer/cpuanalyzer/model.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,15 @@ const (
TimedJavaFutexEventKind
TimedTransactionIdEventKind
TimedApmSpanEventKind
TimedInnerCallEventKind
)

// Label keys under which a Segment's event lists are attached to a DataGroup.
// toDataGroup stores the JSON-encoded Spans and InnerCalls under SpanLabel and
// InnerCallLabel.
// NOTE(review): the remaining keys are presumably used the same way for their
// respective lists — confirm against the full toDataGroup.
const (
	// CpuEventLabel is the key for CPU events.
	CpuEventLabel = "cpuEvents"
	// JavaFutexEventLabel is the key for Java futex events.
	JavaFutexEventLabel = "javaFutexEvents"
	// TransactionIdEventLabel is the key for transaction IDs.
	TransactionIdEventLabel = "transactionIds"
	// SpanLabel is the key for APM spans.
	SpanLabel = "spans"
	// InnerCallLabel is the key for inner calls.
	InnerCallLabel = "innerCalls"
)

type TimedEvent interface {
Expand Down Expand Up @@ -49,6 +51,7 @@ type Segment struct {
JavaFutexEvents []TimedEvent `json:"javaFutexEvents"`
TransactionIds []TimedEvent `json:"transactionIds"`
Spans []TimedEvent `json:"spans"`
InnerCalls []TimedEvent `json:"innerCalls"`
IsSend int
IndexTimestamp string `json:"indexTimestamp"`
}
Expand All @@ -61,6 +64,7 @@ func newSegment(startTime uint64, endTime uint64) *Segment {
JavaFutexEvents: make([]TimedEvent, 0),
TransactionIds: make([]TimedEvent, 0),
Spans: make([]TimedEvent, 0),
InnerCalls: make([]TimedEvent, 0),
IsSend: 0,
IndexTimestamp: "",
}
Expand All @@ -76,6 +80,8 @@ func (s *Segment) putTimedEvent(event TimedEvent) {
s.TransactionIds = append(s.TransactionIds, event)
case TimedApmSpanEventKind:
s.Spans = append(s.Spans, event)
case TimedInnerCallEventKind:
s.InnerCalls = append(s.InnerCalls, event)
}
}

Expand Down Expand Up @@ -103,6 +109,10 @@ func (s *Segment) toDataGroup(parent *TimeSegments) *model.DataGroup {
if err == nil {
labels.AddStringValue(SpanLabel, string(spanEventString))
}
innerCallString, err := json.Marshal(s.InnerCalls)
if err == nil {
labels.AddStringValue(InnerCallLabel, string(innerCallString))
}
return model.NewDataGroup(constnames.CameraEventGroupName, labels, s.StartTime)
}

Expand Down Expand Up @@ -183,4 +193,22 @@ func (j *ApmSpanEvent) EndTimestamp() uint64 {

// Kind reports TimedApmSpanEventKind, the kind that putTimedEvent uses to
// append an event to a Segment's Spans list.
func (j *ApmSpanEvent) Kind() TimedEventKind {
return TimedApmSpanEventKind
}
}

// InnerCall is a timed event that pairs a trace with the time range it
// covers. It provides StartTimestamp, EndTimestamp and Kind so that it can be
// stored in a Segment's InnerCalls list.
type InnerCall struct {
	// StartTime is the beginning of the call, as returned by StartTimestamp.
	StartTime uint64 `json:"startTime"`
	// EndTime is the end of the call, as returned by EndTimestamp.
	EndTime uint64 `json:"endTime"`
	// Trace is the data group that describes the call.
	Trace *model.DataGroup `json:"trace"`
}

// StartTimestamp returns the StartTime of the call.
func (ic *InnerCall) StartTimestamp() uint64 {
	return ic.StartTime
}

// EndTimestamp returns the EndTime of the call.
func (ic *InnerCall) EndTimestamp() uint64 {
	return ic.EndTime
}

// Kind reports TimedInnerCallEventKind for every InnerCall.
func (ic *InnerCall) Kind() TimedEventKind {
	return TimedInnerCallEventKind
}
14 changes: 13 additions & 1 deletion collector/pkg/component/analyzer/cpuanalyzer/send_trigger.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ func ReceiveDataGroupAsSignal(data *model.DataGroup) {
})
return
}
if data.Labels.GetBoolValue(constlabels.IsSlow) {
if data.Labels.GetBoolValue(constlabels.IsSlow) || data.Labels.GetBoolValue(constlabels.IsError) {
duration, ok := data.GetMetric(constvalues.RequestTotalTime)
if !ok {
return
Expand All @@ -54,6 +54,18 @@ type SendTriggerEvent struct {
func (ca *CpuAnalyzer) ReceiveSendSignal() {
// Break the for loop if the channel is closed
for sendContent := range sendChannel {
// CpuAnalyzer consumes all traces from the client-side to add them to TimeSegments
// These traces are not considered as signals, so we skip them here. Note they won't
// be consumed by the following consumers.
if !sendContent.OriginalData.Labels.GetBoolValue(constlabels.IsServer) {
ca.ConsumeTraces(sendContent)
continue
}
// Only send the slow traces as the signals
if !sendContent.OriginalData.Labels.GetBoolValue(constlabels.IsSlow) {
continue
}
// Store the traces first
for _, nexConsumer := range ca.nextConsumers {
_ = nexConsumer.Consume(sendContent.OriginalData)
}
Expand Down
2 changes: 1 addition & 1 deletion deploy/agent/kindling-collector-config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ analyzers:
networkanalyzer:
connect_timeout: 100
# How many seconds to wait until we consider a request as complete.
fd_reuse_timeout: 15
fd_reuse_timeout: 2
# How many seconds to wait until we consider a request as no response.
no_response_threshold: 120
# How many milliseconds to wait until we consider a request-response as slow.
Expand Down

0 comments on commit 3c68929

Please sign in to comment.