Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Change metrics naming scheme #776

Merged
merged 1 commit into from
May 24, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion cmd/agent/app/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,13 @@ func (b *Builder) getMetricsFactory() (metrics.Factory, error) {
if b.metricsFactory != nil {
return b.metricsFactory, nil
}
return b.Metrics.CreateMetricsFactory("jaeger_agent")

baseFactory, err := b.Metrics.CreateMetricsFactory("jaeger")
if err != nil {
return nil, err
}

return baseFactory.Namespace("agent", nil), nil
}

// CreateAgent creates the Agent
Expand Down
21 changes: 9 additions & 12 deletions cmd/collector/app/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,23 +60,21 @@ type metricsBySvc struct {

// CountsBySpanType measures received, rejected, and receivedByService metrics for a format type
type CountsBySpanType struct {
// Received is the actual number of spans received from upstream
Received metrics.Counter
// Rejected is the number of spans we rejected (usually due to blacklisting)
Rejected metrics.Counter
// ReceivedBySvc maintain by-service metrics for a format type
ReceivedBySvc metricsBySvc
// RejectedBySvc is the number of spans we rejected (usually due to blacklisting) by-service
RejectedBySvc metricsBySvc
}

// NewSpanProcessorMetrics returns a SpanProcessorMetrics
func NewSpanProcessorMetrics(serviceMetrics metrics.Factory, hostMetrics metrics.Factory, otherFormatTypes []string) *SpanProcessorMetrics {
spanCounts := map[string]CountsBySpanType{
ZipkinFormatType: newCountsBySpanType(serviceMetrics.Namespace(ZipkinFormatType, nil)),
JaegerFormatType: newCountsBySpanType(serviceMetrics.Namespace(JaegerFormatType, nil)),
UnknownFormatType: newCountsBySpanType(serviceMetrics.Namespace(UnknownFormatType, nil)),
ZipkinFormatType: newCountsBySpanType(serviceMetrics.Namespace("", map[string]string{"format": ZipkinFormatType})),
JaegerFormatType: newCountsBySpanType(serviceMetrics.Namespace("", map[string]string{"format": JaegerFormatType})),
UnknownFormatType: newCountsBySpanType(serviceMetrics.Namespace("", map[string]string{"format": UnknownFormatType})),
}
for _, otherFormatType := range otherFormatTypes {
spanCounts[otherFormatType] = newCountsBySpanType(serviceMetrics.Namespace(otherFormatType, nil))
spanCounts[otherFormatType] = newCountsBySpanType(serviceMetrics.Namespace("", map[string]string{"format": otherFormatType}))
}
m := &SpanProcessorMetrics{
SaveLatency: hostMetrics.Timer("save-latency", nil),
Expand Down Expand Up @@ -115,9 +113,8 @@ func newMetricsBySvc(factory metrics.Factory, category string) metricsBySvc {

func newCountsBySpanType(factory metrics.Factory) CountsBySpanType {
return CountsBySpanType{
Received: factory.Counter("spans.recd", nil),
Rejected: factory.Counter("spans.rejected", nil),
ReceivedBySvc: newMetricsBySvc(factory, "by-svc"),
RejectedBySvc: newMetricsBySvc(factory, "rejected"),
ReceivedBySvc: newMetricsBySvc(factory, "received"),
}
}

Expand Down Expand Up @@ -179,7 +176,7 @@ func (m *countsBySvc) countByServiceName(serviceName string) {
if c, ok := m.counts[serviceName]; ok {
counter = c
} else if len(m.counts) < maxServiceNames {
c := m.factory.Counter(serviceName, nil)
c := m.factory.Counter("", map[string]string{"service": serviceName})
m.counts[serviceName] = c
counter = c
}
Expand Down
6 changes: 3 additions & 3 deletions cmd/collector/app/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,8 @@ func TestProcessorMetrics(t *testing.T) {
jFormat.ReceivedBySvc.ReportServiceNameForSpan(&mSpan)
counters, gauges := baseMetrics.LocalBackend.Snapshot()

assert.EqualValues(t, 2, counters["service.jaeger.spans.by-svc.fry"])
assert.EqualValues(t, 1, counters["service.jaeger.traces.by-svc.fry"])
assert.EqualValues(t, 1, counters["service.jaeger.debug-spans.by-svc.fry"])
assert.EqualValues(t, 2, counters["service.spans.received|format=jaeger|service=fry"])
assert.EqualValues(t, 1, counters["service.traces.received|format=jaeger|service=fry"])
assert.EqualValues(t, 1, counters["service.debug-spans.received|format=jaeger|service=fry"])
assert.Empty(t, gauges)
}
3 changes: 1 addition & 2 deletions cmd/collector/app/span_processor.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,6 @@ func (sp *spanProcessor) saveSpan(span *model.Span) {

func (sp *spanProcessor) ProcessSpans(mSpans []*model.Span, spanFormat string) ([]bool, error) {
sp.preProcessSpans(mSpans)
sp.metrics.GetCountsForFormat(spanFormat).Received.Inc(int64(len(mSpans)))
sp.metrics.BatchSize.Update(int64(len(mSpans)))
retMe := make([]bool, len(mSpans))
for i, mSpan := range mSpans {
Expand All @@ -131,7 +130,7 @@ func (sp *spanProcessor) enqueueSpan(span *model.Span, originalFormat string) bo
spanCounts.ReceivedBySvc.ReportServiceNameForSpan(span)

if !sp.filterSpan(span) {
spanCounts.Rejected.Inc(int64(1))
spanCounts.RejectedBySvc.ReportServiceNameForSpan(span)
return true // as in "not dropped", because it's actively rejected
}
item := &queueItem{
Expand Down
17 changes: 9 additions & 8 deletions cmd/collector/app/span_processor_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,12 +85,13 @@ func TestBySvcMetrics(t *testing.T) {
)
ctx := context.Background()
tctx := thrift.Wrap(ctx)
var metricPrefix string
var metricPrefix, format string
if test.format == ZipkinFormatType {
span := makeZipkinSpan(test.serviceName, test.rootSpan, test.debug)
zHandler := NewZipkinSpanHandler(logger, processor, zipkinSanitizer.NewParentIDSanitizer())
zHandler.SubmitZipkinBatch(tctx, []*zc.Span{span, span})
metricPrefix = "service.zipkin"
metricPrefix = "service"
format = "zipkin"
} else if test.format == JaegerFormatType {
span, process := makeJaegerSpan(test.serviceName, test.rootSpan, test.debug)
jHandler := NewJaegerSpanHandler(logger, processor)
Expand All @@ -103,22 +104,22 @@ func TestBySvcMetrics(t *testing.T) {
Process: process,
},
})
metricPrefix = "service.jaeger"
metricPrefix = "service"
format = "jaeger"
} else {
panic("Unknown format")
}
expected := []metricsTest.ExpectedMetric{
{Name: metricPrefix + ".spans.recd", Value: 2},
{Name: metricPrefix + ".spans.by-svc." + test.serviceName, Value: 2},
{Name: metricPrefix + ".spans.received|format=" + format + "|service=" + test.serviceName, Value: 2},
}
if test.debug {
expected = append(expected, metricsTest.ExpectedMetric{
Name: metricPrefix + ".debug-spans.by-svc." + test.serviceName, Value: 2,
Name: metricPrefix + ".debug-spans.received|format=" + format + "|service=" + test.serviceName, Value: 2,
})
}
if test.rootSpan {
expected = append(expected, metricsTest.ExpectedMetric{
Name: metricPrefix + ".traces.by-svc." + test.serviceName, Value: 2,
Name: metricPrefix + ".traces.received|format=" + format + "|service=" + test.serviceName, Value: 2,
})
}
if test.serviceName != blackListedService || test.debug {
Expand All @@ -133,7 +134,7 @@ func TestBySvcMetrics(t *testing.T) {
})
} else {
expected = append(expected, metricsTest.ExpectedMetric{
Name: metricPrefix + ".spans.rejected", Value: 2,
Name: metricPrefix + ".spans.rejected|format=" + format + "|service=" + test.serviceName, Value: 2,
})
}
metricsTest.AssertCounterMetrics(t, mb, expected...)
Expand Down
5 changes: 3 additions & 2 deletions cmd/collector/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,20 +90,21 @@ func main() {
builderOpts := new(builder.CollectorOptions).InitFromViper(v)

mBldr := new(pMetrics.Builder).InitFromViper(v)
metricsFactory, err := mBldr.CreateMetricsFactory("jaeger-collector")
baseFactory, err := mBldr.CreateMetricsFactory("jaeger")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why are we removing the -collector suffix here? Isn't it better to be able to tell apart which binary emitted the metric? The storage metrics won't be grouped by collector name anymore.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why are we removing the -collector suffix here?

It's just so that the jaeger prefix is used for the storage metrics without the component's name. collector is added later as a namespace to the collector-specific metrics.

Isn't it better to be able to tell apart which binary emitted the metric?

I think the agreement was to have per-component metrics, instead of per-binary, so that metrics generated by the collector via standalone-linux would be consistent with the metrics generated by collector via collector-linux:

(before)
jaeger_standalone_jaeger_collector_in_queue_latency_bucket{host="caju",le="0.005"} 0

vs

(after, across all binaries)
jaeger_collector_in_queue_latency_bucket{host="caju",le="0.005"} 0

The storage metrics won't be grouped by collector name anymore.

You mean, jaeger_cassandra_inserts{table="service_names"} would be used by collector, query and standalone? This would potentially create three different metric names for the same thing: jaeger_standalone_cassandra_inserts, jaeger_collector_cassandra_inserts, jaeger_query_cassandra_inserts. I personally prefer to have a single metric, but I do see the point of having one per component (not binary). Just not sure how we'd tell the components apart when using the standalone.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If knowing the source binary is important, then it could be added as a label instead?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am really surprised Prometheus community has no guidelines around this. Can't imagine any minimally complex system not running into the same exact issues with multiple namespaces and hierarchies.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am really surprised Prometheus community has no guidelines around this. Can't imagine any minimally complex system not running into the same exact issues with multiple namespaces and hierarchies.

The Prometheus scraper can add arbitrary labels, so, this kind of information would arguably be added there, along with some metadata not known to the "microservice", like the Geo, DC, Rack, and so on. Our "binary" name could also be added at scrape time, for cases where it has more value than a consistent set of metrics across all binaries.

https://prometheus.io/docs/prometheus/latest/configuration/configuration/#%3Cstatic_config%3E

How about we merge it with the current state, publish a blog post about this change + how to add arbitrary labels, and ask for community feedback? If we hear that people want this as part of the metrics we emit, we'll add it as a label.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we should discuss on Friday. We are accumulating a set of breaking changes for 1.5: metrics, docker commands, and Cassandra schema.

if err != nil {
logger.Fatal("Cannot create metrics factory.", zap.Error(err))
}

storageFactory.InitFromViper(v)
if err := storageFactory.Initialize(metricsFactory, logger); err != nil {
if err := storageFactory.Initialize(baseFactory, logger); err != nil {
logger.Fatal("Failed to init storage factory", zap.Error(err))
}
spanWriter, err := storageFactory.CreateSpanWriter()
if err != nil {
logger.Fatal("Failed to create span writer", zap.Error(err))
}

metricsFactory := baseFactory.Namespace("collector", nil)
handlerBuilder, err := builder.NewSpanHandlerBuilder(
builderOpts,
spanWriter,
Expand Down
6 changes: 3 additions & 3 deletions cmd/query/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ func main() {
queryOpts := new(app.QueryOptions).InitFromViper(v)

mBldr := new(pMetrics.Builder).InitFromViper(v)
metricsFactory, err := mBldr.CreateMetricsFactory("jaeger-query")
baseFactory, err := mBldr.CreateMetricsFactory("jaeger")
if err != nil {
logger.Fatal("Cannot create metrics factory.", zap.Error(err))
}
Expand All @@ -86,14 +86,14 @@ func main() {
Param: 1.0,
},
RPCMetrics: true,
}.New("jaeger-query", jaegerClientConfig.Metrics(metricsFactory))
}.New("jaeger-query", jaegerClientConfig.Metrics(baseFactory.Namespace("client", nil)))
if err != nil {
logger.Fatal("Failed to initialize tracer", zap.Error(err))
}
defer closer.Close()

storageFactory.InitFromViper(v)
if err := storageFactory.Initialize(metricsFactory, logger); err != nil {
if err := storageFactory.Initialize(baseFactory, logger); err != nil {
logger.Fatal("Failed to init storage factory", zap.Error(err))
}
spanReader, err := storageFactory.CreateSpanReader()
Expand Down
8 changes: 4 additions & 4 deletions cmd/standalone/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ func main() {
}

mBldr := new(pMetrics.Builder).InitFromViper(v)
metricsFactory, err := mBldr.CreateMetricsFactory("jaeger-standalone")
metricsFactory, err := mBldr.CreateMetricsFactory("jaeger")
if err != nil {
return errors.Wrap(err, "Cannot create metrics factory")
}
Expand Down Expand Up @@ -166,7 +166,7 @@ func startAgent(
logger *zap.Logger,
baseFactory metrics.Factory,
) {
metricsFactory := baseFactory.Namespace("jaeger-agent", nil)
metricsFactory := baseFactory.Namespace("agent", nil)

if len(b.CollectorHostPorts) == 0 {
b.CollectorHostPorts = append(b.CollectorHostPorts, fmt.Sprintf("127.0.0.1:%d", cOpts.CollectorPort))
Expand All @@ -190,7 +190,7 @@ func startCollector(
samplingHandler sampling.Handler,
hc *healthcheck.HealthCheck,
) {
metricsFactory := baseFactory.Namespace("jaeger-collector", nil)
metricsFactory := baseFactory.Namespace("collector", nil)

spanBuilder, err := collector.NewSpanHandlerBuilder(
cOpts,
Expand Down Expand Up @@ -269,7 +269,7 @@ func startQuery(
Param: 1.0,
},
RPCMetrics: true,
}.New("jaeger-query", jaegerClientConfig.Metrics(baseFactory))
}.New("jaeger-query", jaegerClientConfig.Metrics(baseFactory.Namespace("client", nil)))
if err != nil {
logger.Fatal("Failed to initialize tracer", zap.Error(err))
}
Expand Down
2 changes: 1 addition & 1 deletion glide.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion glide.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ import:
subpackages:
- transport
- package: github.com/uber/jaeger-lib
version: ^1.4.0
version: ^1.5.0
- package: github.com/uber/tchannel-go
version: v1.1.0
subpackages:
Expand Down
2 changes: 1 addition & 1 deletion pkg/cassandra/metrics/table.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ type Table struct {
// NewTable takes a metrics scope and creates a table metrics struct
func NewTable(factory metrics.Factory, tableName string) *Table {
t := storageMetrics.WriteMetrics{}
metrics.Init(&t, factory.Namespace(tableName, nil), nil)
metrics.Init(&t, factory.Namespace("", map[string]string{"table": tableName}), nil)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not sure about this one, this allows users to aggregate metrics across different c* tables. For attempts and inserts (total c* throughput) it makes sense but allowing users to aggregate latencies across different tables doesn't seem right (workload will be different per table). I guess giving the ability to aggregate doesn't imply people will aggregate and technically people using non-tag based metrics systems could already aggregate in the previous rendition. TLDR; I've convinced myself this is fine

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TLDR; I've convinced myself this is fine

Glad to hear :D

return &Table{t}
}

Expand Down
44 changes: 22 additions & 22 deletions pkg/cassandra/metrics/table_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,31 +34,31 @@ func TestTableEmit(t *testing.T) {
{
err: nil,
counts: map[string]int64{
"a_table.attempts": 1,
"a_table.inserts": 1,
"attempts|table=a_table": 1,
"inserts|table=a_table": 1,
},
gauges: map[string]int64{
"a_table.latency-ok.P999": 51,
"a_table.latency-ok.P50": 51,
"a_table.latency-ok.P75": 51,
"a_table.latency-ok.P90": 51,
"a_table.latency-ok.P95": 51,
"a_table.latency-ok.P99": 51,
"latency-ok|table=a_table.P999": 51,
"latency-ok|table=a_table.P50": 51,
"latency-ok|table=a_table.P75": 51,
"latency-ok|table=a_table.P90": 51,
"latency-ok|table=a_table.P95": 51,
"latency-ok|table=a_table.P99": 51,
},
},
{
err: errors.New("some error"),
counts: map[string]int64{
"a_table.attempts": 1,
"a_table.errors": 1,
"attempts|table=a_table": 1,
"errors|table=a_table": 1,
},
gauges: map[string]int64{
"a_table.latency-err.P999": 51,
"a_table.latency-err.P50": 51,
"a_table.latency-err.P75": 51,
"a_table.latency-err.P90": 51,
"a_table.latency-err.P95": 51,
"a_table.latency-err.P99": 51,
"latency-err|table=a_table.P999": 51,
"latency-err|table=a_table.P50": 51,
"latency-err|table=a_table.P75": 51,
"latency-err|table=a_table.P90": 51,
"latency-err|table=a_table.P95": 51,
"latency-err|table=a_table.P99": 51,
},
},
}
Expand All @@ -82,8 +82,8 @@ func TestTableExec(t *testing.T) {
{
q: insertQuery{},
counts: map[string]int64{
"a_table.attempts": 1,
"a_table.inserts": 1,
"attempts|table=a_table": 1,
"inserts|table=a_table": 1,
},
},
{
Expand All @@ -92,8 +92,8 @@ func TestTableExec(t *testing.T) {
err: errors.New("failed"),
},
counts: map[string]int64{
"a_table.attempts": 1,
"a_table.errors": 1,
"attempts|table=a_table": 1,
"errors|table=a_table": 1,
},
},
{
Expand All @@ -103,8 +103,8 @@ func TestTableExec(t *testing.T) {
},
log: true,
counts: map[string]int64{
"a_table.attempts": 1,
"a_table.errors": 1,
"attempts|table=a_table": 1,
"errors|table=a_table": 1,
},
},
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/es/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ func (c *Configuration) NewClient(logger *zap.Logger, metricsFactory metrics.Fac
return nil, err
}

sm := storageMetrics.NewWriteMetrics(metricsFactory, "BulkIndex")
sm := storageMetrics.NewWriteMetrics(metricsFactory, "bulk_index")
m := sync.Map{}

service, err := rawClient.BulkProcessor().
Expand Down
2 changes: 1 addition & 1 deletion pkg/metrics/builder_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ func TestBuilder(t *testing.T) {
families, err := prometheus.DefaultGatherer.Gather()
require.NoError(t, err)
for _, mf := range families {
if mf.GetName() == "foo:counter" {
if mf.GetName() == "foo_counter" {
return
}
}
Expand Down
2 changes: 1 addition & 1 deletion plugin/storage/cassandra/dependencystore/storage.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ func NewDependencyStore(
return &DependencyStore{
session: session,
dependencyDataFrequency: dependencyDataFrequency,
dependenciesTableMetrics: casMetrics.NewTable(metricsFactory, "Dependencies"),
dependenciesTableMetrics: casMetrics.NewTable(metricsFactory, "dependencies"),
logger: logger,
}
}
Expand Down
4 changes: 2 additions & 2 deletions plugin/storage/cassandra/samplingstore/storage.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,8 @@ func New(session cassandra.Session, factory metrics.Factory, logger *zap.Logger)
return &SamplingStore{
session: session,
metrics: samplingStoreMetrics{
operationThroughput: casMetrics.NewTable(factory, "OperationThroughput"),
probabilities: casMetrics.NewTable(factory, "Probabilities"),
operationThroughput: casMetrics.NewTable(factory, "operation_throughput"),
probabilities: casMetrics.NewTable(factory, "probabilities"),
},
logger: logger,
}
Expand Down
2 changes: 1 addition & 1 deletion plugin/storage/cassandra/spanstore/operation_names.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ func NewOperationNamesStorage(
session: session,
InsertStmt: insertOperationName,
QueryStmt: queryOperationNames,
metrics: casMetrics.NewTable(metricsFactory, "OperationNames"),
metrics: casMetrics.NewTable(metricsFactory, "operation_names"),
writeCacheTTL: writeCacheTTL,
logger: logger,
operationNames: cache.NewLRUWithOptions(
Expand Down
Loading