Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Export expvar metrics of badger to the metricsFactory #1704

Merged
merged 10 commits into from
Aug 7, 2019
67 changes: 65 additions & 2 deletions plugin/storage/badger/factory.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,15 @@
package badger

import (
"expvar"
"flag"
"io/ioutil"
"os"
"strings"
"time"

"github.com/dgraph-io/badger"
"github.com/dgraph-io/badger/options"
"github.com/spf13/viper"
"github.com/uber/jaeger-lib/metrics"
"go.uber.org/zap"
Expand Down Expand Up @@ -58,6 +61,9 @@ type Factory struct {
LastMaintenanceRun metrics.Gauge
// LastValueLogCleaned stores the timestamp (UnixNano) of the previous ValueLogGC run
LastValueLogCleaned metrics.Gauge

// Expose badger's internal expvar metrics, which are all gauge's at this point
badgerMetrics map[string]metrics.Gauge
}
}

Expand All @@ -84,6 +90,7 @@ func (f *Factory) Initialize(metricsFactory metrics.Factory, logger *zap.Logger)
f.logger = logger

opts := badger.DefaultOptions
opts.TableLoadingMode = options.MemoryMap

if f.Options.primary.Ephemeral {
opts.SyncWrites = false
Expand Down Expand Up @@ -118,7 +125,10 @@ func (f *Factory) Initialize(metricsFactory metrics.Factory, logger *zap.Logger)
f.metrics.LastMaintenanceRun = metricsFactory.Gauge(metrics.Options{Name: lastMaintenanceRunName})
f.metrics.LastValueLogCleaned = metricsFactory.Gauge(metrics.Options{Name: lastValueLogCleanedName})

f.registerBadgerExpvarMetrics(metricsFactory)

go f.maintenance()
go f.metricsCopier()

logger.Info("Badger storage configuration", zap.Any("configuration", opts))

Expand Down Expand Up @@ -150,8 +160,7 @@ func (f *Factory) CreateDependencyReader() (dependencystore.Reader, error) {

// Close Implements io.Closer and closes the underlying storage
func (f *Factory) Close() error {
f.maintenanceDone <- true

close(f.maintenanceDone)
err := f.store.Close()

// Remove tmp files if this was ephemeral storage
Expand All @@ -175,6 +184,7 @@ func (f *Factory) maintenance() {
return
case t := <-maintenanceTicker.C:
var err error

// After there's nothing to clean, the err is raised
for err == nil {
err = f.store.RunValueLogGC(0.5) // 0.5 is selected to rewrite a file if half of it can be discarded
Expand All @@ -190,3 +200,56 @@ func (f *Factory) maintenance() {
}
}
}

func (f *Factory) metricsCopier() {
metricsTicker := time.NewTicker(f.Options.primary.MetricsUpdateInterval)
defer metricsTicker.Stop()
for {
select {
case <-f.maintenanceDone:
return
case <-metricsTicker.C:
expvar.Do(func(kv expvar.KeyValue) {
if strings.HasPrefix(kv.Key, "badger") {
if intVal, ok := kv.Value.(*expvar.Int); ok {
if g, found := f.metrics.badgerMetrics[kv.Key]; found {
g.Update(intVal.Value())
}
} else if mapVal, ok := kv.Value.(*expvar.Map); ok {
mapVal.Do(func(innerKv expvar.KeyValue) {
// The metrics we're interested in have only a single inner key (dynamic name)
// and we're only interested in its value
if intVal, ok := innerKv.Value.(*expvar.Int); ok {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is catching the metrics declared like this?

NumLSMGets = expvar.NewMap("badger_lsm_level_gets_total")

Why does Badger use a map in this case? It looks like it could have more than one entry in the map, but we're collapsing all under the map's name.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

btw, if multiple values are possible, we could put that value into a tag on the metric.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Currently it is storing in the map the directory of the LSM or Value. We have that information in the Factory properties also, but it might indeed make sense to export that information somewhere. Unless there's a place to view properties already in Jaeger?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With the tags it would look like this:

# HELP jaeger_badger_lsm_size_bytes badger_lsm_size_bytes
# TYPE jaeger_badger_lsm_size_bytes gauge
jaeger_badger_lsm_size_bytes{directory="/tmp/badger043433444"} 0

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

as long as there's a guarantee that map has only one entry, it makes more sense to go without the dir name in the label (since it might be different after restart)

if g, found := f.metrics.badgerMetrics[kv.Key]; found {
g.Update(intVal.Value())
}
}
})
}
}
})
}
}
}

func (f *Factory) registerBadgerExpvarMetrics(metricsFactory metrics.Factory) {
f.metrics.badgerMetrics = make(map[string]metrics.Gauge)

expvar.Do(func(kv expvar.KeyValue) {
if strings.HasPrefix(kv.Key, "badger") {
if _, ok := kv.Value.(*expvar.Int); ok {
g := metricsFactory.Gauge(metrics.Options{Name: kv.Key})
f.metrics.badgerMetrics[kv.Key] = g
} else if mapVal, ok := kv.Value.(*expvar.Map); ok {
mapVal.Do(func(innerKv expvar.KeyValue) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

wouldn't this map always be empty at this point?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, since badger.Open() is called before the metrics are fetched. And badger.Open() registers the expvars (it calculates the initial sizes during Open()).

// The metrics we're interested in have only a single inner key (dynamic name)
// and we're only interested in its value
if _, ok = innerKv.Value.(*expvar.Int); ok {
g := metricsFactory.Gauge(metrics.Options{Name: kv.Key})
f.metrics.badgerMetrics[kv.Key] = g
}
})
}
}
})
}
45 changes: 44 additions & 1 deletion plugin/storage/badger/factory_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,49 @@ func TestMaintenanceCodecov(t *testing.T) {
}
}

_ = f.store.Close()
err := f.store.Close()
assert.NoError(t, err)
waiter() // This should trigger the logging of error
}

func TestBadgerMetrics(t *testing.T) {
// The expvar is leaking keyparams between tests. We need to clean up a bit..
eMap := expvar.Get("badger_lsm_size_bytes").(*expvar.Map)
eMap.Init()

f := NewFactory()
v, command := config.Viperize(f.AddFlags)
command.ParseFlags([]string{
"--badger.metrics-update-interval=10ms",
})
f.InitFromViper(v)
mFactory := metricstest.NewFactory(0)
f.Initialize(mFactory, zap.NewNop())
assert.NotNil(t, f.metrics.badgerMetrics)
_, found := f.metrics.badgerMetrics["badger_memtable_gets_total"]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

would be good to see a test for the expvar.Map metrics as well.

assert.True(t, found)

waiter := func(previousValue int64) int64 {
sleeps := 0
_, gs := mFactory.Snapshot()
for gs["badger_memtable_gets_total"] == previousValue && sleeps < 8 {
// Wait for the scheduler
time.Sleep(time.Duration(50) * time.Millisecond)
sleeps++
_, gs = mFactory.Snapshot()
}
assert.True(t, gs["badger_memtable_gets_total"] > previousValue)
return gs["badger_memtable_gets_total"]
}

vlogSize := waiter(0)
_, gs := mFactory.Snapshot()
assert.True(t, vlogSize > 0)
assert.True(t, gs["badger_memtable_gets_total"] > 0) // IntVal metric

_, found = gs["badger_lsm_size_bytes"] // Map metric
assert.True(t, found)

err := f.Close()
assert.NoError(t, err)
}
44 changes: 27 additions & 17 deletions plugin/storage/badger/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,18 +31,20 @@ type Options struct {

// NamespaceConfig is badger's internal configuration data
type NamespaceConfig struct {
namespace string
SpanStoreTTL time.Duration
ValueDirectory string
KeyDirectory string
Ephemeral bool // Setting this to true will ignore ValueDirectory and KeyDirectory
SyncWrites bool
MaintenanceInterval time.Duration
namespace string
SpanStoreTTL time.Duration
ValueDirectory string
KeyDirectory string
Ephemeral bool // Setting this to true will ignore ValueDirectory and KeyDirectory
SyncWrites bool
MaintenanceInterval time.Duration
MetricsUpdateInterval time.Duration
}

const (
defaultMaintenanceInterval time.Duration = 5 * time.Minute
defaultTTL time.Duration = time.Hour * 72
defaultMaintenanceInterval time.Duration = 5 * time.Minute
defaultMetricsUpdateInterval time.Duration = 10 * time.Second
defaultTTL time.Duration = time.Hour * 72
)

const (
Expand All @@ -52,6 +54,7 @@ const (
suffixSpanstoreTTL = ".span-store-ttl"
suffixSyncWrite = ".consistency"
suffixMaintenanceInterval = ".maintenance-interval"
suffixMetricsInterval = ".metrics-update-interval" // Intended only for testing purposes
defaultDataDir = string(os.PathSeparator) + "data"
defaultValueDir = defaultDataDir + string(os.PathSeparator) + "values"
defaultKeysDir = defaultDataDir + string(os.PathSeparator) + "keys"
Expand All @@ -64,13 +67,14 @@ func NewOptions(primaryNamespace string, otherNamespaces ...string) *Options {

options := &Options{
primary: &NamespaceConfig{
namespace: primaryNamespace,
SpanStoreTTL: defaultTTL,
SyncWrites: false, // Performance over durability
Ephemeral: true, // Default is ephemeral storage
ValueDirectory: defaultBadgerDataDir + defaultValueDir,
KeyDirectory: defaultBadgerDataDir + defaultKeysDir,
MaintenanceInterval: defaultMaintenanceInterval,
namespace: primaryNamespace,
SpanStoreTTL: defaultTTL,
SyncWrites: false, // Performance over durability
Ephemeral: true, // Default is ephemeral storage
ValueDirectory: defaultBadgerDataDir + defaultValueDir,
KeyDirectory: defaultBadgerDataDir + defaultKeysDir,
MaintenanceInterval: defaultMaintenanceInterval,
MetricsUpdateInterval: defaultMetricsUpdateInterval,
},
}

Expand Down Expand Up @@ -112,13 +116,18 @@ func addFlags(flagSet *flag.FlagSet, nsConfig *NamespaceConfig) {
flagSet.Bool(
nsConfig.namespace+suffixSyncWrite,
nsConfig.SyncWrites,
"If all writes should be synced immediately. This will greatly reduce write performance.",
"If all writes should be synced immediately. This can impact write performance.",
)
flagSet.Duration(
nsConfig.namespace+suffixMaintenanceInterval,
nsConfig.MaintenanceInterval,
"How often the maintenance thread for values is ran. Format is time.Duration (https://golang.org/pkg/time/#Duration)",
)
flagSet.Duration(
nsConfig.namespace+suffixMetricsInterval,
nsConfig.MetricsUpdateInterval,
"How often the badger metrics are collected by Jaeger. Format is time.Duration (https://golang.org/pkg/time/#Duration)",
)
}

// InitFromViper initializes Options with properties from viper
Expand All @@ -133,6 +142,7 @@ func initFromViper(cfg *NamespaceConfig, v *viper.Viper) {
cfg.SyncWrites = v.GetBool(cfg.namespace + suffixSyncWrite)
cfg.SpanStoreTTL = v.GetDuration(cfg.namespace + suffixSpanstoreTTL)
cfg.MaintenanceInterval = v.GetDuration(cfg.namespace + suffixMaintenanceInterval)
cfg.MetricsUpdateInterval = v.GetDuration(cfg.namespace + suffixMetricsInterval)
}

// GetPrimary returns the primary namespace configuration
Expand Down