From 79e38dc71966aa34ad504310c7b36dc8ee990ba8 Mon Sep 17 00:00:00 2001 From: Bin Tang Date: Thu, 24 Nov 2022 14:04:08 +0800 Subject: [PATCH] metrics: collect the metrics of nydusd events Collect the metrics of nydus daemon events, including INIT, RUNNING, DIED and DESTROYED. Signed-off-by: Bin Tang --- pkg/daemon/daemon.go | 5 +++ pkg/daemon/types/types.go | 10 +++--- pkg/manager/manager.go | 5 +++ pkg/manager/monitor.go | 5 +++ pkg/metrics/exporter/daemon/metrics.go | 27 ++++++++++++++++ pkg/metrics/exporter/export.go | 35 ++++++++++---------- pkg/metrics/exporter/{ => fs}/metrics.go | 15 ++++----- pkg/metrics/exporter/{ => fs}/types.go | 12 +++---- pkg/metrics/exporter/record.go | 41 ++++++++++++++++++++++++ pkg/metrics/exporter/registry.go | 13 +++++--- pkg/metrics/serve.go | 38 +++------------------- pkg/metrics/ttl/gauge.go | 1 + pkg/system/system.go | 6 +++- snapshot/snapshot.go | 37 ++++++++++----------- 14 files changed, 156 insertions(+), 94 deletions(-) create mode 100644 pkg/metrics/exporter/daemon/metrics.go rename pkg/metrics/exporter/{ => fs}/metrics.go (93%) rename pkg/metrics/exporter/{ => fs}/types.go (80%) create mode 100644 pkg/metrics/exporter/record.go diff --git a/pkg/daemon/daemon.go b/pkg/daemon/daemon.go index 0345607051..4513eb19d2 100644 --- a/pkg/daemon/daemon.go +++ b/pkg/daemon/daemon.go @@ -20,6 +20,7 @@ import ( "github.com/containerd/nydus-snapshotter/config/daemonconfig" "github.com/containerd/nydus-snapshotter/pkg/daemon/types" "github.com/containerd/nydus-snapshotter/pkg/errdefs" + "github.com/containerd/nydus-snapshotter/pkg/metrics/exporter" "github.com/containerd/nydus-snapshotter/pkg/supervisor" "github.com/containerd/nydus-snapshotter/pkg/utils/erofs" "github.com/containerd/nydus-snapshotter/pkg/utils/mount" @@ -188,6 +189,10 @@ func (d *Daemon) WaitUntilState(expected types.DaemonState) error { d.ID(), expected, state) } + if exportErr := exporter.RecordDaemonEvent(d.ID(), string(expected)); exportErr != nil { + log.L.Warnf("export 
nydusd event metric failed, daemon ID: %s, event: %s, error: %v", d.States.ID, string(expected), exportErr) + } + return nil }, retry.Attempts(20), // totally wait for 2 seconds, should be enough diff --git a/pkg/daemon/types/types.go b/pkg/daemon/types/types.go index 42ab591d4e..9df996c4e0 100644 --- a/pkg/daemon/types/types.go +++ b/pkg/daemon/types/types.go @@ -24,10 +24,12 @@ type DaemonInfo struct { } const ( - DaemonStateUnknown DaemonState = "UNKNOWN" - DaemonStateInit DaemonState = "INIT" - DaemonStateReady DaemonState = "READY" - DaemonStateRunning DaemonState = "RUNNING" + DaemonStateUnknown DaemonState = "UNKNOWN" + DaemonStateInit DaemonState = "INIT" + DaemonStateReady DaemonState = "READY" + DaemonStateRunning DaemonState = "RUNNING" + DaemonStateDied DaemonState = "DIED" + DaemonStateDestroyed DaemonState = "DESTROYED" ) func (info *DaemonInfo) DaemonState() DaemonState { diff --git a/pkg/manager/manager.go b/pkg/manager/manager.go index 9c26d601c1..0aa853a8ca 100644 --- a/pkg/manager/manager.go +++ b/pkg/manager/manager.go @@ -23,6 +23,7 @@ import ( "github.com/containerd/nydus-snapshotter/pkg/daemon" "github.com/containerd/nydus-snapshotter/pkg/daemon/types" "github.com/containerd/nydus-snapshotter/pkg/errdefs" + "github.com/containerd/nydus-snapshotter/pkg/metrics/exporter" "github.com/containerd/nydus-snapshotter/pkg/store" "github.com/containerd/nydus-snapshotter/pkg/supervisor" ) @@ -506,6 +507,10 @@ func (m *Manager) DestroyDaemon(d *daemon.Daemon) error { log.L.Warnf("Failed to wait for daemon, %v", err) } + if err := exporter.RecordDaemonEvent(d.ID(), string(types.DaemonStateDestroyed)); err != nil { + log.L.Warnf("export nydusd event metric failed, daemon ID: %s, event: %s, error: %v", d.ID(), string(types.DaemonStateDestroyed), err) + } + return nil } diff --git a/pkg/manager/monitor.go b/pkg/manager/monitor.go index 39f9805a29..f7e913565a 100644 --- a/pkg/manager/monitor.go +++ b/pkg/manager/monitor.go @@ -16,7 +16,9 @@ import ( 
"golang.org/x/sys/unix" "github.com/containerd/containerd/log" + "github.com/containerd/nydus-snapshotter/pkg/daemon/types" "github.com/containerd/nydus-snapshotter/pkg/errdefs" + "github.com/containerd/nydus-snapshotter/pkg/metrics/exporter" "github.com/containerd/nydus-snapshotter/pkg/utils/retry" ) @@ -217,6 +219,9 @@ func (m *livenessMonitor) Run() { if ev.Events&(unix.EPOLLHUP|unix.EPOLLERR) != 0 { log.L.Warnf("Daemon %s died", target.id) + if err := exporter.RecordDaemonEvent(target.id, string(types.DaemonStateDied)); err != nil { + log.L.Warnf("export nydusd event metric failed, daemon ID: %s, event: %s, error: %v", target.id, string(types.DaemonStateDied), err) + } // Notify subscribers that death event happens target.notifier <- deathEvent{daemonID: target.id, path: target.path} } diff --git a/pkg/metrics/exporter/daemon/metrics.go b/pkg/metrics/exporter/daemon/metrics.go new file mode 100644 index 0000000000..001936acb5 --- /dev/null +++ b/pkg/metrics/exporter/daemon/metrics.go @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2022. Nydus Developers. All rights reserved. 
+ * + * SPDX-License-Identifier: Apache-2.0 + */ + +package daemon + +import ( + "github.com/prometheus/client_golang/prometheus" +) + +var ( + daemonIDLabel = "daemon_id" + timeLabel = "time" + eventLabel = "event" +) + +var ( + NydusdEvent = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "nydusd_lifetime_events", + Help: "The lifetime events of nydus daemon.", + }, + []string{daemonIDLabel, timeLabel, eventLabel}, + ) +) diff --git a/pkg/metrics/exporter/export.go b/pkg/metrics/exporter/export.go index 873103bc1d..34ee51618f 100644 --- a/pkg/metrics/exporter/export.go +++ b/pkg/metrics/exporter/export.go @@ -16,8 +16,6 @@ import ( "github.com/pkg/errors" dto "github.com/prometheus/client_model/go" "github.com/prometheus/common/expfmt" - - "github.com/containerd/nydus-snapshotter/pkg/daemon/types" ) type Opt func(*Exporter) error @@ -26,6 +24,8 @@ type Exporter struct { outputFile string } +var globalExporter *Exporter + func WithOutputFile(metricsFile string) Opt { return func(e *Exporter) error { if metricsFile == "" { @@ -40,32 +40,33 @@ func WithOutputFile(metricsFile string) Opt { } } -func NewExporter(opts ...Opt) (*Exporter, error) { +func NewExporter(opts ...Opt) error { var exp Exporter for _, o := range opts { if err := o(&exp); err != nil { - return nil, err + return err } } - return &exp, nil -} + globalExporter = &exp -func (e *Exporter) ExportFsMetrics(m *types.FsMetrics, imageRef string) error { - ReadCount.WithLabelValues(imageRef).Set(float64(m.DataRead)) - OpenFdCount.WithLabelValues(imageRef).Set(float64(m.NrOpens)) - OpenFdMaxCount.WithLabelValues(imageRef).Set(float64(m.NrMaxOpens)) - LastFopTimestamp.WithLabelValues(imageRef).Set(float64(m.LastFopTp)) + return nil +} - for _, h := range FsMetricHists { - o, err := h.ToConstHistogram(m, imageRef) - if err != nil { - return errors.Wrapf(err, "failed to new const histogram for %s", h.Desc.String()) - } - h.Save(o) +func getExporter() (*Exporter, error) { + if globalExporter == nil { 
+ return nil, errors.New("failed to get global metric exporter") } + return globalExporter, nil +} + +func ExportOutput() error { + e, err := getExporter() + if err != nil { + return err + } return e.output() } diff --git a/pkg/metrics/exporter/metrics.go b/pkg/metrics/exporter/fs/metrics.go similarity index 93% rename from pkg/metrics/exporter/metrics.go rename to pkg/metrics/exporter/fs/metrics.go index d0eb954a6e..47a30cd6bb 100644 --- a/pkg/metrics/exporter/metrics.go +++ b/pkg/metrics/exporter/fs/metrics.go @@ -4,11 +4,9 @@ * SPDX-License-Identifier: Apache-2.0 */ -package exporter +package fs import ( - "time" - "github.com/prometheus/client_golang/prometheus" "github.com/containerd/nydus-snapshotter/pkg/daemon/types" @@ -17,7 +15,6 @@ import ( var ( imageRefLabel = "image_ref" - defaultTTL = 3 * time.Minute ) var ( @@ -28,7 +25,7 @@ var ( Help: "Total number read of a nydus fs, in Byte.", }, []string{imageRefLabel}, - defaultTTL, + ttl.DefaultTTL, ) OpenFdCount = ttl.NewGaugeVecWithTTL( @@ -37,7 +34,7 @@ var ( Help: "Number of current open files.", }, []string{imageRefLabel}, - defaultTTL, + ttl.DefaultTTL, ) OpenFdMaxCount = ttl.NewGaugeVecWithTTL( @@ -46,7 +43,7 @@ var ( Help: "Number of max open files.", }, []string{imageRefLabel}, - defaultTTL, + ttl.DefaultTTL, ) LastFopTimestamp = ttl.NewGaugeVecWithTTL( @@ -55,12 +52,12 @@ var ( Help: "Timestamp of last file operation.", }, []string{imageRefLabel}, - defaultTTL, + ttl.DefaultTTL, ) ) // Fs metric histograms -var FsMetricHists = []*FsMetricHistogram{ +var MetricHists = []*MetricHistogram{ { Desc: prometheus.NewDesc( "nydusd_block_count_read_hist", diff --git a/pkg/metrics/exporter/types.go b/pkg/metrics/exporter/fs/types.go similarity index 80% rename from pkg/metrics/exporter/types.go rename to pkg/metrics/exporter/fs/types.go index 28509402ac..ec5e8e931c 100644 --- a/pkg/metrics/exporter/types.go +++ b/pkg/metrics/exporter/fs/types.go @@ -4,7 +4,7 @@ * SPDX-License-Identifier: Apache-2.0 */ -package 
exporter +package fs import ( "fmt" @@ -50,7 +50,7 @@ func MakeFopBuckets() []uint64 { type GetCountersFn func(*types.FsMetrics) []uint64 -type FsMetricHistogram struct { +type MetricHistogram struct { Desc *prometheus.Desc Buckets []uint64 GetCounters GetCountersFn @@ -59,7 +59,7 @@ type FsMetricHistogram struct { constHist prometheus.Metric } -func (h *FsMetricHistogram) ToConstHistogram(m *types.FsMetrics, imageRef string) (prometheus.Metric, error) { +func (h *MetricHistogram) ToConstHistogram(m *types.FsMetrics, imageRef string) (prometheus.Metric, error) { var count, sum uint64 counters := h.GetCounters(m) hmap := make(map[float64]uint64) @@ -82,18 +82,18 @@ func (h *FsMetricHistogram) ToConstHistogram(m *types.FsMetrics, imageRef string ), nil } -func (h *FsMetricHistogram) Save(m prometheus.Metric) { +func (h *MetricHistogram) Save(m prometheus.Metric) { h.constHist = m } // Implement prometheus.Collector interface -func (h *FsMetricHistogram) Describe(ch chan<- *prometheus.Desc) { +func (h *MetricHistogram) Describe(ch chan<- *prometheus.Desc) { if h.Desc != nil { ch <- h.Desc } } -func (h *FsMetricHistogram) Collect(ch chan<- prometheus.Metric) { +func (h *MetricHistogram) Collect(ch chan<- prometheus.Metric) { if h.constHist != nil { ch <- h.constHist } diff --git a/pkg/metrics/exporter/record.go b/pkg/metrics/exporter/record.go new file mode 100644 index 0000000000..06a5294488 --- /dev/null +++ b/pkg/metrics/exporter/record.go @@ -0,0 +1,41 @@ +package exporter + +import ( + "time" + + "github.com/containerd/nydus-snapshotter/pkg/daemon/types" + "github.com/containerd/nydus-snapshotter/pkg/metrics/exporter/daemon" + "github.com/containerd/nydus-snapshotter/pkg/metrics/exporter/fs" + "github.com/pkg/errors" +) + +func RecordFsMetrics(m *types.FsMetrics, imageRef string) error { + if _, err := getExporter(); err != nil { + return err + } + + fs.ReadCount.WithLabelValues(imageRef).Set(float64(m.DataRead)) + 
fs.OpenFdCount.WithLabelValues(imageRef).Set(float64(m.NrOpens)) + fs.OpenFdMaxCount.WithLabelValues(imageRef).Set(float64(m.NrMaxOpens)) + fs.LastFopTimestamp.WithLabelValues(imageRef).Set(float64(m.LastFopTp)) + + for _, h := range fs.MetricHists { + o, err := h.ToConstHistogram(m, imageRef) + if err != nil { + return errors.Wrapf(err, "failed to new const histogram for %s", h.Desc.String()) + } + h.Save(o) + } + + return nil +} + +func RecordDaemonEvent(daemonID string, event string) error { + if _, err := getExporter(); err != nil { + return err + } + + daemon.NydusdEvent.WithLabelValues(daemonID, time.Now().Format("2006-01-02 15:04:05.000"), event).Inc() + + return nil +} diff --git a/pkg/metrics/exporter/registry.go b/pkg/metrics/exporter/registry.go index af68e120da..31a14d7906 100644 --- a/pkg/metrics/exporter/registry.go +++ b/pkg/metrics/exporter/registry.go @@ -7,6 +7,8 @@ package exporter import ( + "github.com/containerd/nydus-snapshotter/pkg/metrics/exporter/daemon" + "github.com/containerd/nydus-snapshotter/pkg/metrics/exporter/fs" "github.com/prometheus/client_golang/prometheus" ) @@ -16,13 +18,14 @@ var ( func init() { Registry.MustRegister( - ReadCount, - OpenFdCount, - OpenFdMaxCount, - LastFopTimestamp, + fs.ReadCount, + fs.OpenFdCount, + fs.OpenFdMaxCount, + fs.LastFopTimestamp, + daemon.NydusdEvent, ) - for _, m := range FsMetricHists { + for _, m := range fs.MetricHists { Registry.MustRegister(m) } } diff --git a/pkg/metrics/serve.go b/pkg/metrics/serve.go index 64115c73e5..7b4e76fd5c 100644 --- a/pkg/metrics/serve.go +++ b/pkg/metrics/serve.go @@ -9,8 +9,6 @@ package metrics import ( "context" - "net" - "os" "path/filepath" "time" @@ -23,14 +21,10 @@ import ( type ServerOpt func(*Server) error -const sockFileName = "metrics.sock" - type Server struct { - listener net.Listener rootDir string metricsFile string pm *manager.Manager - exp *exporter.Exporter } func WithRootDir(rootDir string) ServerOpt { @@ -70,34 +64,17 @@ func NewServer(ctx 
context.Context, opts ...ServerOpt) (*Server, error) { } } - exp, err := exporter.NewExporter( + err := exporter.NewExporter( exporter.WithOutputFile(s.metricsFile), ) if err != nil { return nil, errors.Wrap(err, "failed to new metric exporter") } - s.exp = exp - - sockPath := filepath.Join(s.rootDir, sockFileName) - - if _, err := os.Stat(sockPath); err == nil { - err = os.Remove(sockPath) - if err != nil { - return nil, err - } - } - ln, err := NewListener(sockPath) - if err != nil { - return nil, err - } - s.listener = ln - - log.G(ctx).Infof("Starting metrics server on %s", sockPath) return &s, nil } -func (s *Server) collectDaemonMetric(ctx context.Context) { +func (s *Server) CollectDaemonMetrics(ctx context.Context) error { // TODO(renzhen): make collect interval time configurable timer := time.NewTicker(time.Duration(1) * time.Minute) @@ -105,9 +82,9 @@ outer: for { select { case <-timer.C: + // Collect metrics from daemons. daemons := s.pm.ListDaemons() for _, d := range daemons { - for _, i := range d.Instances.List() { var sid string @@ -123,7 +100,7 @@ outer: continue } - if err := s.exp.ExportFsMetrics(fsMetrics, i.ImageID); err != nil { + if err := exporter.RecordFsMetrics(fsMetrics, i.ImageID); err != nil { log.G(ctx).Errorf("failed to export fs metrics for %s: %v", i.ImageID, err) continue } @@ -135,13 +112,6 @@ outer: break outer } } -} - -func (s *Server) Serve(ctx context.Context) error { - // Start to collect metrics from daemons periodically. 
- go func() { - s.collectDaemonMetric(ctx) - }() return nil } diff --git a/pkg/metrics/ttl/gauge.go b/pkg/metrics/ttl/gauge.go index 74c1b1c785..fdb4ff80c1 100644 --- a/pkg/metrics/ttl/gauge.go +++ b/pkg/metrics/ttl/gauge.go @@ -16,6 +16,7 @@ import ( var ( defaultCleanUpPeriod = 10 * time.Minute + DefaultTTL = 3 * time.Minute ) type LabelWithValue struct { diff --git a/pkg/system/system.go b/pkg/system/system.go index c49241fc6d..328c88362c 100644 --- a/pkg/system/system.go +++ b/pkg/system/system.go @@ -148,7 +148,11 @@ func (sc *Controller) registerRouter() { handler := promhttp.HandlerFor(exporter.Registry, promhttp.HandlerOpts{ ErrorHandling: promhttp.HTTPErrorOnError, }) - sc.router.Handle(endpointPromMetrics, handler) + + sc.router.Handle(endpointPromMetrics, http.HandlerFunc(func(rsp http.ResponseWriter, req *http.Request) { + handler.ServeHTTP(rsp, req) + exporter.ExportOutput() + })) } func (sc *Controller) describeDaemons() func(w http.ResponseWriter, r *http.Request) { diff --git a/snapshot/snapshot.go b/snapshot/snapshot.go index bcf5fed4e2..7e820492cb 100644 --- a/snapshot/snapshot.go +++ b/snapshot/snapshot.go @@ -98,6 +98,25 @@ func NewSnapshotter(ctx context.Context, cfg *config.Config) (snapshots.Snapshot return nil, errors.Wrap(err, "create daemons manager") } + metricServer, err := metrics.NewServer( + ctx, + metrics.WithRootDir(cfg.RootDir), + metrics.WithMetricsFile(cfg.MetricsFile), + metrics.WithProcessManager(manager), + ) + if err != nil { + return nil, errors.Wrap(err, "create metrics server") + } + + if cfg.EnableMetrics { + // Start to collect daemon metrics. 
+ go func() { + if err := metricServer.CollectDaemonMetrics(ctx); err != nil { + log.L.Errorf("Failed to start export metrics, %s", err) + } + }() + } + if cfg.APISocket != "" { systemController, err := system.NewSystemController(manager, cfg.APISocket) if err != nil { @@ -163,24 +182,6 @@ func NewSnapshotter(ctx context.Context, cfg *config.Config) (snapshots.Snapshot } } - if cfg.EnableMetrics { - metricServer, err := metrics.NewServer( - ctx, - metrics.WithRootDir(cfg.RootDir), - metrics.WithMetricsFile(cfg.MetricsFile), - metrics.WithProcessManager(manager), - ) - if err != nil { - return nil, errors.Wrap(err, "create metrics server") - } - // Start metrics http server. - go func() { - if err := metricServer.Serve(ctx); err != nil { - log.L.Errorf("Failed to start metrics server, %s", err) - } - }() - } - if err := os.MkdirAll(cfg.RootDir, 0700); err != nil { return nil, err }