From f4d7c3f2c55105afbf4d86e636e816f1a520ac5e Mon Sep 17 00:00:00 2001 From: Kakuya Ando Date: Wed, 23 Aug 2023 20:59:36 +0900 Subject: [PATCH] Support fetching multiple modules in one scrape (#945) * Implemented a feature to fetch from multiple modules * Changed snmp_collection_duration_seconds from Summary to Histogram --------- Signed-off-by: Kakuya Ando --- README.md | 19 ++++++++ collector/collector.go | 103 +++++++++++++++++++++++++++++++++-------- main.go | 73 ++++++++++++++--------------- 3 files changed, 138 insertions(+), 57 deletions(-) diff --git a/README.md b/README.md index e15d0f2d..d3f0a58a 100644 --- a/README.md +++ b/README.md @@ -89,6 +89,25 @@ Note that [URL encoding](https://en.wikipedia.org/wiki/URL_encoding) should be u to the `:` and `/` characters. Prometheus encodes query parameters automatically and manual encoding is not necessary within the Prometheus configuration file. +## Multi-Module Handling +The multi-module functionality allows you to specify multiple modules, enabling the retrieval of information from several modules in a single scrape. +The concurrency can be specified using the snmp-exporter option `--snmp.module-concurrency` (the default is 1). + +Note: This implementation does not perform any de-duplication of walks between different modules. + +There are two ways to specify multiple modules. You can either separate them with a comma or define multiple params_module. +The URLs would look like this: + +For comma separation: +``` +http://localhost:9116/snmp?module=if_mib,arista_sw&target=192.0.0.8 +``` + +For multiple params_module: +``` +http://localhost:9116/snmp?module=if_mib&module=arista_sw&target=192.0.0.8 +``` + ## Configuration The default configuration file name is `snmp.yml` and should not be edited diff --git a/collector/collector.go b/collector/collector.go index 1b62288b..00616b25 100644 --- a/collector/collector.go +++ b/collector/collector.go @@ -21,6 +21,7 @@ import ( "regexp" "strconv" "strings" + "sync" "time" "github.com/alecthomas/kingpin/v2" @@ -42,6 +43,14 @@ var ( float64Mantissa uint64 = 9007199254740992 wrapCounters = kingpin.Flag("snmp.wrap-large-counters", "Wrap 64-bit counters to avoid floating point rounding.").Default("true").Bool() srcAddress = kingpin.Flag("snmp.source-address", "Source address to send snmp from in the format 'address:port' to use when connecting targets. If the port parameter is empty or '0', as in '127.0.0.1:' or '[::1]:0', a source port number is automatically (random) chosen.").Default("").String() + + snmpDuration = promauto.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "snmp_collection_duration_seconds", + Help: "Duration of collections by the SNMP exporter", + }, + []string{"auth", "module"}, + ) ) // Types preceded by an enum with their actual type. @@ -80,6 +89,14 @@ func listToOid(l []int) string { return strings.Join(result, ".") } +func InitModuleMetrics(auths map[string]*config.Auth, modules map[string]*config.Module) { + for auth := range auths { + for module := range modules { + snmpDuration.WithLabelValues(auth, module) + } + } +} + type ScrapeResults struct { pdus []gosnmp.SnmpPDU packets uint64 @@ -355,13 +372,27 @@ type internalMetrics struct { snmpRetries prometheus.Counter } +type NamedModule struct { + *config.Module + name string +} + +func NewNamedModule(name string, module *config.Module) *NamedModule { + return &NamedModule{ + Module: module, + name: name, + } +} + type Collector struct { - ctx context.Context - target string - auth *config.Auth - module *config.Module - logger log.Logger - metrics internalMetrics + ctx context.Context + target string + auth *config.Auth + authName string + modules []*NamedModule + logger log.Logger + metrics internalMetrics + concurrency int } func newInternalMetrics(reg prometheus.Registerer) internalMetrics { @@ -403,9 +434,9 @@ func newInternalMetrics(reg prometheus.Registerer) internalMetrics { } } -func New(ctx context.Context, target string, auth *config.Auth, module *config.Module, logger log.Logger, reg prometheus.Registerer) *Collector { +func New(ctx context.Context, target, authName string, auth *config.Auth, modules []*NamedModule, logger log.Logger, reg prometheus.Registerer, conc int) *Collector { internalMetrics := newInternalMetrics(reg) - return &Collector{ctx: ctx, target: target, auth: auth, module: module, logger: logger, metrics: internalMetrics} + return &Collector{ctx: ctx, target: target, authName: authName, auth: auth, modules: modules, logger: logger, metrics: internalMetrics, concurrency: conc} } // Describe implements Prometheus.Collector. @@ -413,29 +444,30 @@ func (c Collector) Describe(ch chan<- *prometheus.Desc) { ch <- prometheus.NewDesc("dummy", "dummy", nil, nil) } -// Collect implements Prometheus.Collector. -func (c Collector) Collect(ch chan<- prometheus.Metric) { +func (c Collector) collect(ch chan<- prometheus.Metric, module *NamedModule) { + logger := log.With(c.logger, "module", module.name) start := time.Now() - results, err := ScrapeTarget(c.ctx, c.target, c.auth, c.module, c.logger, c.metrics) + results, err := ScrapeTarget(c.ctx, c.target, c.auth, module.Module, logger, c.metrics) + moduleLabel := prometheus.Labels{"module": module.name} if err != nil { - level.Info(c.logger).Log("msg", "Error scraping target", "err", err) - ch <- prometheus.NewInvalidMetric(prometheus.NewDesc("snmp_error", "Error scraping target", nil, nil), err) + level.Info(logger).Log("msg", "Error scraping target", "err", err) + ch <- prometheus.NewInvalidMetric(prometheus.NewDesc("snmp_error", "Error scraping target", nil, moduleLabel), err) return } ch <- prometheus.MustNewConstMetric( - prometheus.NewDesc("snmp_scrape_walk_duration_seconds", "Time SNMP walk/bulkwalk took.", nil, nil), + prometheus.NewDesc("snmp_scrape_walk_duration_seconds", "Time SNMP walk/bulkwalk took.", nil, moduleLabel), prometheus.GaugeValue, time.Since(start).Seconds()) ch <- prometheus.MustNewConstMetric( - prometheus.NewDesc("snmp_scrape_packets_sent", "Packets sent for get, bulkget, and walk; including retries.", nil, nil), + prometheus.NewDesc("snmp_scrape_packets_sent", "Packets sent for get, bulkget, and walk; including retries.", nil, moduleLabel), prometheus.GaugeValue, float64(results.packets)) ch <- prometheus.MustNewConstMetric( - prometheus.NewDesc("snmp_scrape_packets_retried", "Packets retried for get, bulkget, and walk.", nil, nil), + prometheus.NewDesc("snmp_scrape_packets_retried", "Packets retried for get, bulkget, and walk.", nil, moduleLabel), prometheus.GaugeValue, float64(results.retries)) ch <- prometheus.MustNewConstMetric( - prometheus.NewDesc("snmp_scrape_pdus_returned", "PDUs returned from get, bulkget, and walk.", nil, nil), + prometheus.NewDesc("snmp_scrape_pdus_returned", "PDUs returned from get, bulkget, and walk.", nil, moduleLabel), prometheus.GaugeValue, float64(len(results.pdus))) oidToPdu := make(map[string]gosnmp.SnmpPDU, len(results.pdus)) @@ -443,7 +475,7 @@ func (c Collector) Collect(ch chan<- prometheus.Metric) { oidToPdu[pdu.Name[1:]] = pdu } - metricTree := buildMetricTree(c.module.Metrics) + metricTree := buildMetricTree(module.Metrics) // Look for metrics that match each pdu. PduLoop: for oid, pdu := range oidToPdu { @@ -457,7 +489,7 @@ PduLoop: } if head.metric != nil { // Found a match. - samples := pduToSamples(oidList[i+1:], &pdu, head.metric, oidToPdu, c.logger, c.metrics) + samples := pduToSamples(oidList[i+1:], &pdu, head.metric, oidToPdu, logger, c.metrics) for _, sample := range samples { ch <- sample } @@ -466,11 +498,42 @@ PduLoop: } } ch <- prometheus.MustNewConstMetric( - prometheus.NewDesc("snmp_scrape_duration_seconds", "Total SNMP time scrape took (walk and processing).", nil, nil), + prometheus.NewDesc("snmp_scrape_duration_seconds", "Total SNMP time scrape took (walk and processing).", nil, moduleLabel), prometheus.GaugeValue, time.Since(start).Seconds()) } +// Collect implements Prometheus.Collector. +func (c Collector) Collect(ch chan<- prometheus.Metric) { + wg := sync.WaitGroup{} + workerCount := c.concurrency + if workerCount < 1 { + workerCount = 1 + } + workerChan := make(chan *NamedModule) + for i := 0; i < workerCount; i++ { + wg.Add(1) + go func() { + defer wg.Done() + for m := range workerChan { + logger := log.With(c.logger, "module", m.name) + level.Debug(logger).Log("msg", "Starting scrape") + start := time.Now() + c.collect(ch, m) + duration := time.Since(start).Seconds() + level.Debug(logger).Log("msg", "Finished scrape", "duration_seconds", duration) + snmpDuration.WithLabelValues(c.authName, m.name).Observe(duration) + } + }() + } + + for _, module := range c.modules { + workerChan <- module + } + close(workerChan) + wg.Wait() +} + func getPduValue(pdu *gosnmp.SnmpPDU) float64 { switch pdu.Type { case gosnmp.Counter64: diff --git a/main.go b/main.go index c28316f4..f89a402d 100644 --- a/main.go +++ b/main.go @@ -19,9 +19,9 @@ import ( _ "net/http/pprof" "os" "os/signal" + "strings" "sync" "syscall" - "time" "github.com/alecthomas/kingpin/v2" "github.com/go-kit/log" @@ -43,6 +43,7 @@ import ( var ( configFile = kingpin.Flag("config.file", "Path to configuration file.").Default("snmp.yml").String() dryRun = kingpin.Flag("dry-run", "Only verify configuration is valid and exit.").Default("false").Bool() + concurrency = kingpin.Flag("snmp.module-concurrency", "The number of modules to fetch concurrently per scrape").Default("1").Int() metricsPath = kingpin.Flag( "web.telemetry-path", "Path under which to expose metrics.", @@ -50,13 +51,6 @@ var ( toolkitFlags = webflag.AddFlags(kingpin.CommandLine, ":9116") // Metrics about the SNMP exporter itself. - snmpDuration = promauto.NewSummaryVec( - prometheus.SummaryOpts{ - Name: "snmp_collection_duration_seconds", - Help: "Duration of collections by the SNMP exporter", - }, - []string{"auth", "module"}, - ) snmpRequestErrors = promauto.NewCounter( prometheus.CounterOpts{ Name: "snmp_request_errors_total", @@ -94,44 +88,50 @@ func handler(w http.ResponseWriter, r *http.Request, logger log.Logger) { authName = "public_v2" } - moduleName := query.Get("module") - if len(query["module"]) > 1 { - http.Error(w, "'module' parameter must only be specified once", http.StatusBadRequest) - snmpRequestErrors.Inc() - return + queryModule := query["module"] + if len(queryModule) == 0 { + queryModule = append(queryModule, "if_mib") } - if moduleName == "" { - moduleName = "if_mib" + uniqueM := make(map[string]bool) + var modules []string + for _, qm := range queryModule { + for _, m := range strings.Split(qm, ",") { + if m == "" { + continue + } + if _, ok := uniqueM[m]; !ok { + uniqueM[m] = true + modules = append(modules, m) + } + } } - sc.RLock() auth, authOk := sc.C.Auths[authName] - module, moduleOk := sc.C.Modules[moduleName] - sc.RUnlock() if !authOk { + sc.RUnlock() http.Error(w, fmt.Sprintf("Unknown auth '%s'", authName), http.StatusBadRequest) snmpRequestErrors.Inc() return } - if !moduleOk { - http.Error(w, fmt.Sprintf("Unknown module '%s'", moduleName), http.StatusBadRequest) - snmpRequestErrors.Inc() - return + var nmodules []*collector.NamedModule + for _, m := range modules { + module, moduleOk := sc.C.Modules[m] + if !moduleOk { + sc.RUnlock() + http.Error(w, fmt.Sprintf("Unknown module '%s'", m), http.StatusBadRequest) + snmpRequestErrors.Inc() + return + } + nmodules = append(nmodules, collector.NewNamedModule(m, module)) } - - logger = log.With(logger, "auth", authName, "module", moduleName, "target", target) - level.Debug(logger).Log("msg", "Starting scrape") - - start := time.Now() + sc.RUnlock() + logger = log.With(logger, "auth", authName, "target", target) registry := prometheus.NewRegistry() - c := collector.New(r.Context(), target, auth, module, logger, registry) + c := collector.New(r.Context(), target, authName, auth, nmodules, logger, registry, *concurrency) registry.MustRegister(c) // Delegate http serving to Prometheus client library, which will call collector.Collect. h := promhttp.HandlerFor(registry, promhttp.HandlerOpts{}) h.ServeHTTP(w, r) - duration := time.Since(start).Seconds() - snmpDuration.WithLabelValues(authName, moduleName).Observe(duration) - level.Debug(logger).Log("msg", "Finished scrape", "duration_seconds", duration) } func updateConfiguration(w http.ResponseWriter, r *http.Request) { @@ -160,11 +160,7 @@ func (sc *SafeConfig) ReloadConfig(configFile string) (err error) { sc.Lock() sc.C = conf // Initialize metrics. - for auth := range sc.C.Auths { - for module := range sc.C.Modules { - snmpDuration.WithLabelValues(auth, module) - } - } + collector.InitModuleMetrics(sc.C.Auths, sc.C.Modules) sc.Unlock() return nil } @@ -176,8 +172,11 @@ func main() { kingpin.HelpFlag.Short('h') kingpin.Parse() logger := promlog.New(promlogConfig) + if *concurrency < 1 { + *concurrency = 1 + } - level.Info(logger).Log("msg", "Starting snmp_exporter", "version", version.Info()) + level.Info(logger).Log("msg", "Starting snmp_exporter", "version", version.Info(), "concurrency", concurrency) level.Info(logger).Log("build_context", version.BuildContext()) prometheus.MustRegister(version.NewCollector("snmp_exporter"))