fix(controller): Enable dummy metrics server on non-leader workflow controller #11295

Merged · 6 commits · merged Jul 7, 2023
Changes from 1 commit
9 changes: 9 additions & 0 deletions cmd/workflow-controller/main.go
@@ -117,6 +117,7 @@ func NewRootCommand() *cobra.Command {
 		log.Info("Leader election is turned off. Running in single-instance mode")
 		log.WithField("id", "single-instance").Info("starting leading")
 		go wfController.Run(ctx, workflowWorkers, workflowTTLWorkers, podCleanupWorkers)
+		go wfController.RunMetricsServer(ctx, false)
 	} else {
 		nodeID, ok := os.LookupEnv("LEADER_ELECTION_IDENTITY")
 		if !ok {
@@ -128,6 +129,11 @@ func NewRootCommand() *cobra.Command {
 			leaderName = fmt.Sprintf("%s-%s", leaderName, wfController.Config.InstanceID)
 		}

+		// for controlling the dummy metrics server
+		dummyCtx, dummyCancel := context.WithCancel(context.Background())
+		defer dummyCancel()
+		go wfController.RunMetricsServer(dummyCtx, true)
+
 		go leaderelection.RunOrDie(ctx, leaderelection.LeaderElectionConfig{
 			Lock: &resourcelock.LeaseLock{
 				LeaseMeta: metav1.ObjectMeta{Name: leaderName, Namespace: namespace}, Client: kubeclientset.CoordinationV1(),
@@ -139,11 +145,14 @@ func NewRootCommand() *cobra.Command {
 			RetryPeriod: env.LookupEnvDurationOr("LEADER_ELECTION_RETRY_PERIOD", 5*time.Second),
 			Callbacks: leaderelection.LeaderCallbacks{
 				OnStartedLeading: func(ctx context.Context) {
+					dummyCancel()
 					go wfController.Run(ctx, workflowWorkers, workflowTTLWorkers, podCleanupWorkers)
+					go wfController.RunMetricsServer(ctx, false)
 				},
 				OnStoppedLeading: func() {
 					log.WithField("id", nodeID).Info("stopped leading")
 					cancel()
+					go wfController.RunMetricsServer(dummyCtx, true)
 				},
 				OnNewLeader: func(identity string) {
 					log.WithField("leader", identity).Info("new leader")
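In plain terms, the main.go change keeps a dummy metrics server running while the replica is waiting to become leader, and tears it down (via its own cancellable context) the moment the replica starts leading, at which point the real controller and metrics server take over. Below is a rough standalone sketch of that lifecycle, with the Kubernetes leader-election machinery replaced by a plain timer and all names invented for illustration; it is not Argo code.

package main

import (
	"context"
	"fmt"
	"log"
	"net/http"
	"time"
)

// runMetricsServer serves /metrics until ctx is cancelled. When dummy is true
// it answers every request with an empty 200; otherwise it serves a stand-in
// payload in place of the real Prometheus registry handler.
func runMetricsServer(ctx context.Context, addr string, dummy bool) {
	mux := http.NewServeMux()
	if dummy {
		mux.HandleFunc("/metrics", func(w http.ResponseWriter, r *http.Request) {
			w.WriteHeader(http.StatusOK) // 200 with no body: scrapers and probes keep working
		})
	} else {
		mux.HandleFunc("/metrics", func(w http.ResponseWriter, r *http.Request) {
			fmt.Fprintln(w, "example_metric 1") // placeholder for the real metrics handler
		})
	}
	srv := &http.Server{Addr: addr, Handler: mux}
	go func() {
		if err := srv.ListenAndServe(); err != http.ErrServerClosed {
			log.Fatal(err)
		}
	}()
	<-ctx.Done()
	shutdownCtx, cancel := context.WithTimeout(context.Background(), time.Second)
	defer cancel()
	_ = srv.Shutdown(shutdownCtx)
}

func main() {
	// Standby phase: the dummy server runs under its own cancellable context.
	dummyCtx, dummyCancel := context.WithCancel(context.Background())
	go runMetricsServer(dummyCtx, ":9090", true)

	// Stand-in for OnStartedLeading firing after the replica wins the election.
	time.Sleep(2 * time.Second)
	dummyCancel()               // stop the dummy server so the port is free
	time.Sleep(2 * time.Second) // crude wait for the shutdown; the real code relies on it being fast
	go runMetricsServer(context.Background(), ":9090", false)

	select {} // keep serving; Ctrl+C to exit
}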
7 changes: 4 additions & 3 deletions workflow/controller/controller.go
@@ -298,9 +298,6 @@ func (wfc *WorkflowController) Run(ctx context.Context, wfWorkers, workflowTTLWorkers, podCleanupWorkers int) {
 		log.Fatal("Timed out waiting for caches to sync")
 	}

-	// Start the metrics server
-	go wfc.metrics.RunServer(ctx)
-
 	for i := 0; i < podCleanupWorkers; i++ {
 		go wait.UntilWithContext(ctx, wfc.runPodCleanup, time.Second)
 	}
@@ -323,6 +320,10 @@ func (wfc *WorkflowController) Run(ctx context.Context, wfWorkers, workflowTTLWorkers, podCleanupWorkers int) {
 	<-ctx.Done()
 }

+func (wfc *WorkflowController) RunMetricsServer(ctx context.Context, isDummy bool) {
+	go wfc.metrics.RunServer(ctx, isDummy)
+}
+
 // Create and the Synchronization Manager
 func (wfc *WorkflowController) createSynchronizationManager(ctx context.Context) {
 	getSyncLimit := func(lockKey string) (int, error) {
33 changes: 23 additions & 10 deletions workflow/metrics/server.go
@@ -18,7 +18,8 @@ import (
 )

 // RunServer starts a metrics server
-func (m *Metrics) RunServer(ctx context.Context) {
+// If 'isDummy' is set to true, the dummy metrics server will be started. If it's false, the prometheus metrics server will be started
+func (m *Metrics) RunServer(ctx context.Context, isDummy bool) {
 	defer runtimeutil.HandleCrash(runtimeutil.PanicHandlers...)

 	if !m.metricsConfig.Enabled {
@@ -36,23 +37,33 @@ func (m *Metrics) RunServer(ctx context.Context) {
 		// If the telemetry server is different -- and it's enabled -- run each on its own instance
 		telemetryRegistry := prometheus.NewRegistry()
 		telemetryRegistry.MustRegister(collectors.NewGoCollector())
-		go runServer(m.telemetryConfig, telemetryRegistry, ctx)
+		go runServer(m.telemetryConfig, telemetryRegistry, ctx, isDummy)
 	}

 	// Run the metrics server
-	go runServer(m.metricsConfig, metricsRegistry, ctx)
+	go runServer(m.metricsConfig, metricsRegistry, ctx, isDummy)

 	go m.garbageCollector(ctx)
 }

-func runServer(config ServerConfig, registry *prometheus.Registry, ctx context.Context) {
+func runServer(config ServerConfig, registry *prometheus.Registry, ctx context.Context, isDummy bool) {
 	var handlerOpts promhttp.HandlerOpts
 	if config.IgnoreErrors {
 		handlerOpts.ErrorHandling = promhttp.ContinueOnError
 	}

+	name := ""
 	mux := http.NewServeMux()
-	mux.Handle(config.Path, promhttp.HandlerFor(registry, handlerOpts))
+	if isDummy {
+		// dummy metrics server responds to all requests with a 200 status, but without providing any metrics data
+		name = "dummy metrics server"
+		mux.HandleFunc(config.Path, func(w http.ResponseWriter, r *http.Request) {
+			w.WriteHeader(http.StatusOK)
+		})
+	} else {
+		name = "prometheus metrics server"
+		mux.Handle(config.Path, promhttp.HandlerFor(registry, handlerOpts))
+	}
 	srv := &http.Server{Addr: fmt.Sprintf(":%v", config.Port), Handler: mux}

 	if config.Secure {
@@ -67,15 +78,15 @@ func runServer(config ServerConfig, registry *prometheus.Registry, ctx context.Context) {
 		}
 		srv.TLSConfig = tlsConfig
 		go func() {
-			log.Infof("Starting prometheus metrics server at localhost:%v%s", config.Port, config.Path)
-			if err := srv.ListenAndServeTLS("", ""); err != nil {
+			log.Infof("Starting %s at localhost:%v%s", name, config.Port, config.Path)
+			if err := srv.ListenAndServeTLS("", ""); err != http.ErrServerClosed {
 				panic(err)
 			}
 		}()
 	} else {
 		go func() {
-			log.Infof("Starting prometheus metrics server at localhost:%v%s", config.Port, config.Path)
-			if err := srv.ListenAndServe(); err != nil {
+			log.Infof("Starting %s at localhost:%v%s", name, config.Port, config.Path)
+			if err := srv.ListenAndServe(); err != http.ErrServerClosed {
 				panic(err)
 			}
 		}()
@@ -88,7 +99,9 @@ func runServer(config ServerConfig, registry *prometheus.Registry, ctx context.Context) {
 	ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second)
 	defer cancel()
 	if err := srv.Shutdown(ctx); err != nil {
-		log.Infof("Unable to shutdown metrics server at localhost:%v%s", config.Port, config.Path)
+		log.Infof("Unable to shutdown %s at localhost:%v%s", name, config.Port, config.Path)
+	} else {
+		log.Infof("Successfully shutdown %s at localhost:%v%s", name, config.Port, config.Path)
 	}
 }

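A side note on the error handling in server.go: in the standard library, ListenAndServe and ListenAndServeTLS always return a non-nil error, and after a graceful Shutdown that error is http.ErrServerClosed, so the old check against err != nil would panic even on a clean stop. The snippet below is a minimal standard-library-only illustration of that contract (not Argo code).

package main

import (
	"context"
	"errors"
	"fmt"
	"net/http"
	"time"
)

func main() {
	srv := &http.Server{Addr: ":8080", Handler: http.NewServeMux()}

	done := make(chan struct{})
	go func() {
		defer close(done)
		// ListenAndServe always returns a non-nil error. After srv.Shutdown it is
		// http.ErrServerClosed, i.e. a clean stop rather than a failure, which is
		// why the diff compares against that sentinel instead of plain err != nil.
		if err := srv.ListenAndServe(); !errors.Is(err, http.ErrServerClosed) {
			fmt.Println("server failed:", err)
			return
		}
		fmt.Println("server stopped cleanly")
	}()

	time.Sleep(100 * time.Millisecond) // give the listener a moment to come up

	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
	defer cancel()
	if err := srv.Shutdown(ctx); err != nil {
		fmt.Println("shutdown error:", err)
	}
	<-done // prints "server stopped cleanly"
}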
51 changes: 51 additions & 0 deletions workflow/metrics/server_test.go
@@ -0,0 +1,51 @@
package metrics

import (
	"context"
	"fmt"
	"io"
	"net/http"
	"testing"

	"github.com/stretchr/testify/assert"
)

func TestRunServer(t *testing.T) {
	config := ServerConfig{
		Enabled: true,
		Path:    DefaultMetricsServerPath,
		Port:    DefaultMetricsServerPort,
	}
	m := New(config, config)

	server := func(isDummy bool, shouldBodyBeEmpty bool) {
		ctx, cancel := context.WithCancel(context.Background())
		defer cancel()

		go m.RunServer(ctx, isDummy)

		resp, err := http.Get(fmt.Sprintf("http://localhost:%d%s", DefaultMetricsServerPort, DefaultMetricsServerPath))
		assert.NoError(t, err)
		defer resp.Body.Close()

		bodyBytes, err := io.ReadAll(resp.Body)
		assert.NoError(t, err)

		bodyString := string(bodyBytes)

		assert.Equal(t, http.StatusOK, resp.StatusCode)
		if shouldBodyBeEmpty {
			assert.Empty(t, bodyString)
		} else {
			assert.NotEmpty(t, bodyString)
		}
	}

	t.Run("dummy metrics server", func(t *testing.T) {
		server(true, true) // dummy metrics server does not provide any metrics responses
	})

	t.Run("prometheus metrics server", func(t *testing.T) {
		server(false, false) // prometheus metrics server provides responses for any metrics
	})
}
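For a quick manual check against a running controller, a probe like the one below (an illustrative snippet, assuming the default :9090 metrics port and /metrics path are reachable from wherever it runs) distinguishes the two modes: an empty 200 indicates the dummy server on a standby replica, while a Prometheus text payload indicates the leader.

package main

import (
	"fmt"
	"io"
	"net/http"
	"os"
)

func main() {
	// Assumed defaults: adjust the address if the controller is configured differently.
	resp, err := http.Get("http://localhost:9090/metrics")
	if err != nil {
		fmt.Fprintln(os.Stderr, "request failed:", err)
		os.Exit(1)
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		fmt.Fprintln(os.Stderr, "read failed:", err)
		os.Exit(1)
	}

	switch {
	case resp.StatusCode != http.StatusOK:
		fmt.Println("unexpected status:", resp.Status)
	case len(body) == 0:
		fmt.Println("empty 200 response: this replica is serving the dummy metrics server (standby)")
	default:
		fmt.Println("metrics payload received: this replica is serving the prometheus metrics server (leader)")
	}
}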