From cba92a398eeeff922cee03459ae22115edfbad71 Mon Sep 17 00:00:00 2001 From: Mantas Sidlauskas Date: Wed, 4 May 2022 15:45:18 +0300 Subject: [PATCH] Improve failover coordinator error logging --- common/cache/domainCache.go | 21 +++++++++++---------- common/domain/failover_watcher.go | 8 ++++---- service/history/failover/coordinator.go | 9 +++++++-- 3 files changed, 22 insertions(+), 16 deletions(-) diff --git a/common/cache/domainCache.go b/common/cache/domainCache.go index c9205e4f0d6..81055f0869a 100644 --- a/common/cache/domainCache.go +++ b/common/cache/domainCache.go @@ -252,7 +252,7 @@ func (c *domainCache) GetCacheSize() (sizeOfCacheByName int64, sizeOfCacheByID i return int64(c.cacheByID.Load().(Cache).Size()), int64(c.cacheNameToID.Load().(Cache).Size()) } -// Start start the background refresh of domain +// Start starts the background refresh of domain func (c *domainCache) Start() { if !atomic.CompareAndSwapInt32(&c.status, domainCacheInitialized, domainCacheStarted) { return @@ -266,7 +266,7 @@ func (c *domainCache) Start() { go c.refreshLoop() } -// Start start the background refresh of domain +// Stop stops background refresh of domain func (c *domainCache) Stop() { if !atomic.CompareAndSwapInt32(&c.status, domainCacheStarted, domainCacheStopped) { return @@ -348,8 +348,9 @@ func (c *domainCache) GetDomain( ) (*DomainCacheEntry, error) { if name == "" { - return nil, &types.BadRequestError{Message: "Domain is empty."} + return nil, &types.BadRequestError{Message: "Domain name is empty"} } + return c.getDomain(name) } @@ -452,7 +453,7 @@ func (c *domainCache) refreshDomainsLocked() error { if err != nil { return err } - domainNotificationVersion := metadata.NotificationVersion + var token []byte request := &persistence.ListDomainsRequest{PageSize: domainCacheRefreshPageSize} var domains DomainCacheEntries @@ -487,15 +488,17 @@ func (c *domainCache) refreshDomainsLocked() error { newCacheByID.Put(domain.info.ID, domain) } + scopedMetrics := c.metricsClient.Scope(metrics.DomainCacheScope) + UpdateLoop: for _, domain := range domains { - if domain.notificationVersion >= domainNotificationVersion { + if domain.notificationVersion >= metadata.NotificationVersion { // this guarantee that domain change events before the // domainNotificationVersion is loaded into the cache. // the domain change events after the domainNotificationVersion // will be loaded into cache in the next refresh - c.logger.Info("Received larger domain notification version", tag.WorkflowDomainName(domain.GetInfo().Name)) + c.logger.Info("Domain notification is not less than than metadata notification version", tag.WorkflowDomainName(domain.GetInfo().Name)) break UpdateLoop } triggerCallback, nextEntry, err := c.updateIDToDomainCache(newCacheByID, domain.info.ID, domain) @@ -503,7 +506,7 @@ UpdateLoop: return err } - c.metricsClient.Scope(metrics.DomainCacheScope).Tagged( + scopedMetrics.Tagged( metrics.DomainTag(nextEntry.info.Name), metrics.ActiveClusterTag(nextEntry.replicationConfig.ActiveClusterName), ).UpdateGauge(metrics.ActiveClusterGauge, 1) @@ -528,9 +531,7 @@ UpdateLoop: c.lastRefreshTime = now if now.Sub(c.lastCallbackEmitTime) > 30*time.Minute { c.lastCallbackEmitTime = now - for range c.callbacks { - c.metricsClient.IncCounter(metrics.DomainCacheScope, metrics.DomainCacheCallbacksCount) - } + scopedMetrics.AddCounter(metrics.DomainCacheCallbacksCount, int64(len(c.callbacks))) } return nil diff --git a/common/domain/failover_watcher.go b/common/domain/failover_watcher.go index 839e4d29342..25a9965e989 100644 --- a/common/domain/failover_watcher.go +++ b/common/domain/failover_watcher.go @@ -24,6 +24,7 @@ package domain import ( "context" + "fmt" "sync/atomic" "time" @@ -183,13 +184,12 @@ func CleanPendingActiveState( // this call has to be made metadata, err := domainManager.GetMetadata(context.Background()) if err != nil { - return err + return fmt.Errorf("getting metadata: %w", err) } - notificationVersion := metadata.NotificationVersion getResponse, err := domainManager.GetDomain(context.Background(), &persistence.GetDomainRequest{ID: domainID}) if err != nil { - return err + return fmt.Errorf("getting domain: %w", err) } localFailoverVersion := getResponse.FailoverVersion isGlobalDomain := getResponse.IsGlobalDomain @@ -205,7 +205,7 @@ func CleanPendingActiveState( FailoverVersion: localFailoverVersion, FailoverNotificationVersion: getResponse.FailoverNotificationVersion, FailoverEndTime: nil, - NotificationVersion: notificationVersion, + NotificationVersion: metadata.NotificationVersion, } op := func() error { return domainManager.UpdateDomain(context.Background(), updateReq) diff --git a/service/history/failover/coordinator.go b/service/history/failover/coordinator.go index 8925fe07823..2a15015927b 100644 --- a/service/history/failover/coordinator.go +++ b/service/history/failover/coordinator.go @@ -297,7 +297,10 @@ func (c *coordinatorImpl) handleFailoverMarkers( domainName, err := c.domainCache.GetDomainName(domainID) if err != nil { c.logger.Error("Coordinator failed to get domain after receiving all failover markers", - tag.WorkflowDomainID(domainID)) + tag.WorkflowDomainID(domainID), + tag.Error(err), + ) + c.scope.Tagged(metrics.DomainTag(domainName)).IncCounter(metrics.CadenceFailures) return } @@ -310,7 +313,9 @@ func (c *coordinatorImpl) handleFailoverMarkers( c.retryPolicy, ); err != nil { c.logger.Error("Coordinator failed to update domain after receiving all failover markers", - tag.WorkflowDomainID(domainID)) + tag.WorkflowDomainID(domainID), + tag.Error(err), + ) c.scope.IncCounter(metrics.CadenceFailures) return }