Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Standardize metrics for task isolation leaking and include cause #6562

Merged
merged 1 commit into from
Dec 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 7 additions & 43 deletions common/isolationgroup/defaultisolationgroupstate/state.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,14 +74,12 @@ func NewDefaultIsolationGroupStateWatcherWithConfigStoreClient(
}, nil
}

func (z *defaultIsolationGroupStateHandler) AvailableIsolationGroupsByDomainID(ctx context.Context, domainID string, tasklistName string, availablePollerIsolationGroups []string) (types.IsolationGroupConfiguration, error) {
func (z *defaultIsolationGroupStateHandler) IsolationGroupsByDomainID(ctx context.Context, domainID string) (types.IsolationGroupConfiguration, error) {
state, err := z.getByDomainID(ctx, domainID)
if err != nil {
return nil, fmt.Errorf("unable to get isolation group state: %w", err)
}
availableIsolationGroupsCfg := isolationGroupHealthyListToConfig(availablePollerIsolationGroups)
scope := z.createAvailableisolationGroupMetricsScope(domainID, tasklistName)
return availableIG(z.config.AllIsolationGroups(), availableIsolationGroupsCfg, state.Global, state.Domain, scope), nil
return toIsolationGroupConfiguration(z.config.AllIsolationGroups(), state.Global, state.Domain), nil
}

func (z *defaultIsolationGroupStateHandler) IsDrained(ctx context.Context, domain string, isolationGroup string) (bool, error) {
Expand Down Expand Up @@ -162,44 +160,21 @@ func (z *defaultIsolationGroupStateHandler) get(ctx context.Context, domain stri
return ig, nil
}

func (z *defaultIsolationGroupStateHandler) createAvailableisolationGroupMetricsScope(domainID string, tasklistName string) metrics.Scope {
domainName, _ := z.domainCache.GetDomainName(domainID)
return z.metricsClient.Scope(metrics.GetAvailableIsolationGroupsScope).
Tagged(metrics.DomainTag(domainName)).
Tagged(metrics.TaskListTag(tasklistName))
}

// A simple explicit deny-based isolation group implementation
func availableIG(
func toIsolationGroupConfiguration(
allIsolationGroups []string,
availablePollers types.IsolationGroupConfiguration,
global types.IsolationGroupConfiguration,
domain types.IsolationGroupConfiguration,
scope metrics.Scope,
) types.IsolationGroupConfiguration {
out := types.IsolationGroupConfiguration{}
for _, isolationGroup := range allIsolationGroups {
_, hasAvailablePollers := availablePollers[isolationGroup]
globalCfg, hasGlobalConfig := global[isolationGroup]
domainCfg, hasDomainConfig := domain[isolationGroup]
if hasGlobalConfig {
if globalCfg.State == types.IsolationGroupStateDrained {
scope.Tagged(metrics.PollerIsolationGroupTag(isolationGroup)).IncCounter(metrics.IsolationGroupStateDrained)
continue
}
}
if hasDomainConfig {
if domainCfg.State == types.IsolationGroupStateDrained {
scope.Tagged(metrics.PollerIsolationGroupTag(isolationGroup)).IncCounter(metrics.IsolationGroupStateDrained)
continue
if isDrained(isolationGroup, global, domain) {
out[isolationGroup] = types.IsolationGroupPartition{
Name: isolationGroup,
State: types.IsolationGroupStateDrained,
}
}
if !hasAvailablePollers {
// we don't attempt to dispatch tasks to isolation groups where there are no pollers
scope.Tagged(metrics.PollerIsolationGroupTag(isolationGroup)).IncCounter(metrics.IsolationGroupStatePollerUnavailable)
continue
}
scope.Tagged(metrics.PollerIsolationGroupTag(isolationGroup)).IncCounter(metrics.IsolationGroupStateHealthy)
out[isolationGroup] = types.IsolationGroupPartition{
Name: isolationGroup,
State: types.IsolationGroupStateHealthy,
Expand All @@ -223,14 +198,3 @@ func isDrained(isolationGroup string, global types.IsolationGroupConfiguration,
}
return false
}

func isolationGroupHealthyListToConfig(igs []string) types.IsolationGroupConfiguration {
out := make(types.IsolationGroupConfiguration, len(igs))
for _, ig := range igs {
out[ig] = types.IsolationGroupPartition{
Name: ig,
State: types.IsolationGroupStateHealthy,
}
}
return out
}
112 changes: 33 additions & 79 deletions common/isolationgroup/defaultisolationgroupstate/state_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,24 +74,21 @@ func TestAvailableIsolationGroupsHandler(t *testing.T) {
json.Unmarshal(validCfgDataDrained, &dynamicConfigResponseDrained)

tests := map[string]struct {
availablePollerIsolationGroups []string
dcAffordance func(client *dynamicconfig.MockClient)
domainAffordance func(mock *cache.MockDomainCache)
cfg defaultConfig
expected types.IsolationGroupConfiguration
expectedErr error
dcAffordance func(client *dynamicconfig.MockClient)
domainAffordance func(mock *cache.MockDomainCache)
cfg defaultConfig
expected types.IsolationGroupConfiguration
expectedErr error
}{
"normal case - feature is disabled": {
availablePollerIsolationGroups: []string{"zone-1", "zone-2"},
cfg: defaultConfig{
IsolationGroupEnabled: func(string) bool { return false },
AllIsolationGroups: func() []string { return []string{"zone-1", "zone-2", "zone-3"} },
AllIsolationGroups: func() []string { return []string{"zone-1", "zone-2"} },
},
dcAffordance: func(client *dynamicconfig.MockClient) {},
domainAffordance: func(mock *cache.MockDomainCache) {
domainResponse := cache.NewDomainCacheEntryForTest(&persistence.DomainInfo{ID: "domain-id", Name: "domain"}, &persistence.DomainConfig{}, true, nil, 0, nil, 0, 0, 0)
mock.EXPECT().GetDomainByID("domain-id").Return(domainResponse, nil)
mock.EXPECT().GetDomainName("domain-id").Return("domain", nil)
},
expected: types.IsolationGroupConfiguration{
"zone-1": {
Expand All @@ -106,10 +103,9 @@ func TestAvailableIsolationGroupsHandler(t *testing.T) {
},

"normal case - no drains present - no configuration specifying a drain - feature is enabled": {
availablePollerIsolationGroups: []string{"zone-1", "zone-2"},
cfg: defaultConfig{
IsolationGroupEnabled: func(string) bool { return true },
AllIsolationGroups: func() []string { return []string{"zone-1", "zone-2", "zone-3"} },
AllIsolationGroups: func() []string { return []string{"zone-1", "zone-2"} },
},
dcAffordance: func(client *dynamicconfig.MockClient) {
client.EXPECT().GetListValue(
Expand All @@ -120,7 +116,6 @@ func TestAvailableIsolationGroupsHandler(t *testing.T) {
domainAffordance: func(mock *cache.MockDomainCache) {
domainResponse := cache.NewDomainCacheEntryForTest(&persistence.DomainInfo{ID: "domain-id", Name: "domain"}, &persistence.DomainConfig{}, true, nil, 0, nil, 0, 0, 0)
mock.EXPECT().GetDomainByID("domain-id").Return(domainResponse, nil)
mock.EXPECT().GetDomainName("domain-id").Return("domain", nil)
mock.EXPECT().GetDomain("domain").Return(domainResponse, nil).AnyTimes()
},
expected: types.IsolationGroupConfiguration{
Expand All @@ -136,10 +131,9 @@ func TestAvailableIsolationGroupsHandler(t *testing.T) {
},

"normal case - one drain present - no configuration specifying a drain - feature is enabled": {
availablePollerIsolationGroups: []string{"zone-1", "zone-2"},
cfg: defaultConfig{
IsolationGroupEnabled: func(string) bool { return true },
AllIsolationGroups: func() []string { return []string{"zone-1", "zone-2", "zone-3"} },
AllIsolationGroups: func() []string { return []string{"zone-1", "zone-2"} },
},
dcAffordance: func(client *dynamicconfig.MockClient) {
client.EXPECT().GetListValue(
Expand All @@ -150,21 +144,23 @@ func TestAvailableIsolationGroupsHandler(t *testing.T) {
domainAffordance: func(mock *cache.MockDomainCache) {
domainResponse := cache.NewDomainCacheEntryForTest(&persistence.DomainInfo{ID: "domain-id", Name: "domain"}, &persistence.DomainConfig{}, true, nil, 0, nil, 0, 0, 0)
mock.EXPECT().GetDomainByID("domain-id").Return(domainResponse, nil)
mock.EXPECT().GetDomainName("domain-id").Return("domain", nil)
mock.EXPECT().GetDomain("domain").Return(domainResponse, nil).AnyTimes()
},
expected: types.IsolationGroupConfiguration{
"zone-1": {
Name: "zone-1",
State: types.IsolationGroupStateDrained,
},
"zone-2": {
Name: "zone-2",
State: types.IsolationGroupStateHealthy,
},
},
},
"expected case - no global drain data configured": {
availablePollerIsolationGroups: []string{"zone-1", "zone-2"},
cfg: defaultConfig{
IsolationGroupEnabled: func(string) bool { return true },
AllIsolationGroups: func() []string { return []string{"zone-1", "zone-2", "zone-3"} },
AllIsolationGroups: func() []string { return []string{"zone-1", "zone-2"} },
},
dcAffordance: func(client *dynamicconfig.MockClient) {
client.EXPECT().GetListValue(
Expand All @@ -175,7 +171,6 @@ func TestAvailableIsolationGroupsHandler(t *testing.T) {
domainAffordance: func(mock *cache.MockDomainCache) {
domainResponse := cache.NewDomainCacheEntryForTest(&persistence.DomainInfo{ID: "domain-id", Name: "domain"}, &persistence.DomainConfig{}, true, nil, 0, nil, 0, 0, 0)
mock.EXPECT().GetDomainByID("domain-id").Return(domainResponse, nil)
mock.EXPECT().GetDomainName("domain-id").Return("domain", nil)
mock.EXPECT().GetDomain("domain").Return(domainResponse, nil).AnyTimes()
},
expected: types.IsolationGroupConfiguration{
Expand All @@ -190,7 +185,6 @@ func TestAvailableIsolationGroupsHandler(t *testing.T) {
},
},
"pathological case - problems with global drain data - 1": {
availablePollerIsolationGroups: []string{"zone-1", "zone-2"},
cfg: defaultConfig{
IsolationGroupEnabled: func(string) bool { return true },
AllIsolationGroups: func() []string { return []string{"zone-1", "zone-2", "zone-3"} },
Expand All @@ -210,7 +204,6 @@ func TestAvailableIsolationGroupsHandler(t *testing.T) {
expectedErr: errors.New("unable to get isolation group state: could not resolve global drains in an error"),
},
"pathological case - problems with domain drain data - cannot resolve domain": {
availablePollerIsolationGroups: []string{"zone-1", "zone-2"},
cfg: defaultConfig{
IsolationGroupEnabled: func(string) bool { return true },
AllIsolationGroups: func() []string { return []string{"zone-1", "zone-2", "zone-3"} },
Expand All @@ -224,7 +217,6 @@ func TestAvailableIsolationGroupsHandler(t *testing.T) {
},

"pathological case - problems with global drain data - malformed data returned 1": {
availablePollerIsolationGroups: []string{"zone-1", "zone-2"},
cfg: defaultConfig{
IsolationGroupEnabled: func(string) bool { return true },
AllIsolationGroups: func() []string { return []string{"zone-1", "zone-2", "zone-3"} },
Expand All @@ -244,7 +236,6 @@ func TestAvailableIsolationGroupsHandler(t *testing.T) {
expectedErr: errors.New("unable to get isolation group state: could not resolve global drains in an error"),
},
"pathological case - problems with domain drain data - malformed data returned 1": {
availablePollerIsolationGroups: []string{"zone-1", "zone-2"},
cfg: defaultConfig{
IsolationGroupEnabled: func(string) bool { return true },
AllIsolationGroups: func() []string { return []string{"zone-1", "zone-2", "zone-3"} },
Expand All @@ -260,7 +251,6 @@ func TestAvailableIsolationGroupsHandler(t *testing.T) {
expectedErr: errors.New("unable to get isolation group state: could not resolve domain in isolationGroup handler: a failure"),
},
"pathological case - problems with domain drain data - malformed data returned 2": {
availablePollerIsolationGroups: []string{"zone-1", "zone-2"},
cfg: defaultConfig{
IsolationGroupEnabled: func(string) bool { return true },
AllIsolationGroups: func() []string { return []string{"zone-1", "zone-2", "zone-3"} },
Expand All @@ -275,26 +265,6 @@ func TestAvailableIsolationGroupsHandler(t *testing.T) {
expected: nil,
expectedErr: errors.New("unable to get isolation group state: could not resolve domain in isolationGroup handler: %!w(<nil>)"),
},
"pathological case - no available pollers": {
availablePollerIsolationGroups: nil,
cfg: defaultConfig{
IsolationGroupEnabled: func(string) bool { return true },
AllIsolationGroups: func() []string { return []string{"zone-1", "zone-2", "zone-3"} },
},
dcAffordance: func(client *dynamicconfig.MockClient) {
client.EXPECT().GetListValue(
dynamicconfig.DefaultIsolationGroupConfigStoreManagerGlobalMapping,
gomock.Any(),
).Return(nil, nil)
},
domainAffordance: func(mock *cache.MockDomainCache) {
domainResponse := cache.NewDomainCacheEntryForTest(&persistence.DomainInfo{ID: "domain-id", Name: "domain"}, &persistence.DomainConfig{}, true, nil, 0, nil, 0, 0, 0)
mock.EXPECT().GetDomainByID("domain-id").Return(domainResponse, nil)
mock.EXPECT().GetDomainName("domain-id").Return("domain", nil)
mock.EXPECT().GetDomain("domain").Return(domainResponse, nil).AnyTimes()
},
expected: types.IsolationGroupConfiguration{},
},
}

for name, td := range tests {
Expand All @@ -311,7 +281,7 @@ func TestAvailableIsolationGroupsHandler(t *testing.T) {
config: td.cfg,
metricsClient: metrics.NewNoopMetricsClient(),
}
res, err := handler.AvailableIsolationGroupsByDomainID(context.TODO(), "domain-id", "a-tasklist", td.availablePollerIsolationGroups)
res, err := handler.IsolationGroupsByDomainID(context.TODO(), "domain-id")
assert.Equal(t, td.expected, res)
if td.expectedErr != nil {
assert.Equal(t, td.expectedErr.Error(), err.Error())
Expand Down Expand Up @@ -445,7 +415,7 @@ func TestAvailableIsolationGroups(t *testing.T) {
},
}

isolationGroupsSetB := types.IsolationGroupConfiguration{
isolationGroupsWithCDrained := types.IsolationGroupConfiguration{
igA: {
Name: igA,
State: types.IsolationGroupStateHealthy,
Expand All @@ -454,60 +424,44 @@ func TestAvailableIsolationGroups(t *testing.T) {
Name: igB,
State: types.IsolationGroupStateHealthy,
},
igC: {
Name: igC,
State: types.IsolationGroupStateDrained,
},
}

isolationGroupsSetC := types.IsolationGroupConfiguration{
drainedC := types.IsolationGroupConfiguration{
igC: {
Name: igC,
State: types.IsolationGroupStateDrained,
},
}

tests := map[string]struct {
globalIGCfg types.IsolationGroupConfiguration
domainIGCfg types.IsolationGroupConfiguration
availablePollers types.IsolationGroupConfiguration
expected types.IsolationGroupConfiguration
globalIGCfg types.IsolationGroupConfiguration
domainIGCfg types.IsolationGroupConfiguration
expected types.IsolationGroupConfiguration
}{
"default behaviour - no drains - everything should be healthy": {
globalIGCfg: types.IsolationGroupConfiguration{},
domainIGCfg: types.IsolationGroupConfiguration{},
availablePollers: isolationGroupsAllHealthy,
expected: isolationGroupsAllHealthy,
},
"default behaviour - no drains - only one zone is healthy in terms of pollers, should only return that": {
globalIGCfg: types.IsolationGroupConfiguration{},
domainIGCfg: types.IsolationGroupConfiguration{},
availablePollers: types.IsolationGroupConfiguration{
igC: types.IsolationGroupPartition{
Name: igC,
State: types.IsolationGroupStateHealthy,
},
},
expected: types.IsolationGroupConfiguration{
igC: types.IsolationGroupPartition{
Name: igC,
State: types.IsolationGroupStateHealthy,
},
},
expected: isolationGroupsAllHealthy,
},
"default behaviour - one is drained - should return remaining 1/2": {
globalIGCfg: types.IsolationGroupConfiguration{},
availablePollers: isolationGroupsAllHealthy,
domainIGCfg: isolationGroupsSetC, // C is drained
expected: isolationGroupsSetB, // A and B
"default behaviour - drained at domain level": {
globalIGCfg: types.IsolationGroupConfiguration{},
domainIGCfg: drainedC, // C is drained
expected: isolationGroupsWithCDrained, // A and B
},
"default behaviour - one is drained - should return remaining 2/2": {
globalIGCfg: isolationGroupsSetC, // C is drained
availablePollers: isolationGroupsAllHealthy,
domainIGCfg: types.IsolationGroupConfiguration{},
expected: isolationGroupsSetB, // A and B
"default behaviour - drained at global level": {
globalIGCfg: drainedC, // C is drained
domainIGCfg: types.IsolationGroupConfiguration{},
expected: isolationGroupsWithCDrained, // A and B
},
}

for name, td := range tests {
t.Run(name, func(t *testing.T) {
assert.Equal(t, td.expected, availableIG(all, td.availablePollers, td.globalIGCfg, td.domainIGCfg, metrics.NewNoopMetricsClient().Scope(0)))
assert.Equal(t, td.expected, toIsolationGroupConfiguration(all, td.globalIGCfg, td.domainIGCfg))
})
}
}
Expand Down
2 changes: 1 addition & 1 deletion common/isolationgroup/interface.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,5 +43,5 @@ type State interface {

// AvailableIsolationGroupsByDomainID returns the available isolation zones for a domain.
// Takes into account global and domain zones
AvailableIsolationGroupsByDomainID(ctx context.Context, domainID string, tasklistName string, availableIsolationGroups []string) (types.IsolationGroupConfiguration, error)
IsolationGroupsByDomainID(ctx context.Context, domainID string) (types.IsolationGroupConfiguration, error)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe mind adding an updated comment about what the purpose of the interface is? eg "it's responsibility as a component is to return the isolation group for a domain"?

}
30 changes: 15 additions & 15 deletions common/isolationgroup/isolation_group_mock.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
Loading