Skip to content

Commit

Permalink
[Cosmos] Implements Client Retry policy (Azure#22394)
Browse files Browse the repository at this point in the history
* implementation of client retry policy

* ignore N-2 on ci

* Update ci.yml

* changes to pass ci

* Update go.mod

* Update go.sum

* make method private, add test

* enableEndpointDiscovery->enableCrossRegionRetries, remove public area change, remove duplicates

* saved constants, moved logic around in policy for non-duplicity

* added partial tests, missing 503s/ connectivity issues handling

* finalizing behavior and tests

* revert pipeline useragent, return non-retryable errors to skip Core retries

* mark create/delete management plane operations as writes

* force refresh ability added, delete/replace operations marked as write

* remove print statements

* refactor

* missing comma

* detecting dns failures

* missing update

* deal with errors fetching initial account information

* linter

* more linter

* Update cosmos_client_retry_policy_test.go

* add DNS test

* fix error handling logic for dns

* small fix to ensure no wrong index is called

* fix new locking logic

* override header for response on write metadata operations

---------

Co-authored-by: Matias Quaranta <ealsur@hotmail.com>
  • Loading branch information
simorenoh and ealsur authored Mar 15, 2024
1 parent f4a9a18 commit 745ca8f
Show file tree
Hide file tree
Showing 16 changed files with 857 additions and 55 deletions.
3 changes: 2 additions & 1 deletion sdk/data/azcosmos/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ stages:
parameters:
ServiceDirectory: 'data/azcosmos'
UsePipelineProxy: false
ExcludeGoNMinus2: true
- stage: Emulator
displayName: 'Cosmos Emulator'
variables:
Expand All @@ -38,7 +39,7 @@ stages:
Windows_Go120:
pool.name: azsdk-pool-mms-win-2022-general
image.name: MMS2022
go.version: '1.21.1'
go.version: '1.22.0'
pool:
name: $(pool.name)
vmImage: $(image.name)
Expand Down
23 changes: 16 additions & 7 deletions sdk/data/azcosmos/cosmos_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ import (

"github.com/Azure/azure-sdk-for-go/sdk/azcore"
"github.com/Azure/azure-sdk-for-go/sdk/azcore/policy"
"github.com/Azure/azure-sdk-for-go/sdk/azcore/runtime"
azruntime "github.com/Azure/azure-sdk-for-go/sdk/azcore/runtime"
"github.com/Azure/azure-sdk-for-go/sdk/azcore/streaming"
)
Expand All @@ -42,10 +41,11 @@ func (c *Client) Endpoint() string {
// options - Optional Cosmos client options. Pass nil to accept default values.
func NewClientWithKey(endpoint string, cred KeyCredential, o *ClientOptions) (*Client, error) {
preferredRegions := []string{}
enableCrossRegionRetries := true
if o != nil {
preferredRegions = o.PreferredRegions
}
gem, err := newGlobalEndpointManager(endpoint, newInternalPipeline(newSharedKeyCredPolicy(cred), o), preferredRegions, 0)
gem, err := newGlobalEndpointManager(endpoint, newInternalPipeline(newSharedKeyCredPolicy(cred), o), preferredRegions, 0, enableCrossRegionRetries)
if err != nil {
return nil, err
}
Expand All @@ -62,10 +62,11 @@ func NewClient(endpoint string, cred azcore.TokenCredential, o *ClientOptions) (
return nil, err
}
preferredRegions := []string{}
enableCrossRegionRetries := true
if o != nil {
preferredRegions = o.PreferredRegions
}
gem, err := newGlobalEndpointManager(endpoint, newInternalPipeline(newCosmosBearerTokenPolicy(cred, scope, nil), o), preferredRegions, 0)
gem, err := newGlobalEndpointManager(endpoint, newInternalPipeline(newCosmosBearerTokenPolicy(cred, scope, nil), o), preferredRegions, 0, enableCrossRegionRetries)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -124,6 +125,7 @@ func newPipeline(authPolicy policy.Policy, gem *globalEndpointManager, options *
},
PerRetry: []policy.Policy{
authPolicy,
&clientRetryPolicy{gem: gem},
},
},
&options.ClientOptions)
Expand Down Expand Up @@ -193,10 +195,17 @@ func (c *Client) CreateDatabase(
if o == nil {
o = &CreateDatabaseOptions{}
}
returnResponse := true
h := &headerOptionsOverride{
enableContentResponseOnWrite: &returnResponse,
}

operationContext := pipelineRequestOptions{
resourceType: resourceTypeDatabase,
resourceAddress: ""}
resourceType: resourceTypeDatabase,
resourceAddress: "",
isWriteOperation: true,
headerOptionsOverride: h,
}

path, err := generatePathForNameBased(resourceTypeDatabase, "", true)
if err != nil {
Expand All @@ -220,7 +229,7 @@ func (c *Client) CreateDatabase(
// NewQueryDatabasesPager executes query for databases.
// query - The SQL query to execute.
// o - Options for the operation.
func (c *Client) NewQueryDatabasesPager(query string, o *QueryDatabasesOptions) *runtime.Pager[QueryDatabasesResponse] {
func (c *Client) NewQueryDatabasesPager(query string, o *QueryDatabasesOptions) *azruntime.Pager[QueryDatabasesResponse] {
queryOptions := &QueryDatabasesOptions{}
if o != nil {
originalOptions := *o
Expand All @@ -234,7 +243,7 @@ func (c *Client) NewQueryDatabasesPager(query string, o *QueryDatabasesOptions)

path, _ := generatePathForNameBased(resourceTypeDatabase, operationContext.resourceAddress, true)

return runtime.NewPager(runtime.PagingHandler[QueryDatabasesResponse]{
return azruntime.NewPager(azruntime.PagingHandler[QueryDatabasesResponse]{
More: func(page QueryDatabasesResponse) bool {
return page.ContinuationToken != ""
},
Expand Down
187 changes: 187 additions & 0 deletions sdk/data/azcosmos/cosmos_client_retry_policy.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

package azcosmos

import (
"errors"
"fmt"
"net"
"net/http"
"time"

"github.com/Azure/azure-sdk-for-go/sdk/azcore/policy"
azruntime "github.com/Azure/azure-sdk-for-go/sdk/azcore/runtime"
"github.com/Azure/azure-sdk-for-go/sdk/internal/errorinfo"
)

type clientRetryPolicy struct {
gem *globalEndpointManager
useWriteEndpoint bool
retryCount int
sessionRetryCount int
preferredLocationIndex int
}

const maxRetryCount = 120
const defaultBackoff = 1

func (p *clientRetryPolicy) Do(req *policy.Request) (*http.Response, error) {
p.resetPolicyCounters()
o := pipelineRequestOptions{}
if !req.OperationValue(&o) {
return nil, fmt.Errorf("failed to obtain request options, please check request being sent: %s", req.Body())
}
for {
resolvedEndpoint := p.gem.ResolveServiceEndpoint(p.retryCount, o.isWriteOperation, p.useWriteEndpoint)
req.Raw().Host = resolvedEndpoint.Host
req.Raw().URL.Host = resolvedEndpoint.Host
response, err := req.Next() // err can happen in weird scenarios (connectivity, etc)
if err != nil {
if p.isNetworkConnectionError(err) {
shouldRetry, errRetry := p.attemptRetryOnNetworkError(req)
if errRetry != nil {
return nil, errRetry
}
if !shouldRetry {
return nil, err
}
err = req.RewindBody()
if err != nil {
return nil, err
}
p.retryCount += 1
continue
}
return nil, err
}
subStatus := response.Header.Get(cosmosHeaderSubstatus)
if p.shouldRetryStatus(response.StatusCode, subStatus) {
p.useWriteEndpoint = false
if response.StatusCode == http.StatusForbidden {
shouldRetry, err := p.attemptRetryOnEndpointFailure(req, o.isWriteOperation)
if err != nil {
return nil, err
}
if !shouldRetry {
return nil, errorinfo.NonRetriableError(azruntime.NewResponseErrorWithErrorCode(response, response.Status))
}
} else if response.StatusCode == http.StatusNotFound {
if !p.attemptRetryOnSessionUnavailable(req, o.isWriteOperation) {
return nil, errorinfo.NonRetriableError(azruntime.NewResponseErrorWithErrorCode(response, response.Status))
}
} else if response.StatusCode == http.StatusServiceUnavailable {
if !p.attemptRetryOnServiceUnavailable(req, o.isWriteOperation) {
return nil, errorinfo.NonRetriableError(azruntime.NewResponseErrorWithErrorCode(response, response.Status))
}
}
err = req.RewindBody()
if err != nil {
return response, err
}
p.retryCount += 1
continue
}

return response, err
}

}

func (p *clientRetryPolicy) shouldRetryStatus(status int, subStatus string) (shouldRetry bool) {
if (status == http.StatusForbidden && (subStatus == subStatusWriteForbidden || subStatus == subStatusDatabaseAccountNotFound)) ||
(status == http.StatusNotFound && subStatus == subStatusReadSessionNotAvailable) ||
(status == http.StatusServiceUnavailable) {
return true
}
return false
}

func (p *clientRetryPolicy) attemptRetryOnNetworkError(req *policy.Request) (bool, error) {
if (p.retryCount > maxRetryCount) || !p.gem.locationCache.enableCrossRegionRetries {
return false, nil
}

err := p.gem.MarkEndpointUnavailableForWrite(*req.Raw().URL)
if err != nil {
return false, err
}
err = p.gem.MarkEndpointUnavailableForRead(*req.Raw().URL)
if err != nil {
return false, err
}
err = p.gem.Update(req.Raw().Context(), false)
if err != nil {
return false, err
}

time.Sleep(defaultBackoff * time.Second)
return true, nil
}

func (p *clientRetryPolicy) attemptRetryOnEndpointFailure(req *policy.Request, isWriteOperation bool) (bool, error) {
if (p.retryCount > maxRetryCount) || !p.gem.locationCache.enableCrossRegionRetries {
return false, nil
}
if isWriteOperation {
err := p.gem.MarkEndpointUnavailableForWrite(*req.Raw().URL)
if err != nil {
return false, err
}
} else {
err := p.gem.MarkEndpointUnavailableForRead(*req.Raw().URL)
if err != nil {
return false, err
}
}

err := p.gem.Update(req.Raw().Context(), isWriteOperation)
if err != nil {
return false, err
}

time.Sleep(defaultBackoff * time.Second)
return true, nil
}

func (p *clientRetryPolicy) attemptRetryOnSessionUnavailable(req *policy.Request, isWriteOperation bool) bool {
if p.gem.CanUseMultipleWriteLocations() {
endpoints := p.gem.locationCache.locationInfo.availReadLocations
if isWriteOperation {
endpoints = p.gem.locationCache.locationInfo.availWriteLocations
}
if p.sessionRetryCount >= len(endpoints) {
return false
}
} else {
if p.sessionRetryCount > 0 {
return false
}
p.useWriteEndpoint = true
}
p.sessionRetryCount += 1
return true
}

func (p *clientRetryPolicy) attemptRetryOnServiceUnavailable(req *policy.Request, isWriteOperation bool) bool {
if !p.gem.locationCache.enableCrossRegionRetries || p.preferredLocationIndex >= len(p.gem.preferredLocations) {
return false
}
if isWriteOperation && !p.gem.CanUseMultipleWriteLocations() {
return false
}
p.preferredLocationIndex += 1
return true
}

func (p *clientRetryPolicy) resetPolicyCounters() {
p.retryCount = 0
p.sessionRetryCount = 0
p.preferredLocationIndex = 0
}

// isNetworkConnectionError checks if the error is related to failure to connect / resolve DNS
func (p *clientRetryPolicy) isNetworkConnectionError(err error) bool {
var dnserror *net.DNSError
return errors.As(err, &dnserror)
}
Loading

0 comments on commit 745ca8f

Please sign in to comment.