Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

pubsub: Make batch request results independent #3457

Merged
merged 5 commits into from
Aug 9, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 38 additions & 22 deletions pubsub/pubsub.go
Original file line number Diff line number Diff line change
Expand Up @@ -565,17 +565,28 @@ func (s *Subscription) Receive(ctx context.Context) (_ *Message, err error) {
if s.preReceiveBatchHook != nil {
s.preReceiveBatchHook(batchSize)
}
msgs, err := s.getNextBatch(batchSize)
s.mu.Lock()
defer s.mu.Unlock()
if err != nil {
// Non-retryable error from ReceiveBatch -> permanent error.
s.err = err
} else if len(msgs) > 0 {
s.q = append(s.q, msgs...)
resultChannel := s.getNextBatch(batchSize)
for msgsOrError := range resultChannel {
if msgsOrError.msgs != nil && len(msgsOrError.msgs) > 0 {
// messages received from channel
s.mu.Lock()
s.q = append(s.q, msgsOrError.msgs...)
s.mu.Unlock()
// notify that queue should now have messages
s.waitc <- struct{}{}
} else if msgsOrError.err != nil {
// err can receive message only after batch group completes
// Non-retryable error from ReceiveBatch -> permanent error
s.mu.Lock()
s.err = msgsOrError.err
s.mu.Unlock()
}
}
// batch reception finished
s.mu.Lock()
close(s.waitc)
s.waitc = nil
s.mu.Unlock()
}()
}
if len(s.q) > 0 {
Expand Down Expand Up @@ -625,28 +636,31 @@ func (s *Subscription) Receive(ctx context.Context) (_ *Message, err error) {
}
// A call to ReceiveBatch must be in flight. Wait for it.
waitc := s.waitc
s.mu.Unlock()
s.mu.Unlock() // unlock to allow message or error processing from background goroutine
select {
case <-waitc:
s.mu.Lock()
// Continue to top of loop.
s.mu.Lock()
case <-ctx.Done():
s.mu.Lock()
return nil, ctx.Err()
}
}
}

// getNextBatch gets the next batch of messages from the server and returns it.
func (s *Subscription) getNextBatch(nMessages int) ([]*driver.Message, error) {
var mu sync.Mutex
var q []*driver.Message
// msgsOrError is the element type of the channel returned by getNextBatch.
// Each value carries either the messages from one batch or an error.
type msgsOrError struct {
// msgs holds the messages received by a single batch request.
msgs []*driver.Message
// err holds the error reported once the group of batch requests has finished.
err error
}

// getNextBatch gets the next batch of messages from the server. It returns a channel that delivers the
// messages from each independent batch as they arrive, followed by an error if any batch failed; the channel is then closed.
func (s *Subscription) getNextBatch(nMessages int) chan msgsOrError {
// Split nMessages into batches based on recvBatchOpts; we'll make a
// separate ReceiveBatch call for each batch, and aggregate the results in
// msgs.
batches := batcher.Split(nMessages, s.recvBatchOpts)

result := make(chan msgsOrError, len(batches))
g, ctx := errgroup.WithContext(s.backgroundCtx)
for _, maxMessagesInBatch := range batches {
// Make a copy of the loop variable since it will be used by a goroutine.
Expand All @@ -663,16 +677,18 @@ func (s *Subscription) getNextBatch(nMessages int) ([]*driver.Message, error) {
if err != nil {
return wrapError(s.driver, err)
}
mu.Lock()
defer mu.Unlock()
q = append(q, msgs...)
result <- msgsOrError{msgs: msgs}
return nil
})
}
if err := g.Wait(); err != nil {
return nil, err
}
return q, nil
go func() {
// wait on group completion on the background and proper channel closing
if err := g.Wait(); err != nil {
result <- msgsOrError{err: err}
}
close(result)
}()
return result
}

var errSubscriptionShutdown = gcerr.Newf(gcerr.FailedPrecondition, nil, "pubsub: Subscription has been Shutdown")
Expand Down
46 changes: 46 additions & 0 deletions pubsub/pubsub_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"net/url"
"strings"
"sync"
"sync/atomic"
"testing"
"time"

Expand Down Expand Up @@ -281,6 +282,51 @@ func TestCancelTwoReceives(t *testing.T) {
}
}

// secondReceiveBlockedDriverSub is a test subscription driver whose ReceiveBatch
// delays its result by waitDuration on every call after the first.
type secondReceiveBlockedDriverSub struct {
// NOTE(review): presumably embedded so that only the methods the test needs
// have to be implemented; confirm against the driver.Subscription interface.
driver.Subscription
// waitDuration is how long ReceiveBatch waits before returning on calls after the first.
waitDuration time.Duration
// receiveCounter counts the calls made to ReceiveBatch.
receiveCounter atomic.Uint64
}

// ReceiveBatch returns a single message whose body records the sequence
// number of this call. The first call returns immediately; every later call
// first waits for s.waitDuration, simulating a slow batch request. It never
// returns an error.
func (s *secondReceiveBlockedDriverSub) ReceiveBatch(_ context.Context, _ int) ([]*driver.Message, error) {
	// Use the value returned by Add instead of a separate Load. ReceiveBatch is
	// called from concurrent goroutines, and with Add followed by Load both
	// callers could increment before either one reads, so neither would see 1
	// and both would block (and report the same message number).
	n := s.receiveCounter.Add(1)
	if n > 1 {
		// Wait after the 1st request for the specified duration before
		// returning the batch result.
		time.Sleep(s.waitDuration)
	}
	msg := &driver.Message{Body: []byte(fmt.Sprintf("message #%d", n))}
	return []*driver.Message{msg}, nil
}
// CanNack always reports false for this test subscription.
func (*secondReceiveBlockedDriverSub) CanNack() bool {
	return false
}
// IsRetryable always reports false: this test subscription treats no error as retryable.
func (*secondReceiveBlockedDriverSub) IsRetryable(error) bool {
	return false
}
// Close is a no-op for this test subscription and always returns nil.
func (*secondReceiveBlockedDriverSub) Close() error {
	return nil
}

func TestIndependentBatchReturn(t *testing.T) {
// We want to test the scenario when multiple batch requests are sent, as long as one of them succeeds, it should
// not block the Subscription.Receive result
receiveWaitDuration := 200 * time.Millisecond
s := NewSubscription(
&secondReceiveBlockedDriverSub{waitDuration: receiveWaitDuration},
&batcher.Options{MaxBatchSize: 1, MaxHandlers: 2}, // force 2 batches, by allowing 2 handlers and 1 msg per batch
nil,
)
// set the false calculated subscription batch size to force 2 batches to be called
s.runningBatchSize = 2
ctx, cancel := context.WithTimeout(context.Background(), 300*time.Millisecond)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The hardcoded timings might be racy.

Could secondReceiveBlockedDriverSub just wait forever (or maybe until the passed-in ctx is Done) on the second call, and s.Receive wait forever (no context.WithTimeout)? Without your change, the test would hang and eventually time out; with it, I think the receive should work ~right away (getting the first message) and then exit successfully.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I opted for a context timeout because, in the case of failure, I didn't want the test to hang for a long time; I wanted it to fail faster.

Anyway, I changed it to the wait-forever approach you proposed and made sure that the test hangs on master.

defer cancel()
defer s.Shutdown(ctx)
start := time.Now()
_, err := s.Receive(ctx)
if err != nil {
t.Fatal("Receive should not fail", err)
return
}
receiveDuration := time.Since(start)
if receiveDuration > receiveWaitDuration {
t.Error("Receive should not be blocked by hanging batch request")
}
}

func TestRetryTopic(t *testing.T) {
// Test that Send is retried if the driver returns a retryable error.
ctx := context.Background()
Expand Down
Loading