From a774f2017b2e0a396fdd1b791b1a7b554148732b Mon Sep 17 00:00:00 2001 From: Brian Kassouf Date: Sun, 17 Mar 2019 23:22:02 -0700 Subject: [PATCH 1/8] Fix a locking issue in the Rollback manager --- vault/rollback.go | 125 ++++++++++++++++++++++++++++------------------ 1 file changed, 77 insertions(+), 48 deletions(-) diff --git a/vault/rollback.go b/vault/rollback.go index 9148335289b1..36a2e8573b03 100644 --- a/vault/rollback.go +++ b/vault/rollback.go @@ -59,6 +59,18 @@ type RollbackManager struct { type rollbackState struct { lastError error sync.WaitGroup + once sync.Once + rollbackFunc func(context.Context) error +} + +// Run the rollback once, retrun true if we were the one that ran it. Caller +// should hold the statelock. +func (rs *rollbackState) run(ctx context.Context) (ran bool, err error) { + rs.once.Do(func() { + ran = true + err = rs.rollbackFunc(ctx) + }) + return } // NewRollbackManager is used to create a new rollback manager @@ -132,24 +144,62 @@ func (m *RollbackManager) triggerRollbacks() { } fullPath := e.namespace.Path + path - m.inflightLock.RLock() - _, ok := m.inflight[fullPath] - m.inflightLock.RUnlock() - if !ok { - m.startRollback(ctx, fullPath, true) - } + // Start a rollback if necessary + m.startOrLookupRollback(ctx, fullPath, true) } } // startRollback is used to start an async rollback attempt. // This must be called with the inflightLock held. -func (m *RollbackManager) startRollback(ctx context.Context, fullPath string, grabStatelock bool) *rollbackState { - rs := &rollbackState{} - rs.Add(1) - m.inflightAll.Add(1) +func (m *RollbackManager) startOrLookupRollback(ctx context.Context, fullPath string, grabStatelock bool) *rollbackState { + rs := &rollbackState{ + rollbackFunc: func(ctx context.Context) error { + ns, err := namespace.FromContext(ctx) + if err != nil { + return err + } + if ns == nil { + return namespace.ErrNoNamespace + } + + // Invoke a RollbackOperation + req := &logical.Request{ + Operation: logical.RollbackOperation, + Path: ns.TrimmedPath(fullPath), + } + + var cancelFunc context.CancelFunc + ctx, cancelFunc = context.WithTimeout(ctx, DefaultMaxRequestDuration) + _, err = m.router.Route(ctx, req) + cancelFunc() + + // If the error is an unsupported operation, then it doesn't + // matter, the backend doesn't support it. + if err == logical.ErrUnsupportedOperation { + err = nil + } + // If we failed due to read-only storage, we can't do anything; ignore + if err != nil && strings.Contains(err.Error(), logical.ErrReadOnly.Error()) { + err = nil + } + if err != nil { + m.logger.Error("error rolling back", "path", fullPath, "error", err) + } + return nil + }, + } + m.inflightLock.Lock() + defer m.inflightLock.Unlock() + rsInflight, ok := m.inflight[fullPath] + if ok { + return rsInflight + } + + // If no inflight rollback is already running, kick one off m.inflight[fullPath] = rs - m.inflightLock.Unlock() + rs.Add(1) + m.inflightAll.Add(1) go m.attemptRollback(ctx, fullPath, rs, grabStatelock) return rs } @@ -170,20 +220,6 @@ func (m *RollbackManager) attemptRollback(ctx context.Context, fullPath string, m.inflightLock.Unlock() }() - ns, err := namespace.FromContext(ctx) - if err != nil { - return err - } - if ns == nil { - return namespace.ErrNoNamespace - } - - // Invoke a RollbackOperation - req := &logical.Request{ - Operation: logical.RollbackOperation, - Path: ns.TrimmedPath(fullPath), - } - if grabStatelock { // Grab the statelock or stop if stopped := grabLockOrStop(m.core.stateLock.RLock, m.core.stateLock.RUnlock, m.shutdownCh); stopped { @@ -191,26 +227,13 @@ func (m *RollbackManager) attemptRollback(ctx context.Context, fullPath string, } } - var cancelFunc context.CancelFunc - ctx, cancelFunc = context.WithTimeout(ctx, DefaultMaxRequestDuration) - _, err = m.router.Route(ctx, req) + // Run the rollback + _, err = rs.run(ctx) + if grabStatelock { m.core.stateLock.RUnlock() } - cancelFunc() - // If the error is an unsupported operation, then it doesn't - // matter, the backend doesn't support it. - if err == logical.ErrUnsupportedOperation { - err = nil - } - // If we failed due to read-only storage, we can't do anything; ignore - if err != nil && strings.Contains(err.Error(), logical.ErrReadOnly.Error()) { - err = nil - } - if err != nil { - m.logger.Error("error rolling back", "path", fullPath, "error", err) - } return } @@ -224,15 +247,21 @@ func (m *RollbackManager) Rollback(ctx context.Context, path string) error { } fullPath := ns.Path + path - // Check for an existing attempt and start one if none - m.inflightLock.RLock() - rs, ok := m.inflight[fullPath] - m.inflightLock.RUnlock() - if !ok { - rs = m.startRollback(ctx, fullPath, false) + // Check for an existing attempt or start one if none + rs := m.startOrLookupRollback(ctx, fullPath, false) + + // Do a run here in the event an allready inflight rollback is blocked on + // grabbing the statelock. This prevents a deadlock in some cases where the + // caller of this function holds the write statelock. + ran, err := rs.run(ctx) + // If we were the runner, return the error + if ran { + return err } - // Wait for the attempt to finish + // If we weren't the runner, wait for the inflight attempt to finish. It's + // safe to do this, since if the other thread starts the run they are + // already in possession of the statelock and we are not deadlocked. rs.Wait() // Return the last error From 3a2d641c0f0cb49d2e592ed57a93b16b87463c99 Mon Sep 17 00:00:00 2001 From: Brian Kassouf Date: Sun, 17 Mar 2019 23:27:39 -0700 Subject: [PATCH 2/8] Update rollback.go --- vault/rollback.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vault/rollback.go b/vault/rollback.go index 36a2e8573b03..9c984c188626 100644 --- a/vault/rollback.go +++ b/vault/rollback.go @@ -63,7 +63,7 @@ type rollbackState struct { rollbackFunc func(context.Context) error } -// Run the rollback once, retrun true if we were the one that ran it. Caller +// Run the rollback once, return true if we were the one that ran it. Caller // should hold the statelock. func (rs *rollbackState) run(ctx context.Context) (ran bool, err error) { rs.once.Do(func() { From 73e35ebcdc249e225dc7606125d49fed9ac66f8e Mon Sep 17 00:00:00 2001 From: Brian Kassouf Date: Sun, 17 Mar 2019 23:34:13 -0700 Subject: [PATCH 3/8] Update rollback.go --- vault/rollback.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vault/rollback.go b/vault/rollback.go index 9c984c188626..1031decb2bdc 100644 --- a/vault/rollback.go +++ b/vault/rollback.go @@ -250,7 +250,7 @@ func (m *RollbackManager) Rollback(ctx context.Context, path string) error { // Check for an existing attempt or start one if none rs := m.startOrLookupRollback(ctx, fullPath, false) - // Do a run here in the event an allready inflight rollback is blocked on + // Do a run here in the event an already inflight rollback is blocked on // grabbing the statelock. This prevents a deadlock in some cases where the // caller of this function holds the write statelock. ran, err := rs.run(ctx) From f73659024436ce62d4e95b29b482235df072576d Mon Sep 17 00:00:00 2001 From: Brian Kassouf Date: Mon, 18 Mar 2019 08:39:03 -0700 Subject: [PATCH 4/8] move state creation --- vault/rollback.go | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/vault/rollback.go b/vault/rollback.go index 36a2e8573b03..15209f0f1d7a 100644 --- a/vault/rollback.go +++ b/vault/rollback.go @@ -152,6 +152,13 @@ func (m *RollbackManager) triggerRollbacks() { // startRollback is used to start an async rollback attempt. // This must be called with the inflightLock held. func (m *RollbackManager) startOrLookupRollback(ctx context.Context, fullPath string, grabStatelock bool) *rollbackState { + m.inflightLock.Lock() + defer m.inflightLock.Unlock() + rsInflight, ok := m.inflight[fullPath] + if ok { + return rsInflight + } + rs := &rollbackState{ rollbackFunc: func(ctx context.Context) error { ns, err := namespace.FromContext(ctx) @@ -189,13 +196,6 @@ func (m *RollbackManager) startOrLookupRollback(ctx context.Context, fullPath st }, } - m.inflightLock.Lock() - defer m.inflightLock.Unlock() - rsInflight, ok := m.inflight[fullPath] - if ok { - return rsInflight - } - // If no inflight rollback is already running, kick one off m.inflight[fullPath] = rs rs.Add(1) From 52fb167964f8bb907c8b18f331fa50a5a72f4100 Mon Sep 17 00:00:00 2001 From: Michel Vocks Date: Mon, 18 Mar 2019 08:40:32 -0700 Subject: [PATCH 5/8] Update vault/rollback.go Co-Authored-By: briankassouf --- vault/rollback.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vault/rollback.go b/vault/rollback.go index f271b3593c46..65ac3bde8d98 100644 --- a/vault/rollback.go +++ b/vault/rollback.go @@ -149,7 +149,7 @@ func (m *RollbackManager) triggerRollbacks() { } } -// startRollback is used to start an async rollback attempt. +// startOrLookupRollback is used to start an async rollback attempt. // This must be called with the inflightLock held. func (m *RollbackManager) startOrLookupRollback(ctx context.Context, fullPath string, grabStatelock bool) *rollbackState { m.inflightLock.Lock() From cc1ec953ee1a9d0d8e51fcd7dbc674c39e2129d5 Mon Sep 17 00:00:00 2001 From: Brian Kassouf Date: Mon, 18 Mar 2019 10:20:40 -0700 Subject: [PATCH 6/8] Simplify logic by canceling the lock grab --- vault/rollback.go | 129 +++++++++++++++++++++++----------------------- 1 file changed, 65 insertions(+), 64 deletions(-) diff --git a/vault/rollback.go b/vault/rollback.go index f271b3593c46..d7d010870e21 100644 --- a/vault/rollback.go +++ b/vault/rollback.go @@ -59,18 +59,7 @@ type RollbackManager struct { type rollbackState struct { lastError error sync.WaitGroup - once sync.Once - rollbackFunc func(context.Context) error -} - -// Run the rollback once, return true if we were the one that ran it. Caller -// should hold the statelock. -func (rs *rollbackState) run(ctx context.Context) (ran bool, err error) { - rs.once.Do(func() { - ran = true - err = rs.rollbackFunc(ctx) - }) - return + cancelLockGrabCh chan struct{} } // NewRollbackManager is used to create a new rollback manager @@ -160,40 +149,7 @@ func (m *RollbackManager) startOrLookupRollback(ctx context.Context, fullPath st } rs := &rollbackState{ - rollbackFunc: func(ctx context.Context) error { - ns, err := namespace.FromContext(ctx) - if err != nil { - return err - } - if ns == nil { - return namespace.ErrNoNamespace - } - - // Invoke a RollbackOperation - req := &logical.Request{ - Operation: logical.RollbackOperation, - Path: ns.TrimmedPath(fullPath), - } - - var cancelFunc context.CancelFunc - ctx, cancelFunc = context.WithTimeout(ctx, DefaultMaxRequestDuration) - _, err = m.router.Route(ctx, req) - cancelFunc() - - // If the error is an unsupported operation, then it doesn't - // matter, the backend doesn't support it. - if err == logical.ErrUnsupportedOperation { - err = nil - } - // If we failed due to read-only storage, we can't do anything; ignore - if err != nil && strings.Contains(err.Error(), logical.ErrReadOnly.Error()) { - err = nil - } - if err != nil { - m.logger.Error("error rolling back", "path", fullPath, "error", err) - } - return nil - }, + cancelLockGrabCh: make(chan struct{}), } // If no inflight rollback is already running, kick one off @@ -220,26 +176,76 @@ func (m *RollbackManager) attemptRollback(ctx context.Context, fullPath string, m.inflightLock.Unlock() }() + ns, err := namespace.FromContext(ctx) + if err != nil { + return err + } + if ns == nil { + return namespace.ErrNoNamespace + } + + // Invoke a RollbackOperation + req := &logical.Request{ + Operation: logical.RollbackOperation, + Path: ns.TrimmedPath(fullPath), + } + + releaseLock := true if grabStatelock { + doneCh := make(chan struct{}) + defer close(doneCh) + + stopCh := make(chan struct{}) + go func() { + defer close(stopCh) + + select { + case <-m.shutdownCh: + case <-rs.cancelLockGrabCh: + case <-doneCh: + } + }() + // Grab the statelock or stop - if stopped := grabLockOrStop(m.core.stateLock.RLock, m.core.stateLock.RUnlock, m.shutdownCh); stopped { - return errors.New("rollback shutting down") + if stopped := grabLockOrStop(m.core.stateLock.RLock, m.core.stateLock.RUnlock, stopCh); stopped { + // If we stopped due to shutdown, return. Otherwise another thread + // is holding the lock for us, continue on. + select { + case <-m.shutdownCh: + return errors.New("rollback shutting down") + default: + releaseLock = false + } } } - // Run the rollback - _, err = rs.run(ctx) - - if grabStatelock { + var cancelFunc context.CancelFunc + ctx, cancelFunc = context.WithTimeout(ctx, DefaultMaxRequestDuration) + _, err = m.router.Route(ctx, req) + if grabStatelock && releaseLock { m.core.stateLock.RUnlock() } + cancelFunc() + // If the error is an unsupported operation, then it doesn't + // matter, the backend doesn't support it. + if err == logical.ErrUnsupportedOperation { + err = nil + } + // If we failed due to read-only storage, we can't do anything; ignore + if err != nil && strings.Contains(err.Error(), logical.ErrReadOnly.Error()) { + err = nil + } + if err != nil { + m.logger.Error("error rolling back", "path", fullPath, "error", err) + } return } // Rollback is used to trigger an immediate rollback of the path, // or to join an existing rollback operation if in flight. Caller should have -// core's statelock held +// core's statelock held (write OR read). If an already inflight rollback is +// happening this function will simply wait for it to complete func (m *RollbackManager) Rollback(ctx context.Context, path string) error { ns, err := namespace.FromContext(ctx) if err != nil { @@ -250,18 +256,13 @@ func (m *RollbackManager) Rollback(ctx context.Context, path string) error { // Check for an existing attempt or start one if none rs := m.startOrLookupRollback(ctx, fullPath, false) - // Do a run here in the event an already inflight rollback is blocked on - // grabbing the statelock. This prevents a deadlock in some cases where the - // caller of this function holds the write statelock. - ran, err := rs.run(ctx) - // If we were the runner, return the error - if ran { - return err - } + // Since we have the statelock held, tell any inflight rollback to give up + // trying to aquire it. This will prevent deadlocks in the case where we + // have the write lock. + close(rs.cancelLockGrabCh) - // If we weren't the runner, wait for the inflight attempt to finish. It's - // safe to do this, since if the other thread starts the run they are - // already in possession of the statelock and we are not deadlocked. + // It's safe to do this, since the other thread either already has the lock + // held, or we just canceled it above. rs.Wait() // Return the last error From 5946e14932757ec1f8020483cadbe2b131f90399 Mon Sep 17 00:00:00 2001 From: Brian Kassouf Date: Mon, 18 Mar 2019 10:48:41 -0700 Subject: [PATCH 7/8] Use context instead of a chan --- vault/rollback.go | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/vault/rollback.go b/vault/rollback.go index 4932999db50f..a879434743f5 100644 --- a/vault/rollback.go +++ b/vault/rollback.go @@ -59,7 +59,8 @@ type RollbackManager struct { type rollbackState struct { lastError error sync.WaitGroup - cancelLockGrabCh chan struct{} + cancelLockGrabCtx context.Context + cancelLockGrabCtxCancel context.CancelFunc } // NewRollbackManager is used to create a new rollback manager @@ -148,8 +149,10 @@ func (m *RollbackManager) startOrLookupRollback(ctx context.Context, fullPath st return rsInflight } + cancelCtx, cancelFunc := context.WithCancel(context.Background()) rs := &rollbackState{ - cancelLockGrabCh: make(chan struct{}), + cancelLockGrabCtx: cancelCtx, + cancelLockGrabCtxCancel: cancelFunc, } // If no inflight rollback is already running, kick one off @@ -201,7 +204,7 @@ func (m *RollbackManager) attemptRollback(ctx context.Context, fullPath string, select { case <-m.shutdownCh: - case <-rs.cancelLockGrabCh: + case <-rs.cancelLockGrabCtx.Done(): case <-doneCh: } }() @@ -259,7 +262,7 @@ func (m *RollbackManager) Rollback(ctx context.Context, path string) error { // Since we have the statelock held, tell any inflight rollback to give up // trying to aquire it. This will prevent deadlocks in the case where we // have the write lock. - close(rs.cancelLockGrabCh) + rs.cancelLockGrabCtxCancel() // It's safe to do this, since the other thread either already has the lock // held, or we just canceled it above. From 3b5b9f507e34984e0a8ecefae82ebfed92d087e6 Mon Sep 17 00:00:00 2001 From: Jeff Mitchell Date: Mon, 18 Mar 2019 14:00:29 -0400 Subject: [PATCH 8/8] Update vault/rollback.go --- vault/rollback.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vault/rollback.go b/vault/rollback.go index a879434743f5..9dad02acdf7d 100644 --- a/vault/rollback.go +++ b/vault/rollback.go @@ -261,7 +261,9 @@ func (m *RollbackManager) Rollback(ctx context.Context, path string) error { // Since we have the statelock held, tell any inflight rollback to give up // trying to aquire it. This will prevent deadlocks in the case where we - // have the write lock. + // have the write lock. In the case where it was waiting to grab + // a read lock it will then simply continue with the rollback + // operation under the protection of our write lock. rs.cancelLockGrabCtxCancel() // It's safe to do this, since the other thread either already has the lock