From 37be5fa95518c6fd1c856b58a827d37b98d9ed8f Mon Sep 17 00:00:00 2001 From: Edward Welch Date: Thu, 9 Jul 2020 14:37:23 -0400 Subject: [PATCH 1/3] Loki: Use a new context to update the ring state after a failed chunk transfer --- pkg/ingester/transfer.go | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/pkg/ingester/transfer.go b/pkg/ingester/transfer.go index ac30baf5dfe6..16c488effd54 100644 --- a/pkg/ingester/transfer.go +++ b/pkg/ingester/transfer.go @@ -59,10 +59,19 @@ func (i *Ingester) TransferChunks(stream logproto.Ingester_TransferChunksServer) // Enter PENDING state (only valid from JOINING) if i.lifecycler.GetState() == ring.JOINING { - if err := i.lifecycler.ChangeState(stream.Context(), ring.PENDING); err != nil { - level.Error(logger).Log("msg", "error rolling back failed TransferChunks", "err", err) + // Create a new context here to attempt to update the state back to pending to allow + // a failed transfer to try again. If we fail to set the state back to PENDING then + // exit Loki as we will effectively be hung anyway stuck in a JOINING state and will + // never join. + ctx := context.Background() + ctx, cancel := context.WithTimeout(ctx, 1*time.Minute) + if err := i.lifecycler.ChangeState(ctx, ring.PENDING); err != nil { + level.Error(logger).Log("msg", "failed to update the ring state back to PENDING after "+ + "a chunk transfer failure, there is nothing more Loki can do from this state "+ + "so the process will exit...", "err", err) os.Exit(1) } + cancel() } }() From 0af311e987cb37db1d695a53806a4dd74987ca70 Mon Sep 17 00:00:00 2001 From: Ed Welch Date: Thu, 9 Jul 2020 15:45:05 -0400 Subject: [PATCH 2/3] Update pkg/ingester/transfer.go Co-authored-by: Cyril Tovena --- pkg/ingester/transfer.go | 1 - 1 file changed, 1 deletion(-) diff --git a/pkg/ingester/transfer.go b/pkg/ingester/transfer.go index 16c488effd54..26e5f4ef6dd1 100644 --- a/pkg/ingester/transfer.go +++ b/pkg/ingester/transfer.go @@ -63,7 +63,6 @@ func (i *Ingester) TransferChunks(stream logproto.Ingester_TransferChunksServer) // a failed transfer to try again. If we fail to set the state back to PENDING then // exit Loki as we will effectively be hung anyway stuck in a JOINING state and will // never join. - ctx := context.Background() ctx, cancel := context.WithTimeout(ctx, 1*time.Minute) if err := i.lifecycler.ChangeState(ctx, ring.PENDING); err != nil { level.Error(logger).Log("msg", "failed to update the ring state back to PENDING after "+ From fef5dcb0f1ac00864e1d2a216fc945a025edbd38 Mon Sep 17 00:00:00 2001 From: Ed Welch Date: Thu, 9 Jul 2020 15:45:10 -0400 Subject: [PATCH 3/3] Update pkg/ingester/transfer.go Co-authored-by: Cyril Tovena --- pkg/ingester/transfer.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/ingester/transfer.go b/pkg/ingester/transfer.go index 26e5f4ef6dd1..fc67087d2fc6 100644 --- a/pkg/ingester/transfer.go +++ b/pkg/ingester/transfer.go @@ -63,7 +63,7 @@ func (i *Ingester) TransferChunks(stream logproto.Ingester_TransferChunksServer) // a failed transfer to try again. If we fail to set the state back to PENDING then // exit Loki as we will effectively be hung anyway stuck in a JOINING state and will // never join. - ctx, cancel := context.WithTimeout(ctx, 1*time.Minute) + ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute) if err := i.lifecycler.ChangeState(ctx, ring.PENDING); err != nil { level.Error(logger).Log("msg", "failed to update the ring state back to PENDING after "+ "a chunk transfer failure, there is nothing more Loki can do from this state "+