From 91b2e945cd0e05805211dd3cc705cf6bdf5b36ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Leszczy=C5=84ski?= <2000michal@wp.pl> Date: Thu, 28 Dec 2023 11:11:13 +0100 Subject: [PATCH 1/3] refactor(repair): add job_id to finished job log --- pkg/service/repair/worker.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/service/repair/worker.go b/pkg/service/repair/worker.go index cbce83376a..7343122589 100644 --- a/pkg/service/repair/worker.go +++ b/pkg/service/repair/worker.go @@ -49,7 +49,7 @@ func (w *worker) runRepair(ctx context.Context, j job) (out error) { ) // Decorate returned error defer func() { - w.logger.Info(ctx, "Repair done") + w.logger.Info(ctx, "Repair done", "job_id", jobID) // Try to justify error by checking table deletion if out != nil && w.isTableDeleted(ctx, j) { out = errTableDeleted From 5a05eb332b3614a93e10f5a0787e443d5cbfba47 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Leszczy=C5=84ski?= <2000michal@wp.pl> Date: Thu, 28 Dec 2023 11:32:31 +0100 Subject: [PATCH 2/3] fix(repair): remove default timeout from RepairStatus Since StorageServiceRepairStatus (without timeout param) returns only when the repair job has finished, we shouldn't apply the default client timeout on our end (even if backoff retry could handle that); the call now uses a dedicated 30-minute timeout instead. 
This resulted in many backoff errors in SM logs even on successful repair: {"L":"INFO","T":"2023-12-01T01:31:54.398Z","N":"cluster.client","M":"HTTP retry backoff","operation":"StorageServiceRepairStatus","wait":"28.607063257s","error":"after 16m0s: context deadline exceeded","_trace_id":"uqSjtDSfRoOl1WhetiLPgA"} --- pkg/scyllaclient/client_scylla.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pkg/scyllaclient/client_scylla.go b/pkg/scyllaclient/client_scylla.go index 2f48737565..f6a5facc32 100644 --- a/pkg/scyllaclient/client_scylla.go +++ b/pkg/scyllaclient/client_scylla.go @@ -513,9 +513,12 @@ func repairStatusShouldRetryHandler(err error) *bool { return nil } +const repairStatusTimeout = 30 * time.Minute + // RepairStatus waits for repair job to finish and returns its status. func (c *Client) RepairStatus(ctx context.Context, host string, id int32) (CommandStatus, error) { ctx = forceHost(ctx, host) + ctx = customTimeout(ctx, repairStatusTimeout) ctx = withShouldRetryHandler(ctx, repairStatusShouldRetryHandler) var ( resp interface { From 9ffd97777509391eab4c76753d87efa9104db970 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Leszczy=C5=84ski?= <2000michal@wp.pl> Date: Thu, 28 Dec 2023 11:35:39 +0100 Subject: [PATCH 3/3] fix(restore): remove default timeout from LoadSSTables Since StorageServiceSstablesByKeyspacePost returns only when load&stream has finished, we shouldn't apply the default client timeout on our end (even if backoff retry could handle that); the call now uses a dedicated one-hour timeout instead. 
This resulted in many backoff errors in SM logs even on successful restore: {"L":"INFO","T":"2023-11-30T23:03:07.117Z","N":"cluster.client","M":"HTTP retry backoff","operation":"StorageServiceSstablesByKeyspacePost","wait":"999.175032ms","error":"after 30s: context deadline exceeded","_trace_id":"uqSjtDSfRoOl1WhetiLPgA"} --- pkg/scyllaclient/client_scylla.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pkg/scyllaclient/client_scylla.go b/pkg/scyllaclient/client_scylla.go index f6a5facc32..a172e57067 100644 --- a/pkg/scyllaclient/client_scylla.go +++ b/pkg/scyllaclient/client_scylla.go @@ -899,6 +899,8 @@ func (c *Client) TableDiskSizeReport(ctx context.Context, hostKeyspaceTables Hos return report, err } +const loadSSTablesTimeout = time.Hour + // LoadSSTables that are already downloaded to host's table upload directory. // Used API endpoint has the following properties: // - It is synchronous - response is received only after the loading has finished @@ -909,7 +911,7 @@ func (c *Client) LoadSSTables(ctx context.Context, host, keyspace, table string, const WIPError = "Already loading SSTables" _, err := c.scyllaOps.StorageServiceSstablesByKeyspacePost(&operations.StorageServiceSstablesByKeyspacePostParams{ - Context: forceHost(ctx, host), + Context: customTimeout(forceHost(ctx, host), loadSSTablesTimeout), Keyspace: keyspace, Cf: table, LoadAndStream: &loadAndStream,