Skip to content

Commit

Permalink
Run /start on the background for pods and up the timeout (#503)
Browse files Browse the repository at this point in the history
* Up Kustomize to 5.0.0

* CHANGELOG

* Update golangci-lint to prevent OOM with go1.20

* If old version of golangci-lint is installed, remove it and install a new one

* Fix the TestFailStart to take into account the async nature of start

(cherry picked from commit a5a7908)
  • Loading branch information
burmanm committed Mar 20, 2023
1 parent fc5404a commit ba1db4e
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 29 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ Changelog for Cass Operator, new PRs should update the `main / unreleased` secti

## unreleased

* [ENHANCEMENT] [#500](https://github.com/k8ssandra/cass-operator/issues/500) Allow the /start command to run for a longer period of time (up to 10 minutes), before killing the pod if no response is received. This is intermediate solution until we can correctly detect from the pod that the start is not proceeding correctly.
* [BUGFIX] [#444](https://github.com/k8ssandra/cass-operator/issues/444) Update cass-config-builder to 1.0.5.
* [BUGFIX] [#415](https://github.com/k8ssandra/cass-operator/issues/415) Fix version override + imageRegistry issue where output would be invalid
* [BUGFIX] [#437](https://github.com/k8ssandra/cass-operator/issues/437) Ignore cluster healthy check on Datacenter decommission. Rest of #437 fix is not applied since this version does not have that bug.
Expand Down
2 changes: 1 addition & 1 deletion pkg/httphelper/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -708,7 +708,7 @@ func (client *NodeMgmtClient) CallLifecycleStartEndpointWithReplaceIp(pod *corev
endpoint: endpoint,
host: podIP,
method: http.MethodPost,
timeout: 10 * time.Second,
timeout: 10 * time.Minute,
}

_, err := callNodeMgmtEndpoint(client, request, "")
Expand Down
52 changes: 26 additions & 26 deletions pkg/reconciliation/reconcile_racks.go
Original file line number Diff line number Diff line change
Expand Up @@ -1804,7 +1804,6 @@ func (rc *ReconciliationContext) findStartedNotReadyNodes() (bool, error) {

func (rc *ReconciliationContext) startCassandra(endpointData httphelper.CassMetadataEndpoints, pod *corev1.Pod) error {
dc := rc.Datacenter
mgmtClient := rc.NodeMgmtClient

// Are we replacing this node?
shouldReplacePod := utils.IndexOfString(dc.Status.NodeReplacements, pod.Name) > -1
Expand All @@ -1829,33 +1828,34 @@ func (rc *ReconciliationContext) startCassandra(endpointData httphelper.CassMeta
}
}

var err error

if shouldReplacePod && replaceAddress != "" {
// If we have a replace address that means the cassandra node did
// join the ring previously and is marked for replacement, so we
// start it accordingly
rc.Recorder.Eventf(rc.Datacenter, corev1.EventTypeNormal, events.StartingCassandraAndReplacingNode,
"Starting Cassandra for pod %s to replace Cassandra node with address %s", pod.Name, replaceAddress)
err = mgmtClient.CallLifecycleStartEndpointWithReplaceIp(pod, replaceAddress)
} else {
// Either we are not replacing this pod or the relevant cassandra node
// never joined the ring in the first place and can be started normally
rc.Recorder.Eventf(rc.Datacenter, corev1.EventTypeNormal, events.StartingCassandra,
"Starting Cassandra for pod %s", pod.Name)
err = mgmtClient.CallLifecycleStartEndpoint(pod)
}
go func(pod *corev1.Pod) {
var err error
if shouldReplacePod && replaceAddress != "" {
// If we have a replace address that means the cassandra node did
// join the ring previously and is marked for replacement, so we
// start it accordingly
rc.Recorder.Eventf(rc.Datacenter, corev1.EventTypeNormal, events.StartingCassandraAndReplacingNode,
"Starting Cassandra for pod %s to replace Cassandra node with address %s", pod.Name, replaceAddress)
err = rc.NodeMgmtClient.CallLifecycleStartEndpointWithReplaceIp(pod, replaceAddress)
} else {
// Either we are not replacing this pod or the relevant cassandra node
// never joined the ring in the first place and can be started normally
rc.Recorder.Eventf(rc.Datacenter, corev1.EventTypeNormal, events.StartingCassandra,
"Starting Cassandra for pod %s", pod.Name)
err = rc.NodeMgmtClient.CallLifecycleStartEndpoint(pod)
}

if err != nil {
// Pod was unable to start. Most likely this is not a recoverable error, so lets kill the pod and
// try again.
if deleteErr := rc.Client.Delete(rc.Ctx, pod); deleteErr != nil {
return deleteErr
if err != nil {
// Pod was unable to start. Most likely this is not a recoverable error, so lets kill the pod and
// try again.
if deleteErr := rc.Client.Delete(rc.Ctx, pod); deleteErr != nil {
rc.ReqLogger.Error(err, "Unable to delete the pod, pod has failed to start", "Pod", pod.Name)
}
rc.Recorder.Eventf(rc.Datacenter, corev1.EventTypeWarning, events.StartingCassandra,
"Failed to start pod %s, deleting it", pod.Name)
}
rc.Recorder.Eventf(rc.Datacenter, corev1.EventTypeWarning, events.StartingCassandra,
"Failed to start pod %s, deleting it", pod.Name)
return err
}
}(pod)

return rc.labelServerPodStarting(pod)
}

Expand Down
17 changes: 15 additions & 2 deletions pkg/reconciliation/reconcile_racks_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1636,7 +1636,13 @@ func TestFailedStart(t *testing.T) {
mockClient := &mocks.Client{}
rc.Client = mockClient

k8sMockClientDelete(mockClient, nil).Times(1)
done := make(chan struct{})
k8sMockClientDelete(mockClient, nil).Once().Run(func(mock.Arguments) { close(done) })

// Patch labelStarting, lastNodeStarted..
k8sMockClientPatch(mockClient, nil).Once()
k8sMockClientStatus(mockClient, mockClient).Once()
k8sMockClientPatch(mockClient, nil).Once()

res := &http.Response{
StatusCode: http.StatusInternalServerError,
Expand Down Expand Up @@ -1670,7 +1676,14 @@ func TestFailedStart(t *testing.T) {
rc.Recorder = fakeRecorder

err := rc.startCassandra(epData, pod)
assert.Error(t, err)
// The start is async method, so the error is not returned here
assert.Nil(t, err)

select {
case <-done:
case <-time.After(2 * time.Second):
assert.Fail(t, "No pod delete occurred")
}

close(fakeRecorder.Events)
// Should have 2 events, one to indicate Cassandra is starting, one to indicate it failed to start
Expand Down

0 comments on commit ba1db4e

Please sign in to comment.