requeue after the last node has its node-state label set to Started during cluster creation #77

Merged · 5 commits · May 8, 2020

Changes from 4 commits
19 changes: 19 additions & 0 deletions mage/ginkgo/lib.go
@@ -240,6 +240,25 @@ func (ns *NsWrapper) WaitForDatacenterOperatorProgress(dcName string, progressVa
	ns.WaitForOutputAndLog(step, k, progressValue, timeout)
}

func (ns *NsWrapper) WaitForSuperUserUpserted(dcName string, timeout int) {
	json := "jsonpath={.status.superUserUpserted}"
	k := kubectl.Get("CassandraDatacenter", dcName).
		FormatOutput(json)
	execErr := ns.WaitForOutputPattern(k, `\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z`, timeout)
	Expect(execErr).ToNot(HaveOccurred())
}

func (ns *NsWrapper) GetNodeStatusesHostIds(dcName string) []string {
	json := "jsonpath={.status.nodeStatuses['*'].hostID}"
	k := kubectl.Get("CassandraDatacenter", dcName).
		FormatOutput(json)

	output := ns.OutputPanic(k)
	hostIds := strings.Split(output, " ")

	return hostIds
}

func (ns *NsWrapper) WaitForDatacenterReadyPodCount(dcName string, count int) {
	timeout := count * 400
	step := "waiting for the node to become ready"
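As a quick usage sketch (not part of this diff), a test can combine these two new helpers to block until the super user has been upserted and then assert that the operator recorded a host ID for every node; the 600-second timeout and the expected count of 6 below simply mirror the scale-up test added later in this PR:

	// Wait for .status.superUserUpserted to be populated with a timestamp.
	ns.WaitForSuperUserUpserted(dcName, 600)

	// Once every node has started, .status.nodeStatuses should hold one hostID per node.
	hostIds := ns.GetNodeStatusesHostIds(dcName)
	Expect(len(hostIds)).To(Equal(6))
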
21 changes: 16 additions & 5 deletions operator/pkg/reconciliation/reconcile_racks.go
@@ -432,7 +432,7 @@ func (rc *ReconciliationContext) CheckPodsReady(endpointData httphelper.CassMeta

	// step 1 - see if any nodes are already coming up

	nodeIsStarting, err := rc.findStartingNodes()
	nodeIsStarting, nodeStarted, err := rc.findStartingNodes()

	if err != nil {
		return result.Error(err)
@@ -476,6 +476,9 @@ func (rc *ReconciliationContext) CheckPodsReady(endpointData httphelper.CassMeta
	desiredSize := int(rc.Datacenter.Spec.Size)

	if desiredSize == readyPodCount && desiredSize == startedLabelCount {
		if nodeStarted {
Collaborator:

I think you should add a one line comment here.

Also, why not move this directly under

	nodeIsStarting, nodeStarted, err := rc.findStartingNodes()

?

Collaborator (Author):

Comment added.

I hadn't really considered moving the if block. That will require some further discussion to ensure I don't wind up introducing some regressions.

Collaborator:

Yeah, let's not introduce instability. I can tackle this when refactoring some stuff in an upcoming PR.

			return result.RequeueSoon(2)
		}
		return result.Continue()
	} else {
		err := fmt.Errorf("checks failed desired:%d, ready:%d, started:%d", desiredSize, readyPodCount, startedLabelCount)
@@ -1332,7 +1335,13 @@ func (rc *ReconciliationContext) callNodeManagementStart(pod *corev1.Pod) error
	return err
}

func (rc *ReconciliationContext) findStartingNodes() (bool, error) {
// Checks to see if any node is starting. This is done by checking to see if the cassandra.datastax.com/node-state label
// has a value of Starting. If it does then check to see if the C* node is ready. If the node is ready, the pod's
// cassandra.datastax.com/node-state label is set to a value of Started. This function returns two bools and an error.
// The first bool is true if there is a C* node that is Starting. The second bool is set to true if a C* node has just
// transitioned to the Started state by having its cassandra.datastax.com/node-state label set to Started. The error is
// non-nil if updating the pod's labels fails.
func (rc *ReconciliationContext) findStartingNodes() (bool, bool, error) {
	rc.ReqLogger.Info("reconcile_racks::findStartingNodes")

	for _, pod := range rc.clusterPods {
@@ -1341,7 +1350,9 @@
				rc.Recorder.Eventf(rc.Datacenter, corev1.EventTypeNormal, events.StartedCassandra,
					"Started Cassandra for pod %s", pod.Name)
				if err := rc.labelServerPodStarted(pod); err != nil {
					return false, err
					return false, false, err
				} else {
					return false, true, nil
				}
			} else {
				// TODO Calling start again on the pod seemed like a good defensive practice
@@ -1351,11 +1362,11 @@
				// if err := rc.callNodeManagementStart(pod); err != nil {
				// 	return false, err
				// }
				return true, nil
				return true, false, nil
			}
		}
	}
	return false, nil
	return false, false, nil
}

func (rc *ReconciliationContext) findStartedNotReadyNodes() (bool, error) {
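Pulling the hunks above together, the intended control flow in CheckPodsReady is roughly the following; this is a condensed paraphrase of the diff rather than the full function, and result.Error, result.RequeueSoon, and result.Continue are the operator's existing result helpers:

	nodeIsStarting, nodeStarted, err := rc.findStartingNodes()
	if err != nil {
		return result.Error(err)
	}

	// ... readiness checks and started-label counting elided ...

	if desiredSize == readyPodCount && desiredSize == startedLabelCount {
		// If a node just transitioned to Started while the datacenter is now fully
		// ready, requeue soon so follow-on status work (e.g. upserting the super
		// user) is not delayed until the next unrelated reconcile.
		if nodeStarted {
			return result.RequeueSoon(2)
		}
		return result.Continue()
	}
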
@@ -0,0 +1,71 @@
package scale_up_status_updates

import (
	"fmt"
	"testing"

	. "github.com/onsi/ginkgo"
	. "github.com/onsi/gomega"

	ginkgo_util "github.com/datastax/cass-operator/mage/ginkgo"
	"github.com/datastax/cass-operator/mage/kubectl"
)

var (
	testName     = "Scale up status updates"
	namespace    = "test-scale-up-status-updates"
	dcName       = "dc1"
	dcYaml       = "../testdata/oss-two-rack-six-node-dc.yaml"
	operatorYaml = "../testdata/operator.yaml"
	dcResource   = fmt.Sprintf("CassandraDatacenter/%s", dcName)
	dcLabel      = fmt.Sprintf("cassandra.datastax.com/datacenter=%s", dcName)
	ns           = ginkgo_util.NewWrapper(testName, namespace)
)

func TestLifecycle(t *testing.T) {
	AfterSuite(func() {
		logPath := fmt.Sprintf("%s/aftersuite", ns.LogDir)
		kubectl.DumpAllLogs(logPath).ExecV()
		fmt.Printf("\n\tPost-run logs dumped at: %s\n\n", logPath)
		ns.Terminate()
	})

	RegisterFailHandler(Fail)
	RunSpecs(t, testName)
}

var _ = Describe(testName, func() {
	Context("when in a new cluster", func() {
		Specify("the operator can scale up a datacenter and does not upsert the super user until all nodes have been started", func() {
			By("creating a namespace")
			err := kubectl.CreateNamespace(namespace).ExecV()
			Expect(err).ToNot(HaveOccurred())

			step := "setting up cass-operator resources via helm chart"
			ns.HelmInstall("../../charts/cass-operator-chart")

			ns.WaitForOperatorReady()

			step = "creating a datacenter resource with 2 racks/6 nodes"
			k := kubectl.ApplyFiles(dcYaml)
			ns.ExecAndLog(step, k)

			ns.WaitForSuperUserUpserted(dcName, 600)

			step = "checking that all nodes have been started"
			nodeStatusesHostIds := ns.GetNodeStatusesHostIds(dcName)
			Expect(len(nodeStatusesHostIds)).To(Equal(6))

			step = "deleting the dc"
			k = kubectl.DeleteFromFiles(dcYaml)
			ns.ExecAndLog(step, k)

			step = "checking that the dc no longer exists"
			json := "jsonpath={.items}"
			k = kubectl.Get("CassandraDatacenter").
				WithLabel(dcLabel).
				FormatOutput(json)
			ns.WaitForOutputAndLog(step, k, "[]", 300)
		})
Collaborator:

Most of our tests typically check that the dc can be successfully deleted at the end of the test scenario. It probably isn't super important to have on every single one (especially since you aren't doing any crazy patching or unusual things), so I'll let you decide if you want to add the logic to this test or not. See our scale up test for reference if you choose to (again, purely optional from my perspective).

Collaborator (Author):

I will add the deletion check(s). I would rather be consistent with other tests.

	})
})
28 changes: 28 additions & 0 deletions tests/testdata/oss-two-rack-six-node-dc.yaml
@@ -0,0 +1,28 @@
apiVersion: cassandra.datastax.com/v1beta1
kind: CassandraDatacenter
metadata:
  name: dc1
spec:
  clusterName: cluster1
  serverType: cassandra
  serverVersion: "3.11.6"
  serverImage: datastax/cassandra-mgmtapi-3_11_6:v0.1.0
  configBuilderImage: datastax/cass-config-builder:1.0.0
  managementApiAuth:
    insecure: {}
  size: 6
  storageConfig:
    cassandraDataVolumeClaimSpec:
      storageClassName: server-storage
      accessModes:
        - ReadWriteOnce
      resources:
        requests:
          storage: 1Gi
  racks:
    - name: r1
    - name: r2
  config:
    jvm-options:
      initial_heap_size: "800m"
      max_heap_size: "800m"