Add 'upgradeStrategy' field to RayServiceSpec, allowing users to disable/enable zero-downtime update.
chiayi committed Nov 7, 2024
1 parent 5cb2f56 commit 932941b
Showing 7 changed files with 109 additions and 10 deletions.
14 changes: 14 additions & 0 deletions docs/reference/api.md
@@ -207,12 +207,26 @@ _Appears in:_
| `serviceUnhealthySecondThreshold` _integer_ | Deprecated: This field is not used anymore. ref: https://github.com/ray-project/kuberay/issues/1685 | | |
| `deploymentUnhealthySecondThreshold` _integer_ | Deprecated: This field is not used anymore. ref: https://github.com/ray-project/kuberay/issues/1685 | | |
| `serveService` _[Service](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#service-v1-core)_ | ServeService is the Kubernetes service for head node and worker nodes who have healthy http proxy to serve traffics. | | |
| `upgradeStrategy` _[RayServiceUpgradeStrategy](#rayserviceupgradestrategy)_ | UpgradeStrategy represents the strategy used when upgrading the RayService. Currently supports `NewCluster` and `None` | NewCluster | |
| `serveConfigV2` _string_ | Important: Run "make" to regenerate code after modifying this file<br />Defines the applications and deployments to deploy, should be a YAML multi-line scalar string. | | |
| `rayClusterConfig` _[RayClusterSpec](#rayclusterspec)_ | | | |




#### RayServiceUpgradeStrategy

_Underlying type:_ _string_





_Appears in:_
- [RayServiceSpec](#rayservicespec)
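For illustration, a minimal RayService manifest using the new field might look like the sketch below. The resource name, image tag, and serve application are placeholders rather than values from this change; only `upgradeStrategy` and its two valid values (`NewCluster`, the default, and `None`) come from the API documented above.

```yaml
# Hedged sketch: name, image, and serve app are illustrative placeholders.
apiVersion: ray.io/v1
kind: RayService
metadata:
  name: example-rayservice
spec:
  # NewCluster (default) performs zero-downtime upgrades by creating a new RayCluster;
  # None disables zero-downtime upgrades for this RayService.
  upgradeStrategy: None
  serveConfigV2: |
    applications:
      - name: example_app
        import_path: example_module:app
  rayClusterConfig:
    headGroupSpec:
      rayStartParams: {}
      template:
        spec:
          containers:
            - name: ray-head
              image: rayproject/ray:2.9.0
```

Any value other than `NewCluster` or `None` is rejected by the validation added in `validateRayServiceSpec` further down in this commit.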



#### ScaleStrategy


3 changes: 3 additions & 0 deletions helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml

Some generated files are not rendered by default.

12 changes: 12 additions & 0 deletions ray-operator/apis/ray/v1/rayservice_types.go
@@ -20,6 +20,15 @@ const (
FailedToUpdateService ServiceStatus = "FailedToUpdateService"
)

type RayServiceUpgradeStrategy string

const (
// During an upgrade, the NewCluster strategy creates a new upgraded cluster and switches to it once it becomes ready
NewCluster RayServiceUpgradeStrategy = "NewCluster"
// The None strategy means downtime during an upgrade while waiting for the new cluster to become ready
None RayServiceUpgradeStrategy = "None"
)

// These statuses should match Ray Serve's application statuses
// See `enum ApplicationStatus` in https://sourcegraph.com/github.com/ray-project/ray/-/blob/src/ray/protobuf/serve.proto for more details.
var ApplicationStatusEnum = struct {
@@ -57,6 +66,9 @@ type RayServiceSpec struct {
DeploymentUnhealthySecondThreshold *int32 `json:"deploymentUnhealthySecondThreshold,omitempty"`
// ServeService is the Kubernetes service for head node and worker nodes who have healthy http proxy to serve traffics.
ServeService *corev1.Service `json:"serveService,omitempty"`
// UpgradeStrategy represents the strategy used when upgrading the RayService. Currently supports `NewCluster` and `None`
// +kubebuilder:default:=NewCluster
UpgradeStrategy RayServiceUpgradeStrategy `json:"upgradeStrategy,omitempty"`
// Important: Run "make" to regenerate code after modifying this file
// Defines the applications and deployments to deploy, should be a YAML multi-line scalar string.
ServeConfigV2 string `json:"serveConfigV2,omitempty"`
3 changes: 3 additions & 0 deletions ray-operator/config/crd/bases/ray.io_rayservices.yaml

Some generated files are not rendered by default.

19 changes: 17 additions & 2 deletions ray-operator/controllers/ray/rayservice_controller.go
@@ -248,6 +248,12 @@ func validateRayServiceSpec(rayService *rayv1.RayService) error {
if headSvc := rayService.Spec.RayClusterSpec.HeadGroupSpec.HeadService; headSvc != nil && headSvc.Name != "" {
return fmt.Errorf("spec.rayClusterConfig.headGroupSpec.headService.metadata.name should not be set")
}

if upgradeStrategy := rayService.Spec.UpgradeStrategy; upgradeStrategy != "" {
if upgradeStrategy != rayv1.NewCluster && upgradeStrategy != rayv1.None {
return fmt.Errorf("spec.UpgradeStrategy is invalid, valid options are %s and %s", rayv1.NewCluster, rayv1.None)
}
}
return nil
}

@@ -424,10 +430,19 @@ func (r *RayServiceReconciler) reconcileRayCluster(ctx context.Context, rayServi
if clusterAction == RolloutNew {
// For LLM serving, some users might not have sufficient GPU resources to run two RayClusters simultaneously.
// Therefore, KubeRay offers ENABLE_ZERO_DOWNTIME as a feature flag for zero-downtime upgrades.
zeroDowntimeEnvVar := os.Getenv(ENABLE_ZERO_DOWNTIME)
rayServiceSpecUpgradeStrategy := rayServiceInstance.Spec.UpgradeStrategy
// There are two ways to control zero-downtime upgrades: the ENABLE_ZERO_DOWNTIME env var and Spec.UpgradeStrategy.
// If neither is set, zero-downtime upgrade is enabled by default.
// Spec.UpgradeStrategy takes precedence over ENABLE_ZERO_DOWNTIME.
enableZeroDowntime := true
if zeroDowntimeEnvVar != "" {
enableZeroDowntime = strings.ToLower(zeroDowntimeEnvVar) != "false"
}
if rayServiceSpecUpgradeStrategy != "" {
enableZeroDowntime = rayServiceSpecUpgradeStrategy == rayv1.NewCluster
}

if enableZeroDowntime || !enableZeroDowntime && activeRayCluster == nil {
// Add a pending cluster name. In the next reconcile loop, shouldPrepareNewRayCluster will return DoNothing and we will
// actually create the pending RayCluster instance.
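To make the precedence above concrete: Spec.UpgradeStrategy, when set, overrides the operator-level ENABLE_ZERO_DOWNTIME env var, and leaving both unset keeps zero-downtime upgrades enabled. A hedged sketch of the two settings follows; the operator Deployment layout, names, and image tag are assumptions for illustration, not part of this commit.

```yaml
# Assumed operator Deployment: disables zero-downtime upgrades operator-wide via the env var.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: kuberay-operator            # assumed name
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: kuberay-operator
  template:
    metadata:
      labels:
        app.kubernetes.io/name: kuberay-operator
    spec:
      containers:
        - name: kuberay-operator
          image: quay.io/kuberay/operator:v1.2.2   # assumed image/tag
          env:
            - name: ENABLE_ZERO_DOWNTIME
              value: "false"
---
# This RayService still gets a zero-downtime upgrade: the spec field wins over the env var.
apiVersion: ray.io/v1
kind: RayService
metadata:
  name: example-rayservice          # placeholder
spec:
  upgradeStrategy: NewCluster
  # serveConfigV2 and rayClusterConfig omitted for brevity
```

This combination corresponds to Test 8 in the unit tests below; Test 9 covers the inverse case (env var set to true, `upgradeStrategy: None`).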
58 changes: 50 additions & 8 deletions ray-operator/controllers/ray/rayservice_controller_unit_test.go
@@ -44,6 +44,13 @@ func TestValidateRayServiceSpec(t *testing.T) {
Spec: rayv1.RayServiceSpec{},
})
assert.NoError(t, err, "The RayService spec is valid.")

err = validateRayServiceSpec(&rayv1.RayService{
Spec: rayv1.RayServiceSpec{
UpgradeStrategy: "invalidStrategy",
},
})
assert.Error(t, err, "spec.UpgradeStrategy is invalid")
}

func TestGenerateHashWithoutReplicasAndWorkersToDelete(t *testing.T) {
@@ -762,12 +769,13 @@ func TestReconcileRayCluster(t *testing.T) {
}

tests := map[string]struct {
activeCluster *rayv1.RayCluster
rayServiceUpgradeStrategy rayv1.RayServiceUpgradeStrategy
kubeRayVersion string
updateRayClusterSpec bool
enableZeroDowntime bool
shouldPrepareNewCluster bool
updateKubeRayVersion bool
}{
// Test 1: Neither active nor pending clusters exist. The `markRestart` function will be called, so the `PendingServiceStatus.RayClusterName` should be set.
"Zero-downtime upgrade is enabled. Neither active nor pending clusters exist.": {
@@ -790,8 +798,8 @@
enableZeroDowntime: true,
shouldPrepareNewCluster: true,
},
// Test 4: The active cluster exists. Zero-downtime upgrade is disabled, so it should not trigger a zero-downtime upgrade.
"Zero-downtime upgrade is disabled. The active cluster exists. Does not trigger the zero-downtime upgrade.": {
activeCluster: activeCluster.DeepCopy(),
updateRayClusterSpec: true,
enableZeroDowntime: false,
@@ -815,6 +823,37 @@
updateKubeRayVersion: true,
kubeRayVersion: "new-version",
},
// Test 7: Zero-downtime upgrade is enabled, but through the RayServiceSpec rather than the env var.
"Zero-downtime upgrade enabled. The active cluster exists. Zero-downtime upgrade is triggered through RayServiceSpec.": {
activeCluster: activeCluster.DeepCopy(),
updateRayClusterSpec: true,
enableZeroDowntime: true,
shouldPrepareNewCluster: true,
rayServiceUpgradeStrategy: rayv1.NewCluster,
},
// Test 8: Zero-downtime upgrade is enabled. The env var is set to false but RayServiceSpec is set to NewCluster. Trigger the zero-downtime upgrade.
"Zero-downtime upgrade is enabled through RayServiceSpec and not through env var. Active cluster exists. Trigger the zero-downtime upgrade.": {
activeCluster: activeCluster.DeepCopy(),
updateRayClusterSpec: true,
enableZeroDowntime: false,
shouldPrepareNewCluster: true,
rayServiceUpgradeStrategy: rayv1.NewCluster,
},
// Test 9: Zero-downtime upgrade is disabled. The env var is set to true but RayServiceSpec is set to None.
"Zero-downtime upgrade is disabled. Env var is set to true but RayServiceSpec is set to None.": {
activeCluster: activeCluster.DeepCopy(),
updateRayClusterSpec: true,
enableZeroDowntime: true,
shouldPrepareNewCluster: false,
rayServiceUpgradeStrategy: rayv1.None,
},
// Test 10: Zero-downtime upgrade is enabled. Neither the env var nor the RayServiceSpec is set. Trigger the zero-downtime upgrade.
"Zero-downtime upgrade is enabled. Neither the env var nor the RayServiceSpec is set.": {
activeCluster: nil,
updateRayClusterSpec: true,
shouldPrepareNewCluster: true,
rayServiceUpgradeStrategy: "",
},
}

for name, tc := range tests {
@@ -824,6 +863,9 @@
if !tc.enableZeroDowntime {
os.Setenv(ENABLE_ZERO_DOWNTIME, "false")
}
if tc.rayServiceUpgradeStrategy != "" {
rayService.Spec.UpgradeStrategy = tc.rayServiceUpgradeStrategy
}
runtimeObjects := []runtime.Object{}
if tc.activeCluster != nil {
// Update 'ray.io/kuberay-version' to a new version if kubeRayVersion is set.

Some generated files are not rendered by default.
