Skip to content

Commit

Permalink
Add 'upgradeStrategy' field to RayServiceSpec, allowing users to disab…
Browse files Browse the repository at this point in the history
…le/enable zero-downtime update.
  • Loading branch information
chiayi committed Nov 4, 2024
1 parent 5cb2f56 commit 4a531cb
Show file tree
Hide file tree
Showing 7 changed files with 70 additions and 8 deletions.
14 changes: 14 additions & 0 deletions docs/reference/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -207,12 +207,26 @@ _Appears in:_
| `serviceUnhealthySecondThreshold` _integer_ | Deprecated: This field is not used anymore. ref: https://github.com/ray-project/kuberay/issues/1685 | | |
| `deploymentUnhealthySecondThreshold` _integer_ | Deprecated: This field is not used anymore. ref: https://github.com/ray-project/kuberay/issues/1685 | | |
| `serveService` _[Service](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#service-v1-core)_ | ServeService is the Kubernetes service for head node and worker nodes who have healthy http proxy to serve traffics. | | |
| `upgradeStrategy` _[RayServiceUpgradeStrategy](#rayserviceupgradestrategy)_ | UpgradeStrategy represents the strategy used when upgrading the RayService. Currently supports `NewCluster` and `None` | NewCluster | |
| `serveConfigV2` _string_ | Important: Run "make" to regenerate code after modifying this file<br />Defines the applications and deployments to deploy, should be a YAML multi-line scalar string. | | |
| `rayClusterConfig` _[RayClusterSpec](#rayclusterspec)_ | | | |




#### RayServiceUpgradeStrategy

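_Underlying type:_ _string_

RayServiceUpgradeStrategy names the strategy used when upgrading a RayService. `NewCluster` creates a new, upgraded cluster and switches to it once it becomes ready; `None` means downtime while waiting for the new cluster to become ready.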





_Appears in:_
- [RayServiceSpec](#rayservicespec)



#### ScaleStrategy


Expand Down
3 changes: 3 additions & 0 deletions helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 12 additions & 0 deletions ray-operator/apis/ray/v1/rayservice_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,15 @@ const (
FailedToUpdateService ServiceStatus = "FailedToUpdateService"
)

// RayServiceUpgradeStrategy names the strategy a RayService uses when it is upgraded.
// The supported values are NewCluster and None.
type RayServiceUpgradeStrategy string

// Supported RayServiceUpgradeStrategy values.
const (
// NewCluster creates a new, upgraded cluster during an upgrade and switches to it once it becomes ready.
NewCluster RayServiceUpgradeStrategy = "NewCluster"
// None means an upgrade incurs downtime while waiting for the new cluster to become ready.
None RayServiceUpgradeStrategy = "None"
)

// These statuses should match Ray Serve's application statuses
// See `enum ApplicationStatus` in https://sourcegraph.com/github.com/ray-project/ray/-/blob/src/ray/protobuf/serve.proto for more details.
var ApplicationStatusEnum = struct {
Expand Down Expand Up @@ -57,6 +66,9 @@ type RayServiceSpec struct {
DeploymentUnhealthySecondThreshold *int32 `json:"deploymentUnhealthySecondThreshold,omitempty"`
// ServeService is the Kubernetes service for head node and worker nodes who have healthy http proxy to serve traffics.
ServeService *corev1.Service `json:"serveService,omitempty"`
// UpgradeStrategy represents the strategy used when upgrading the RayService. Currently supports `NewCluster` and `None`
// +kubebuilder:default:=NewCluster
UpgradeStrategy RayServiceUpgradeStrategy `json:"upgradeStrategy,omitempty"`
// Important: Run "make" to regenerate code after modifying this file
// Defines the applications and deployments to deploy, should be a YAML multi-line scalar string.
ServeConfigV2 string `json:"serveConfigV2,omitempty"`
Expand Down
3 changes: 3 additions & 0 deletions ray-operator/config/crd/bases/ray.io_rayservices.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 7 additions & 2 deletions ray-operator/controllers/ray/rayservice_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -424,10 +424,15 @@ func (r *RayServiceReconciler) reconcileRayCluster(ctx context.Context, rayServi
if clusterAction == RolloutNew {
// For LLM serving, some users might not have sufficient GPU resources to run two RayClusters simultaneously.
// Therefore, KubeRay offers ENABLE_ZERO_DOWNTIME as a feature flag for zero-downtime upgrades.
zeroDowntimeEnvVar := os.Getenv(ENABLE_ZERO_DOWNTIME)
rayServiceSpecUpgradeStrategy := rayServiceInstance.Spec.UpgradeStrategy
enableZeroDowntime := true
if s := os.Getenv(ENABLE_ZERO_DOWNTIME); strings.ToLower(s) == "false" {
enableZeroDowntime = false
if zeroDowntimeEnvVar != "" {
enableZeroDowntime = strings.ToLower(zeroDowntimeEnvVar) != "false"
} else if rayServiceSpecUpgradeStrategy != "" {
enableZeroDowntime = rayServiceSpecUpgradeStrategy == rayv1.NewCluster
}

if enableZeroDowntime || !enableZeroDowntime && activeRayCluster == nil {
// Add a pending cluster name. In the next reconcile loop, shouldPrepareNewRayCluster will return DoNothing and we will
// actually create the pending RayCluster instance.
Expand Down
27 changes: 21 additions & 6 deletions ray-operator/controllers/ray/rayservice_controller_unit_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -762,12 +762,13 @@ func TestReconcileRayCluster(t *testing.T) {
}

tests := map[string]struct {
activeCluster *rayv1.RayCluster
kubeRayVersion string
updateRayClusterSpec bool
enableZeroDowntime bool
shouldPrepareNewCluster bool
updateKubeRayVersion bool
activeCluster *rayv1.RayCluster
rayServiceUpgradeStrategy rayv1.RayServiceUpgradeStrategy
kubeRayVersion string
updateRayClusterSpec bool
enableZeroDowntime bool
shouldPrepareNewCluster bool
updateKubeRayVersion bool
}{
// Test 1: Neither active nor pending clusters exist. The `markRestart` function will be called, so the `PendingServiceStatus.RayClusterName` should be set.
"Zero-downtime upgrade is enabled. Neither active nor pending clusters exist.": {
Expand Down Expand Up @@ -815,6 +816,14 @@ func TestReconcileRayCluster(t *testing.T) {
updateKubeRayVersion: true,
kubeRayVersion: "new-version",
},
// Test 7: Zero-downtime upgrade is enabled explicitly through the upgrade strategy in the RayServiceSpec.
"Zero-downtime upgrade enabled. The active cluster exist. Zero-downtime upgrade is triggered through RayServiceSpec.": {
activeCluster: activeCluster.DeepCopy(),
updateRayClusterSpec: true,
enableZeroDowntime: true,
shouldPrepareNewCluster: true,
rayServiceUpgradeStrategy: rayv1.NewCluster,
},
}

for name, tc := range tests {
Expand All @@ -824,6 +833,12 @@ func TestReconcileRayCluster(t *testing.T) {
if !tc.enableZeroDowntime {
os.Setenv(ENABLE_ZERO_DOWNTIME, "false")
}
// Defaulting for ray service spec as it would from a real rayservice
if tc.rayServiceUpgradeStrategy == "" {
rayService.Spec.UpgradeStrategy = rayv1.NewCluster
} else {
rayService.Spec.UpgradeStrategy = tc.rayServiceUpgradeStrategy
}
runtimeObjects := []runtime.Object{}
if tc.activeCluster != nil {
// Update 'ray.io/kuberay-version' to a new version if kubeRayVersion is set.
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 4a531cb

Please sign in to comment.