Skip to content

Commit

Permalink
Add 'upgradeStrategy' field to RayServiceSpec, allowing users to disab…
Browse files Browse the repository at this point in the history
…le/enable zero-downtime update.
  • Loading branch information
chiayi committed Oct 23, 2024
1 parent 135f129 commit 702d2ba
Show file tree
Hide file tree
Showing 7 changed files with 59 additions and 2 deletions.
14 changes: 14 additions & 0 deletions docs/reference/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,7 @@ _Appears in:_
| `serviceUnhealthySecondThreshold` _integer_ | Deprecated: This field is not used anymore. ref: https://github.com/ray-project/kuberay/issues/1685 | | |
| `deploymentUnhealthySecondThreshold` _integer_ | Deprecated: This field is not used anymore. ref: https://github.com/ray-project/kuberay/issues/1685 | | |
| `serveService` _[Service](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#service-v1-core)_ | ServeService is the Kubernetes service for head node and worker nodes who have healthy http proxy to serve traffics. | | |
| `upgradeStrategy` _[UpgradeStrategy](#upgradestrategy)_ | UpgradeStrategy represents the strategy used when upgrading the RayService. Currently supports `BlueGreenUpgrade` (zero-downtime upgrade). | | |
| `serveConfigV2` _string_ | Important: Run "make" to regenerate code after modifying this file<br />Defines the applications and deployments to deploy, should be a YAML multi-line scalar string. | | |
| `rayClusterConfig` _[RayClusterSpec](#rayclusterspec)_ | | | |

Expand Down Expand Up @@ -245,6 +246,19 @@ _Appears in:_
| `backoffLimit` _integer_ | BackoffLimit of the submitter k8s job. | | |


#### UpgradeStrategy

_Underlying type:_ _string_





_Appears in:_
- [RayServiceSpec](#rayservicespec)



#### UpscalingMode

_Underlying type:_ _string_
Expand Down
2 changes: 2 additions & 0 deletions helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions ray-operator/apis/ray/v1/rayservice_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,12 @@ const (
FailedToUpdateService ServiceStatus = "FailedToUpdateService"
)

// UpgradeStrategy names the strategy the operator applies when a RayService
// is upgraded. It is the type of RayServiceSpec.UpgradeStrategy.
type UpgradeStrategy string

// BlueGreenUpgrade is the UpgradeStrategy value that the RayService
// controller treats as enabling zero-downtime upgrade.
const BlueGreenUpgrade UpgradeStrategy = "BlueGreenUpgrade"

// These statuses should match Ray Serve's application statuses
// See `enum ApplicationStatus` in https://sourcegraph.com/github.com/ray-project/ray/-/blob/src/ray/protobuf/serve.proto for more details.
var ApplicationStatusEnum = struct {
Expand Down Expand Up @@ -57,6 +63,8 @@ type RayServiceSpec struct {
DeploymentUnhealthySecondThreshold *int32 `json:"deploymentUnhealthySecondThreshold,omitempty"`
// ServeService is the Kubernetes service for head node and worker nodes who have healthy http proxy to serve traffics.
ServeService *corev1.Service `json:"serveService,omitempty"`
// UpgradeStrategy represents the strategy used when upgrading the RayService. Currently supports `BlueGreenUpgrade` (zero-downtime upgrade).
UpgradeStrategy UpgradeStrategy `json:"upgradeStrategy,omitempty"`
// Important: Run "make" to regenerate code after modifying this file
// Defines the applications and deployments to deploy, should be a YAML multi-line scalar string.
ServeConfigV2 string `json:"serveConfigV2,omitempty"`
Expand Down
2 changes: 2 additions & 0 deletions ray-operator/config/crd/bases/ray.io_rayservices.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 7 additions & 2 deletions ray-operator/controllers/ray/rayservice_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -424,10 +424,15 @@ func (r *RayServiceReconciler) reconcileRayCluster(ctx context.Context, rayServi
if clusterAction == RolloutNew {
// For LLM serving, some users might not have sufficient GPU resources to run two RayClusters simultaneously.
// Therefore, KubeRay offers ENABLE_ZERO_DOWNTIME as a feature flag for zero-downtime upgrades. A non-empty RayServiceSpec.UpgradeStrategy takes precedence over the environment variable.
zeroDowntimeEnvVar := os.Getenv(ENABLE_ZERO_DOWNTIME)
rayServiceSpecUpgradeStrategy := rayServiceInstance.Spec.UpgradeStrategy
enableZeroDowntime := true
if s := os.Getenv(ENABLE_ZERO_DOWNTIME); strings.ToLower(s) == "false" {
enableZeroDowntime = false
if rayServiceSpecUpgradeStrategy != "" {
enableZeroDowntime = rayServiceSpecUpgradeStrategy == rayv1.BlueGreenUpgrade
} else if zeroDowntimeEnvVar != "" {
enableZeroDowntime = strings.ToLower(zeroDowntimeEnvVar) == "true"
}

if enableZeroDowntime || !enableZeroDowntime && activeRayCluster == nil {
// Add a pending cluster name. In the next reconcile loop, shouldPrepareNewRayCluster will return DoNothing and we will
// actually create the pending RayCluster instance.
Expand Down
16 changes: 16 additions & 0 deletions ray-operator/controllers/ray/rayservice_controller_unit_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -768,6 +768,7 @@ func TestReconcileRayCluster(t *testing.T) {
enableZeroDowntime bool
shouldPrepareNewCluster bool
updateKubeRayVersion bool
zeroDowntimeSpecTrigger bool
}{
// Test 1: Neither active nor pending clusters exist. The `markRestart` function will be called, so the `PendingServiceStatus.RayClusterName` should be set.
"Zero-downtime upgrade is enabled. Neither active nor pending clusters exist.": {
Expand Down Expand Up @@ -815,12 +816,27 @@ func TestReconcileRayCluster(t *testing.T) {
updateKubeRayVersion: true,
kubeRayVersion: "new-version",
},
// Test 7: Zero downtime upgrade is enabled, but is enabled through the RayServiceSpec
"Zero-downtime upgrade enabled. The active cluster exist. Zero-downtime upgrade is triggered through RayServiceSpec.": {
activeCluster: activeCluster.DeepCopy(),
updateRayClusterSpec: true,
enableZeroDowntime: true,
shouldPrepareNewCluster: true,
zeroDowntimeSpecTrigger: true,
},
}

for name, tc := range tests {
t.Run(name, func(t *testing.T) {
// Enable or disable zero-downtime upgrade.
defer os.Unsetenv(ENABLE_ZERO_DOWNTIME)
if tc.enableZeroDowntime {
if tc.zeroDowntimeSpecTrigger {
rayService.Spec.UpgradeStrategy = rayv1.BlueGreenUpgrade
} else {
os.Setenv(ENABLE_ZERO_DOWNTIME, "true")
}
}
if !tc.enableZeroDowntime {
os.Setenv(ENABLE_ZERO_DOWNTIME, "false")
}
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 702d2ba

Please sign in to comment.