Skip to content

Commit 44ecfda

Browse files
fix: revisit grove and LWS selection (#2564)
Signed-off-by: Julien Mancuso <jmancuso@nvidia.com>
1 parent 60a6a96 commit 44ecfda

File tree

13 files changed

+67
-39
lines changed

13 files changed

+67
-39
lines changed

deploy/cloud/helm/deploy.sh

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,6 @@ export ISTIO_ENABLED="${ISTIO_ENABLED:=false}"
4848
export ISTIO_GATEWAY="${ISTIO_GATEWAY:=istio-system/istio-ingressgateway}"
4949
export INGRESS_CLASS="${INGRESS_CLASS:=nginx}"
5050
export VIRTUAL_SERVICE_SUPPORTS_HTTPS="${VIRTUAL_SERVICE_SUPPORTS_HTTPS:=false}"
51-
export ENABLE_LWS="${ENABLE_LWS:=false}"
5251
export DOCKER_REGISTRY_USE_KUBERNETES_SECRET="${DOCKER_REGISTRY_USE_KUBERNETES_SECRET:=false}"
5352

5453
# Add command line options
@@ -167,7 +166,7 @@ echo "VIRTUAL_SERVICE_SUPPORTS_HTTPS: $VIRTUAL_SERVICE_SUPPORTS_HTTPS"
167166
echo "INSTALL_CRDS: $INSTALL_CRDS"
168167
echo "DOCKER_REGISTRY_USE_KUBERNETES_SECRET: $DOCKER_REGISTRY_USE_KUBERNETES_SECRET"
169168

170-
envsubst '${NAMESPACE} ${RELEASE_NAME} ${DOCKER_USERNAME} ${DOCKER_PASSWORD} ${DOCKER_SERVER} ${IMAGE_TAG} ${DYNAMO_INGRESS_SUFFIX} ${PIPELINES_DOCKER_SERVER} ${PIPELINES_DOCKER_USERNAME} ${PIPELINES_DOCKER_PASSWORD} ${DOCKER_SECRET_NAME} ${INGRESS_ENABLED} ${ISTIO_ENABLED} ${INGRESS_CLASS} ${ISTIO_GATEWAY} ${VIRTUAL_SERVICE_SUPPORTS_HTTPS} ${ENABLE_LWS} ${DOCKER_REGISTRY_USE_KUBERNETES_SECRET}' < dynamo-platform-values.yaml > generated-values.yaml
169+
envsubst '${NAMESPACE} ${RELEASE_NAME} ${DOCKER_USERNAME} ${DOCKER_PASSWORD} ${DOCKER_SERVER} ${IMAGE_TAG} ${DYNAMO_INGRESS_SUFFIX} ${PIPELINES_DOCKER_SERVER} ${PIPELINES_DOCKER_USERNAME} ${PIPELINES_DOCKER_PASSWORD} ${DOCKER_SECRET_NAME} ${INGRESS_ENABLED} ${ISTIO_ENABLED} ${INGRESS_CLASS} ${ISTIO_GATEWAY} ${VIRTUAL_SERVICE_SUPPORTS_HTTPS} ${DOCKER_REGISTRY_USE_KUBERNETES_SECRET}' < dynamo-platform-values.yaml > generated-values.yaml
171170
echo "generated file contents:"
172171
cat generated-values.yaml
173172

deploy/cloud/helm/dynamo-platform-values.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@ dynamo-operator:
2323
- name: ${DOCKER_SECRET_NAME}
2424

2525
dynamo:
26-
enableLWS: ${ENABLE_LWS}
2726
ingress:
2827
enabled: ${INGRESS_ENABLED}
2928
className: ${INGRESS_CLASS}

deploy/cloud/helm/platform/Chart.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,11 @@ maintainers:
1919
url: https://www.nvidia.com
2020
description: A Helm chart for NVIDIA Dynamo Platform.
2121
type: application
22-
version: 0.4.1
22+
version: 0.5.0
2323
home: https://nvidia.com
2424
dependencies:
2525
- name: dynamo-operator
26-
version: 0.4.1
26+
version: 0.5.0
2727
repository: file://components/operator
2828
condition: dynamo-operator.enabled
2929
- name: nats

deploy/cloud/helm/platform/components/operator/Chart.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,9 @@ type: application
2727
# This is the chart version. This version number should be incremented each time you make changes
2828
# to the chart and its templates, including the app version.
2929
# Versions are expected to follow Semantic Versioning (https://semver.org/)
30-
version: 0.4.1
30+
version: 0.5.0
3131
# This is the version number of the application being deployed. This version number should be
3232
# incremented each time you make changes to the application. Versions are not expected to
3333
# follow Semantic Versioning. They should reflect the version the application is using.
3434
# It is recommended to use it with quotes.
35-
appVersion: "0.4.1"
35+
appVersion: "0.5.0"

deploy/cloud/helm/platform/components/operator/templates/deployment.yaml

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -101,9 +101,6 @@ spec:
101101
{{- if .Values.dynamo.virtualServiceSupportsHTTPS }}
102102
- --virtual-service-supports-https={{ .Values.dynamo.virtualServiceSupportsHTTPS }}
103103
{{- end }}
104-
{{- if .Values.dynamo.enableLWS }}
105-
- --enable-lws
106-
{{- end }}
107104
{{- if .Values.dynamo.groveTerminationDelay }}
108105
- --grove-termination-delay={{ .Values.dynamo.groveTerminationDelay }}
109106
{{- end }}

deploy/cloud/helm/platform/components/operator/templates/manager-rbac.yaml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -425,7 +425,6 @@ rules:
425425
- patch
426426
- update
427427
- watch
428-
{{- if .Values.dynamo.enableLWS }}
429428
- apiGroups:
430429
- leaderworkerset.x-k8s.io
431430
resources:
@@ -450,7 +449,6 @@ rules:
450449
- patch
451450
- update
452451
- watch
453-
{{- end }}
454452
---
455453
apiVersion: rbac.authorization.k8s.io/v1
456454
{{- if .Values.namespaceRestriction.enabled }}

deploy/cloud/helm/platform/components/operator/values.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,6 @@ dynamo:
8282
serviceAccount:
8383
annotations: {}
8484

85-
enableLWS: false
8685
groveTerminationDelay: 15m
8786

8887
internalImages:

deploy/cloud/helm/platform/values.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,6 @@ dynamo-operator:
3434
- --metrics-bind-address=127.0.0.1:8080
3535
imagePullSecrets: []
3636
dynamo:
37-
enableLWS: false
3837
groveTerminationDelay: 15m
3938
internalImages:
4039
debugger: python:3.12-slim

deploy/cloud/operator/api/v1alpha1/dynamographdeployment_types.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,3 +110,13 @@ func (s *DynamoGraphDeployment) AddStatusCondition(condition metav1.Condition) {
110110
// If no matching condition found, append the new one
111111
s.Status.Conditions = append(s.Status.Conditions, condition)
112112
}
113+
114+
// HasAnyMultinodeService reports whether any service in the graph is configured with more than one node.
115+
func (s *DynamoGraphDeployment) HasAnyMultinodeService() bool {
116+
for _, svc := range s.Spec.Services {
117+
if svc != nil && svc.GetNumberOfNodes() > 1 {
118+
return true
119+
}
120+
}
121+
return false
122+
}

deploy/cloud/operator/cmd/main.go

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,6 @@ func main() {
129129
var ingressControllerClassName string
130130
var ingressControllerTLSSecretName string
131131
var ingressHostSuffix string
132-
var enableLWS bool
133132
var groveTerminationDelay time.Duration
134133
flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.")
135134
flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
@@ -156,8 +155,6 @@ func main() {
156155
"The name of the ingress controller TLS secret to use")
157156
flag.StringVar(&ingressHostSuffix, "ingress-host-suffix", "",
158157
"The suffix to use for the ingress host")
159-
flag.BoolVar(&enableLWS, "enable-lws", false,
160-
"If set, enable leader worker set")
161158
flag.DurationVar(&groveTerminationDelay, "grove-termination-delay", consts.DefaultGroveTerminationDelay,
162159
"The termination delay for Grove PodGangSets")
163160
opts := zap.Options{
@@ -168,11 +165,13 @@ func main() {
168165

169166
ctrlConfig := commonController.Config{
170167
RestrictedNamespace: restrictedNamespace,
171-
EnableLWS: enableLWS,
172168
Grove: commonController.GroveConfig{
173169
Enabled: false, // Will be set after Grove discovery
174170
TerminationDelay: groveTerminationDelay,
175171
},
172+
LWS: commonController.LWSConfig{
173+
Enabled: false, // Will be set after LWS discovery
174+
},
176175
EtcdAddress: etcdAddr,
177176
NatsAddress: natsAddr,
178177
IngressConfig: commonController.IngressConfig{
@@ -240,10 +239,13 @@ func main() {
240239
os.Exit(1)
241240
}
242241

243-
// Detect Grove availability using discovery client
242+
// Detect orchestrator availability using the discovery client
244243
setupLog.Info("Detecting Grove availability...")
245244
groveEnabled := commonController.DetectGroveAvailability(mainCtx, mgr)
246245
ctrlConfig.Grove.Enabled = groveEnabled
246+
setupLog.Info("Detecting LWS availability...")
247+
lwsEnabled := commonController.DetectLWSAvailability(mainCtx, mgr)
248+
ctrlConfig.LWS.Enabled = lwsEnabled
247249

248250
// Create etcd client
249251
cli, err := clientv3.New(clientv3.Config{

0 commit comments

Comments
 (0)