Skip to content

Commit

Permalink
Add option to keep Ray clusters after a job has finished (#794)
Browse files Browse the repository at this point in the history
* add shutdown delay for ray clusters to gateway

Signed-off-by: Paul S. Schweigert <paul@paulschweigert.com>

* fix lint

Signed-off-by: Paul S. Schweigert <paul@paulschweigert.com>

* remove delayed remove option

Signed-off-by: Paul S. Schweigert <paul@paulschweigert.com>

* lint

Signed-off-by: Paul S. Schweigert <paul@paulschweigert.com>

* add label for easy delete

Signed-off-by: Paul S. Schweigert <paul@paulschweigert.com>

* I wish you had... more time

Signed-off-by: Paul S. Schweigert <paul@paulschweigert.com>

* debug log

Signed-off-by: Paul S. Schweigert <paul@paulschweigert.com>

---------

Signed-off-by: Paul S. Schweigert <paul@paulschweigert.com>
  • Loading branch information
psschwei authored Jul 19, 2023
1 parent 6a00f6b commit 5513b55
Show file tree
Hide file tree
Showing 5 changed files with 21 additions and 0 deletions.
8 changes: 8 additions & 0 deletions gateway/api/management/commands/free_resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from api.models import ComputeResource, Job
from api.ray import kill_ray_cluster
from main import settings as config


logger = logging.getLogger("commands")
Expand All @@ -27,6 +28,13 @@ def handle(self, *args, **options):

# only kill cluster if not in local mode and no jobs are running there
if len(alive_jobs) == 0 and not settings.RAY_CLUSTER_MODE.get("local"):
if config.RAY_CLUSTER_NO_DELETE_ON_COMPLETE:
logger.debug(
"RAY_CLUSTER_NO_DELETE_ON_COMPLETE is enabled, "
+ "so cluster [%s] will not be removed",
compute_resource.title,
)
return
kill_ray_cluster(compute_resource.title)
compute_resource.delete()
counter += 1
Expand Down
4 changes: 4 additions & 0 deletions gateway/main/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -295,3 +295,7 @@
)

RAY_SETUP_MAX_RETRIES = int(os.environ.get("RAY_SETUP_MAX_RETRIES", 30))

# When enabled, Ray clusters are kept after their jobs have finished.
# Environment variables are always strings, so bool() would treat any
# non-empty value (including "False" or "0") as enabled. Parse the text
# explicitly: unset, empty, and common "off" spellings disable the flag;
# any other value (e.g. "True", "1") enables it.
RAY_CLUSTER_NO_DELETE_ON_COMPLETE = os.environ.get(
    "RAY_CLUSTER_NO_DELETE_ON_COMPLETE", ""
).strip().lower() not in ("", "0", "false", "no", "off")
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,10 @@ spec:
value: {{ .Values.application.limits.maxJobsPerUser | quote }}
- name: LIMITS_MAX_CLUSTERS
value: {{ .Values.application.limits.maxComputeResources | quote }}
{{- if .Values.application.limits.keepClusterOnComplete }}
- name: RAY_CLUSTER_NO_DELETE_ON_COMPLETE
value: "True"
{{- end }}
{{- with .Values.nodeSelector }}
nodeSelector:
{{- toYaml . | nindent 8 }}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@ data:
metadata:
name: {{`{{ cluster_name }}`}}
namespace: {{ .Release.Namespace }}
{{- if .Values.application.limits.keepClusterOnComplete }}
labels:
nodelete: "true"
{{- end }}
spec:
{{- if .Values.application.ray.scrapeWithPrometheus }}
headServiceAnnotations:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ application:
limits:
maxJobsPerUser: 2
maxComputeResources: 4
keepClusterOnComplete: False

database:
host: postgresql
Expand Down

0 comments on commit 5513b55

Please sign in to comment.