Skip to content

Commit 2f26328

Browse files
fix(ray): long-running job span could never be finished [backport 3.16] (#14902)
Backport a75ed3e from #14871 to 3.16. If a job failed during job_submission, the long_running_job span was never finished, so the span never left driveline, which created inconsistent behavior in the UI. Co-authored-by: Louis Tricot <75956635+dubloom@users.noreply.github.com>
1 parent 09a1427 commit 2f26328

File tree

3 files changed

+18
-14
lines changed

3 files changed

+18
-14
lines changed

ddtrace/contrib/internal/ray/patch.py

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -208,22 +208,22 @@ def traced_submit_job(wrapped, instance, args, kwargs):
208208
else:
209209
job_name = DEFAULT_JOB_NAME
210210

211-
# Root span creation
212211
job_span = tracer.start_span("ray.job", service=job_name or DEFAULT_JOB_NAME, span_type=SpanTypes.RAY)
213-
_inject_ray_span_tags_and_metrics(job_span)
214-
job_span.set_tag_str(RAY_SUBMISSION_ID_TAG, submission_id)
215-
if entrypoint:
216-
job_span.set_tag_str(RAY_ENTRYPOINT, entrypoint)
212+
try:
213+
# Root span creation
214+
_inject_ray_span_tags_and_metrics(job_span)
215+
job_span.set_tag_str(RAY_SUBMISSION_ID_TAG, submission_id)
216+
if entrypoint:
217+
job_span.set_tag_str(RAY_ENTRYPOINT, entrypoint)
217218

218-
metadata = kwargs.get("metadata", {})
219-
dot_paths = flatten_metadata_dict(metadata)
220-
for k, v in dot_paths.items():
221-
set_tag_or_truncate(job_span, k, v)
219+
metadata = kwargs.get("metadata", {})
220+
dot_paths = flatten_metadata_dict(metadata)
221+
for k, v in dot_paths.items():
222+
set_tag_or_truncate(job_span, k, v)
222223

223-
tracer.context_provider.activate(job_span)
224-
start_long_running_job(job_span)
224+
tracer.context_provider.activate(job_span)
225+
start_long_running_job(job_span)
225226

226-
try:
227227
with tracer.trace(
228228
"ray.job.submit", service=job_name or DEFAULT_JOB_NAME, span_type=SpanTypes.RAY
229229
) as submit_span:
@@ -254,7 +254,7 @@ def traced_submit_job(wrapped, instance, args, kwargs):
254254
job_span.set_tag_str(RAY_JOB_STATUS, RAY_STATUS_ERROR)
255255
job_span.error = 1
256256
job_span.set_exc_info(type(e), e, e.__traceback__)
257-
job_span.finish()
257+
stop_long_running_job(submission_id)
258258
raise e
259259

260260

ddtrace/contrib/internal/ray/span_manager.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -269,7 +269,7 @@ def start_long_running_job(job_span: Span) -> None:
269269
start_long_running_span(job_span)
270270

271271

272-
def stop_long_running_job(submission_id: str, job_info: Optional[JobInfo]) -> None:
272+
def stop_long_running_job(submission_id: str, job_info: Optional[JobInfo] = None) -> None:
273273
get_span_manager().stop_long_running_job(submission_id, job_info)
274274

275275

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
---
2+
fixes:
3+
- |
4+
ray: This fix resolves an issue where long-running job spans could remain unfinished when an exception occurred during job submission.

0 commit comments

Comments
 (0)