From 62bccfc9e55ee9732ff23e97145b87c700ce7c28 Mon Sep 17 00:00:00 2001 From: Jiajun Yao Date: Wed, 10 Sep 2025 22:08:56 -0700 Subject: [PATCH 01/20] up Signed-off-by: Jiajun Yao --- BUILD.bazel | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/BUILD.bazel b/BUILD.bazel index c30d1b1f43ae..aacf4f1ef10c 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -1,4 +1,4 @@ -# Bazel build +# Bazel build test # C/C++ documentation: https://docs.bazel.build/versions/master/be/c-cpp.html # ************************** IMPORTANT *********************** From 83dad276efdd1441333714fe35755c1e22ffeb73 Mon Sep 17 00:00:00 2001 From: Jiajun Yao Date: Mon, 15 Sep 2025 12:19:17 -0700 Subject: [PATCH 02/20] debug Signed-off-by: Jiajun Yao --- python/ray/actor.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/python/ray/actor.py b/python/ray/actor.py index 07f808c90352..2cbc1bea6558 100644 --- a/python/ray/actor.py +++ b/python/ray/actor.py @@ -1152,6 +1152,10 @@ def _process_option_dict(actor_options, has_tensor_transport_methods): # enable tensor transport, unless it was explicitly set to False by the # user. if has_tensor_transport_methods: + import os + + print("jjyaooooooo cannot happen") + os._exit(1) if _filled_options["enable_tensor_transport"] is False: raise ValueError( "Actor class has methods with @ray.method(tensor_transport=...) decorator but @ray.remote(enable_tensor_transport=False). " @@ -1170,6 +1174,10 @@ def _process_option_dict(actor_options, has_tensor_transport_methods): # https://github.com/ray-project/ray/issues/54639 is fixed. enable_tensor_transport = _filled_options.get("enable_tensor_transport", False) if enable_tensor_transport: + import os + + print("jjyaooooooo cannot happen") + os._exit(1) if _filled_options.get("concurrency_groups", None) is None: _filled_options["concurrency_groups"] = {} _filled_options["concurrency_groups"]["_ray_system"] = 1 From a36edd0e2a927534b12e5091d792d18781b06145 Mon Sep 17 00:00:00 2001 From: Jiajun Yao Date: Mon, 15 Sep 2025 15:13:14 -0700 Subject: [PATCH 03/20] debug Signed-off-by: Jiajun Yao --- ci/build/build-manylinux-wheel.sh | 2 ++ python/ray/_private/utils.py | 7 +++++++ python/ray/_raylet.pyx | 2 ++ 3 files changed, 11 insertions(+) diff --git a/ci/build/build-manylinux-wheel.sh b/ci/build/build-manylinux-wheel.sh index b2b1abdadaa7..fa1de1ef9add 100755 --- a/ci/build/build-manylinux-wheel.sh +++ b/ci/build/build-manylinux-wheel.sh @@ -26,6 +26,8 @@ export BAZEL_PATH="$HOME"/bin/bazel # This is required for building with bazel. sudo ln -sf "/opt/python/${PYTHON}/bin/python3" /usr/local/bin/python3 +export RAY_DEBUG_BUILD=debug + # build ray wheel PATH="/opt/python/${PYTHON}/bin:$PATH" RAY_INSTALL_JAVA=0 \ "/opt/python/${PYTHON}/bin/python" -m pip wheel -v -w dist . --no-deps diff --git a/python/ray/_private/utils.py b/python/ray/_private/utils.py index 7793787e488f..ac0389795d8e 100644 --- a/python/ray/_private/utils.py +++ b/python/ray/_private/utils.py @@ -235,9 +235,11 @@ def set_omp_num_threads_if_unset() -> bool: Returns True if OMP_NUM_THREADS is set in this function. """ + print("set_omp_num_threads_if_unset start") num_threads_from_env = os.environ.get("OMP_NUM_THREADS") if num_threads_from_env is not None: # No ops if it's set + print("set_omp_num_threads_if_unset end") return False # If unset, try setting the correct CPU count assigned. @@ -263,6 +265,7 @@ def set_omp_num_threads_if_unset() -> bool: # For num_cpus >= 1: Set to the floor of the actual assigned cpus. omp_num_threads = max(math.floor(num_assigned_cpus), 1) os.environ["OMP_NUM_THREADS"] = str(omp_num_threads) + print("set_omp_num_threads_if_unset end") return True @@ -274,6 +277,7 @@ def set_visible_accelerator_ids() -> Mapping[str, Optional[str]]: """ from ray._private.ray_constants import env_bool + print("set_visible_accelerator_ids start") original_visible_accelerator_env_vars = {} override_on_zero = env_bool( ray._private.accelerators.RAY_ACCEL_ENV_VAR_OVERRIDE_ON_ZERO_ENV_VAR, @@ -293,6 +297,7 @@ def set_visible_accelerator_ids() -> Mapping[str, Optional[str]]: resource_name ).set_current_process_visible_accelerator_ids(accelerator_ids) + print("set_visible_accelerator_ids end") return original_visible_accelerator_env_vars @@ -300,11 +305,13 @@ def reset_visible_accelerator_env_vars( original_visible_accelerator_env_vars: Mapping[str, Optional[str]] ) -> None: """Reset the visible accelerator env vars to the original values.""" + print("reset_visible_accelerator_env_vars start") for env_var, env_value in original_visible_accelerator_env_vars.items(): if env_value is None: os.environ.pop(env_var, None) else: os.environ[env_var] = env_value + print("reset_visible_accelerator_env_vars end") class Unbuffered(object): diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx index 995a417b472f..29c4aa67d61f 100644 --- a/python/ray/_raylet.pyx +++ b/python/ray/_raylet.pyx @@ -2311,7 +2311,9 @@ cdef execute_task_with_cancellation_handler( ray._private.utils.reset_visible_accelerator_env_vars(original_visible_accelerator_env_vars) if omp_num_threads_overriden: # Reset the OMP_NUM_THREADS environ if it was set. + print("pop omp start") os.environ.pop("OMP_NUM_THREADS", None) + print("pop omp end") if execution_info.max_calls != 0: From a9564d34a4dbae7f4930724d374e9a5a1bbb4d42 Mon Sep 17 00:00:00 2001 From: Jiajun Yao Date: Mon, 15 Sep 2025 16:47:48 -0700 Subject: [PATCH 04/20] up Signed-off-by: Jiajun Yao --- python/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/setup.py b/python/setup.py index 60024b4b8c1f..671ea0a3ea5a 100644 --- a/python/setup.py +++ b/python/setup.py @@ -83,7 +83,7 @@ def __init__( version = find_version("ray", "_version.py") # add .dbg suffix if debug mode is on. if build_type == BuildType.DEBUG: - self.version: str = f"{version}+dbg" + self.version: str = f"{version}" elif build_type == BuildType.ASAN: self.version: str = f"{version}+asan" elif build_type == BuildType.TSAN: From 6520d39b5f17dcf76128801cac6675cb41d0ae61 Mon Sep 17 00:00:00 2001 From: Jiajun Yao Date: Wed, 17 Sep 2025 08:42:59 -0700 Subject: [PATCH 05/20] up Signed-off-by: Jiajun Yao --- ci/build/build-manylinux-wheel.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/build/build-manylinux-wheel.sh b/ci/build/build-manylinux-wheel.sh index fa1de1ef9add..8a067152c0b5 100755 --- a/ci/build/build-manylinux-wheel.sh +++ b/ci/build/build-manylinux-wheel.sh @@ -26,7 +26,7 @@ export BAZEL_PATH="$HOME"/bin/bazel # This is required for building with bazel. sudo ln -sf "/opt/python/${PYTHON}/bin/python3" /usr/local/bin/python3 -export RAY_DEBUG_BUILD=debug +# export RAY_DEBUG_BUILD=debug # build ray wheel PATH="/opt/python/${PYTHON}/bin:$PATH" RAY_INSTALL_JAVA=0 \ From 89ffd57bc7cd0692d897f811ce9800f0865c1832 Mon Sep 17 00:00:00 2001 From: Jiajun Yao Date: Wed, 17 Sep 2025 11:26:43 -0700 Subject: [PATCH 06/20] up Signed-off-by: Jiajun Yao --- python/ray/_private/utils.py | 14 +++++++------- python/ray/_raylet.pyx | 5 +++-- src/ray/core_worker/core_worker.cc | 1 + 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/python/ray/_private/utils.py b/python/ray/_private/utils.py index ac0389795d8e..5b52a66ad9f7 100644 --- a/python/ray/_private/utils.py +++ b/python/ray/_private/utils.py @@ -235,11 +235,11 @@ def set_omp_num_threads_if_unset() -> bool: Returns True if OMP_NUM_THREADS is set in this function. """ - print("set_omp_num_threads_if_unset start") + print("set_omp_num_threads_if_unset start", file=sys.stderr) num_threads_from_env = os.environ.get("OMP_NUM_THREADS") if num_threads_from_env is not None: # No ops if it's set - print("set_omp_num_threads_if_unset end") + print("set_omp_num_threads_if_unset end", file=sys.stderr) return False # If unset, try setting the correct CPU count assigned. @@ -265,7 +265,7 @@ def set_omp_num_threads_if_unset() -> bool: # For num_cpus >= 1: Set to the floor of the actual assigned cpus. omp_num_threads = max(math.floor(num_assigned_cpus), 1) os.environ["OMP_NUM_THREADS"] = str(omp_num_threads) - print("set_omp_num_threads_if_unset end") + print("set_omp_num_threads_if_unset end", file=sys.stderr) return True @@ -277,7 +277,7 @@ def set_visible_accelerator_ids() -> Mapping[str, Optional[str]]: """ from ray._private.ray_constants import env_bool - print("set_visible_accelerator_ids start") + print("set_visible_accelerator_ids start", file=sys.stderr) original_visible_accelerator_env_vars = {} override_on_zero = env_bool( ray._private.accelerators.RAY_ACCEL_ENV_VAR_OVERRIDE_ON_ZERO_ENV_VAR, @@ -297,7 +297,7 @@ def set_visible_accelerator_ids() -> Mapping[str, Optional[str]]: resource_name ).set_current_process_visible_accelerator_ids(accelerator_ids) - print("set_visible_accelerator_ids end") + print("set_visible_accelerator_ids end", file=sys.stderr) return original_visible_accelerator_env_vars @@ -305,13 +305,13 @@ def reset_visible_accelerator_env_vars( original_visible_accelerator_env_vars: Mapping[str, Optional[str]] ) -> None: """Reset the visible accelerator env vars to the original values.""" - print("reset_visible_accelerator_env_vars start") + print("reset_visible_accelerator_env_vars start", file=sys.stderr) for env_var, env_value in original_visible_accelerator_env_vars.items(): if env_value is None: os.environ.pop(env_var, None) else: os.environ[env_var] = env_value - print("reset_visible_accelerator_env_vars end") + print("reset_visible_accelerator_env_vars end", file=sys.stderr) class Unbuffered(object): diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx index 29c4aa67d61f..ef7ff2791235 100644 --- a/python/ray/_raylet.pyx +++ b/python/ray/_raylet.pyx @@ -2211,6 +2211,7 @@ cdef execute_task_with_cancellation_handler( # in case executing the main task throws an exception. function_descriptor = CFunctionDescriptorToPython( ray_function.GetFunctionDescriptor()) + print("runnning", file=sys.stderr) if task_type == TASK_TYPE_ACTOR_CREATION_TASK: actor_class = manager.load_actor_class(job_id, function_descriptor) actor_id = core_worker.get_actor_id() @@ -2311,9 +2312,9 @@ cdef execute_task_with_cancellation_handler( ray._private.utils.reset_visible_accelerator_env_vars(original_visible_accelerator_env_vars) if omp_num_threads_overriden: # Reset the OMP_NUM_THREADS environ if it was set. - print("pop omp start") + print("pop omp start", file=sys.stderr) os.environ.pop("OMP_NUM_THREADS", None) - print("pop omp end") + print("pop omp end", file=sys.stderr) if execution_info.max_calls != 0: diff --git a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc index 62ef51ae0bdc..aae9ec579bc9 100644 --- a/src/ray/core_worker/core_worker.cc +++ b/src/ray/core_worker/core_worker.cc @@ -2854,6 +2854,7 @@ Status CoreWorker::ExecuteTask( /*generator_backpressure_num_objects=*/ task_spec.GeneratorBackpressureNumObjects(), /*tensor_transport=*/task_spec.TensorTransport()); + RAY_LOG(INFO) << "Task execution done"; // Get the reference counts for any IDs that we borrowed during this task, // remove the local reference for these IDs, and return the ref count info to From 9f6fec35c1250265e34f8d6de0c5c50fef12ed7f Mon Sep 17 00:00:00 2001 From: Jiajun Yao Date: Wed, 17 Sep 2025 13:02:46 -0700 Subject: [PATCH 07/20] up Signed-off-by: Jiajun Yao --- python/ray/_raylet.pyx | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx index ef7ff2791235..3f63a8c7fa41 100644 --- a/python/ray/_raylet.pyx +++ b/python/ray/_raylet.pyx @@ -2211,12 +2211,14 @@ cdef execute_task_with_cancellation_handler( # in case executing the main task throws an exception. function_descriptor = CFunctionDescriptorToPython( ray_function.GetFunctionDescriptor()) - print("runnning", file=sys.stderr) + print("runnning 1", file=sys.stderr) if task_type == TASK_TYPE_ACTOR_CREATION_TASK: + print("runnning 2", file=sys.stderr) actor_class = manager.load_actor_class(job_id, function_descriptor) actor_id = core_worker.get_actor_id() actor = actor_class.__new__(actor_class) worker.actors[actor_id] = actor + print("runnning 3", file=sys.stderr) # Record the actor class via :actor_name: magic token in the log. # @@ -2229,12 +2231,14 @@ cdef execute_task_with_cancellation_handler( # Flush to both .out and .err print(actor_magic_token, end="") print(actor_magic_token, file=sys.stderr, end="") + print("runnning 4", file=sys.stderr) # Initial eventloops for asyncio for this actor. if core_worker.current_actor_is_asyncio(): core_worker.initialize_eventloops_for_actor_concurrency_group( c_defined_concurrency_groups) + print("runnning 5", file=sys.stderr) execution_info = execution_infos.get(function_descriptor) if not execution_info: execution_info = manager.get_execution_info( @@ -2253,6 +2257,7 @@ cdef execute_task_with_cancellation_handler( with current_task_id_lock: current_task_id = task_id + print("runnning 6", file=sys.stderr) execute_task(caller_address, task_type, name, @@ -2274,6 +2279,7 @@ cdef execute_task_with_cancellation_handler( should_retry_exceptions, generator_backpressure_num_objects, c_tensor_transport) + print("runnning 7", file=sys.stderr) # Check for cancellation. PyErr_CheckSignals() @@ -2303,6 +2309,7 @@ cdef execute_task_with_cancellation_handler( # cancel tasks to fail. NULL) finally: + print("runnning 9", file=sys.stderr) with current_task_id_lock: current_task_id = None From 4918b627a3195b0bbb5ca15bb8c96f1101e474bc Mon Sep 17 00:00:00 2001 From: Jiajun Yao Date: Wed, 17 Sep 2025 21:33:04 -0700 Subject: [PATCH 08/20] up Signed-off-by: Jiajun Yao --- python/ray/_raylet.pyx | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx index 3f63a8c7fa41..8aa33ff50741 100644 --- a/python/ray/_raylet.pyx +++ b/python/ray/_raylet.pyx @@ -2215,8 +2215,11 @@ cdef execute_task_with_cancellation_handler( if task_type == TASK_TYPE_ACTOR_CREATION_TASK: print("runnning 2", file=sys.stderr) actor_class = manager.load_actor_class(job_id, function_descriptor) + print("runnning 2.1", file=sys.stderr) actor_id = core_worker.get_actor_id() + print("runnning 2.2", file=sys.stderr) actor = actor_class.__new__(actor_class) + print("runnning 2.3", file=sys.stderr) worker.actors[actor_id] = actor print("runnning 3", file=sys.stderr) From 97005dd07f2398317ec0b04fe14e944c09c105f1 Mon Sep 17 00:00:00 2001 From: Jiajun Yao Date: Thu, 18 Sep 2025 21:33:29 -0700 Subject: [PATCH 09/20] up Signed-off-by: Jiajun Yao --- python/ray/_raylet.pyx | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx index 8aa33ff50741..98a94a0cc46b 100644 --- a/python/ray/_raylet.pyx +++ b/python/ray/_raylet.pyx @@ -2215,13 +2215,11 @@ cdef execute_task_with_cancellation_handler( if task_type == TASK_TYPE_ACTOR_CREATION_TASK: print("runnning 2", file=sys.stderr) actor_class = manager.load_actor_class(job_id, function_descriptor) - print("runnning 2.1", file=sys.stderr) actor_id = core_worker.get_actor_id() print("runnning 2.2", file=sys.stderr) actor = actor_class.__new__(actor_class) print("runnning 2.3", file=sys.stderr) worker.actors[actor_id] = actor - print("runnning 3", file=sys.stderr) # Record the actor class via :actor_name: magic token in the log. # From 63e4c7e3fbd9443016a299877f1c01d98940ebcb Mon Sep 17 00:00:00 2001 From: Jiajun Yao Date: Fri, 19 Sep 2025 07:08:16 -0700 Subject: [PATCH 10/20] up Signed-off-by: Jiajun Yao --- python/ray/_private/utils.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/python/ray/_private/utils.py b/python/ray/_private/utils.py index 5b52a66ad9f7..7793787e488f 100644 --- a/python/ray/_private/utils.py +++ b/python/ray/_private/utils.py @@ -235,11 +235,9 @@ def set_omp_num_threads_if_unset() -> bool: Returns True if OMP_NUM_THREADS is set in this function. """ - print("set_omp_num_threads_if_unset start", file=sys.stderr) num_threads_from_env = os.environ.get("OMP_NUM_THREADS") if num_threads_from_env is not None: # No ops if it's set - print("set_omp_num_threads_if_unset end", file=sys.stderr) return False # If unset, try setting the correct CPU count assigned. @@ -265,7 +263,6 @@ def set_omp_num_threads_if_unset() -> bool: # For num_cpus >= 1: Set to the floor of the actual assigned cpus. omp_num_threads = max(math.floor(num_assigned_cpus), 1) os.environ["OMP_NUM_THREADS"] = str(omp_num_threads) - print("set_omp_num_threads_if_unset end", file=sys.stderr) return True @@ -277,7 +274,6 @@ def set_visible_accelerator_ids() -> Mapping[str, Optional[str]]: """ from ray._private.ray_constants import env_bool - print("set_visible_accelerator_ids start", file=sys.stderr) original_visible_accelerator_env_vars = {} override_on_zero = env_bool( ray._private.accelerators.RAY_ACCEL_ENV_VAR_OVERRIDE_ON_ZERO_ENV_VAR, @@ -297,7 +293,6 @@ def set_visible_accelerator_ids() -> Mapping[str, Optional[str]]: resource_name ).set_current_process_visible_accelerator_ids(accelerator_ids) - print("set_visible_accelerator_ids end", file=sys.stderr) return original_visible_accelerator_env_vars @@ -305,13 +300,11 @@ def reset_visible_accelerator_env_vars( original_visible_accelerator_env_vars: Mapping[str, Optional[str]] ) -> None: """Reset the visible accelerator env vars to the original values.""" - print("reset_visible_accelerator_env_vars start", file=sys.stderr) for env_var, env_value in original_visible_accelerator_env_vars.items(): if env_value is None: os.environ.pop(env_var, None) else: os.environ[env_var] = env_value - print("reset_visible_accelerator_env_vars end", file=sys.stderr) class Unbuffered(object): From a49af03b42e77fdab8fef0472a2582cb09b07857 Mon Sep 17 00:00:00 2001 From: Jiajun Yao Date: Fri, 19 Sep 2025 16:15:27 -0700 Subject: [PATCH 11/20] pu Signed-off-by: Jiajun Yao --- python/ray/_raylet.pyx | 5 ----- 1 file changed, 5 deletions(-) diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx index 98a94a0cc46b..5ea23fea2bad 100644 --- a/python/ray/_raylet.pyx +++ b/python/ray/_raylet.pyx @@ -2211,7 +2211,6 @@ cdef execute_task_with_cancellation_handler( # in case executing the main task throws an exception. function_descriptor = CFunctionDescriptorToPython( ray_function.GetFunctionDescriptor()) - print("runnning 1", file=sys.stderr) if task_type == TASK_TYPE_ACTOR_CREATION_TASK: print("runnning 2", file=sys.stderr) actor_class = manager.load_actor_class(job_id, function_descriptor) @@ -2232,14 +2231,12 @@ cdef execute_task_with_cancellation_handler( # Flush to both .out and .err print(actor_magic_token, end="") print(actor_magic_token, file=sys.stderr, end="") - print("runnning 4", file=sys.stderr) # Initial eventloops for asyncio for this actor. if core_worker.current_actor_is_asyncio(): core_worker.initialize_eventloops_for_actor_concurrency_group( c_defined_concurrency_groups) - print("runnning 5", file=sys.stderr) execution_info = execution_infos.get(function_descriptor) if not execution_info: execution_info = manager.get_execution_info( @@ -2258,7 +2255,6 @@ cdef execute_task_with_cancellation_handler( with current_task_id_lock: current_task_id = task_id - print("runnning 6", file=sys.stderr) execute_task(caller_address, task_type, name, @@ -2280,7 +2276,6 @@ cdef execute_task_with_cancellation_handler( should_retry_exceptions, generator_backpressure_num_objects, c_tensor_transport) - print("runnning 7", file=sys.stderr) # Check for cancellation. PyErr_CheckSignals() From 29c8456f6c9bd46178c150d912d27c416843676d Mon Sep 17 00:00:00 2001 From: Jiajun Yao Date: Sat, 20 Sep 2025 07:25:21 -0700 Subject: [PATCH 12/20] up Signed-off-by: Jiajun Yao --- python/ray/_raylet.pyx | 3 --- 1 file changed, 3 deletions(-) diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx index 5ea23fea2bad..de608d7581d8 100644 --- a/python/ray/_raylet.pyx +++ b/python/ray/_raylet.pyx @@ -2217,7 +2217,6 @@ cdef execute_task_with_cancellation_handler( actor_id = core_worker.get_actor_id() print("runnning 2.2", file=sys.stderr) actor = actor_class.__new__(actor_class) - print("runnning 2.3", file=sys.stderr) worker.actors[actor_id] = actor # Record the actor class via :actor_name: magic token in the log. @@ -2315,9 +2314,7 @@ cdef execute_task_with_cancellation_handler( ray._private.utils.reset_visible_accelerator_env_vars(original_visible_accelerator_env_vars) if omp_num_threads_overriden: # Reset the OMP_NUM_THREADS environ if it was set. - print("pop omp start", file=sys.stderr) os.environ.pop("OMP_NUM_THREADS", None) - print("pop omp end", file=sys.stderr) if execution_info.max_calls != 0: From bd855c537fabb8e5c5821f31cfc4175055382eac Mon Sep 17 00:00:00 2001 From: Jiajun Yao Date: Sun, 21 Sep 2025 08:12:57 -0700 Subject: [PATCH 13/20] up Signed-off-by: Jiajun Yao --- python/ray/_raylet.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx index de608d7581d8..619265e3f5cc 100644 --- a/python/ray/_raylet.pyx +++ b/python/ray/_raylet.pyx @@ -2212,7 +2212,6 @@ cdef execute_task_with_cancellation_handler( function_descriptor = CFunctionDescriptorToPython( ray_function.GetFunctionDescriptor()) if task_type == TASK_TYPE_ACTOR_CREATION_TASK: - print("runnning 2", file=sys.stderr) actor_class = manager.load_actor_class(job_id, function_descriptor) actor_id = core_worker.get_actor_id() print("runnning 2.2", file=sys.stderr) From 54bf744d053ebcc6adabfc732a767f8c3015bc5c Mon Sep 17 00:00:00 2001 From: Jiajun Yao Date: Sun, 21 Sep 2025 21:40:51 -0700 Subject: [PATCH 14/20] up Signed-off-by: Jiajun Yao --- python/ray/_raylet.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx index 619265e3f5cc..eef887a76ee9 100644 --- a/python/ray/_raylet.pyx +++ b/python/ray/_raylet.pyx @@ -2214,7 +2214,6 @@ cdef execute_task_with_cancellation_handler( if task_type == TASK_TYPE_ACTOR_CREATION_TASK: actor_class = manager.load_actor_class(job_id, function_descriptor) actor_id = core_worker.get_actor_id() - print("runnning 2.2", file=sys.stderr) actor = actor_class.__new__(actor_class) worker.actors[actor_id] = actor From 90f3dd5aa81a8d5101a0ac6f4e33734c95de8361 Mon Sep 17 00:00:00 2001 From: Jiajun Yao Date: Mon, 22 Sep 2025 09:39:39 -0700 Subject: [PATCH 15/20] up Signed-off-by: Jiajun Yao --- python/ray/_raylet.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx index eef887a76ee9..995a417b472f 100644 --- a/python/ray/_raylet.pyx +++ b/python/ray/_raylet.pyx @@ -2302,7 +2302,6 @@ cdef execute_task_with_cancellation_handler( # cancel tasks to fail. NULL) finally: - print("runnning 9", file=sys.stderr) with current_task_id_lock: current_task_id = None From 3c8bbdcad9d43a14971aa311a6f9b80fa4797ece Mon Sep 17 00:00:00 2001 From: Jiajun Yao Date: Thu, 2 Oct 2025 12:51:37 -0700 Subject: [PATCH 16/20] revert Signed-off-by: Jiajun Yao --- BUILD.bazel | 2 +- ci/build/build-manylinux-wheel.sh | 2 -- python/ray/actor.py | 8 -------- python/setup.py | 2 +- src/ray/core_worker/core_worker.cc | 1 - 5 files changed, 2 insertions(+), 13 deletions(-) diff --git a/BUILD.bazel b/BUILD.bazel index aacf4f1ef10c..c30d1b1f43ae 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -1,4 +1,4 @@ -# Bazel build test +# Bazel build # C/C++ documentation: https://docs.bazel.build/versions/master/be/c-cpp.html # ************************** IMPORTANT *********************** diff --git a/ci/build/build-manylinux-wheel.sh b/ci/build/build-manylinux-wheel.sh index 8a067152c0b5..b2b1abdadaa7 100755 --- a/ci/build/build-manylinux-wheel.sh +++ b/ci/build/build-manylinux-wheel.sh @@ -26,8 +26,6 @@ export BAZEL_PATH="$HOME"/bin/bazel # This is required for building with bazel. sudo ln -sf "/opt/python/${PYTHON}/bin/python3" /usr/local/bin/python3 -# export RAY_DEBUG_BUILD=debug - # build ray wheel PATH="/opt/python/${PYTHON}/bin:$PATH" RAY_INSTALL_JAVA=0 \ "/opt/python/${PYTHON}/bin/python" -m pip wheel -v -w dist . --no-deps diff --git a/python/ray/actor.py b/python/ray/actor.py index 2cbc1bea6558..07f808c90352 100644 --- a/python/ray/actor.py +++ b/python/ray/actor.py @@ -1152,10 +1152,6 @@ def _process_option_dict(actor_options, has_tensor_transport_methods): # enable tensor transport, unless it was explicitly set to False by the # user. if has_tensor_transport_methods: - import os - - print("jjyaooooooo cannot happen") - os._exit(1) if _filled_options["enable_tensor_transport"] is False: raise ValueError( "Actor class has methods with @ray.method(tensor_transport=...) decorator but @ray.remote(enable_tensor_transport=False). " @@ -1174,10 +1170,6 @@ def _process_option_dict(actor_options, has_tensor_transport_methods): # https://github.com/ray-project/ray/issues/54639 is fixed. enable_tensor_transport = _filled_options.get("enable_tensor_transport", False) if enable_tensor_transport: - import os - - print("jjyaooooooo cannot happen") - os._exit(1) if _filled_options.get("concurrency_groups", None) is None: _filled_options["concurrency_groups"] = {} _filled_options["concurrency_groups"]["_ray_system"] = 1 diff --git a/python/setup.py b/python/setup.py index 671ea0a3ea5a..60024b4b8c1f 100644 --- a/python/setup.py +++ b/python/setup.py @@ -83,7 +83,7 @@ def __init__( version = find_version("ray", "_version.py") # add .dbg suffix if debug mode is on. if build_type == BuildType.DEBUG: - self.version: str = f"{version}" + self.version: str = f"{version}+dbg" elif build_type == BuildType.ASAN: self.version: str = f"{version}+asan" elif build_type == BuildType.TSAN: diff --git a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc index aae9ec579bc9..62ef51ae0bdc 100644 --- a/src/ray/core_worker/core_worker.cc +++ b/src/ray/core_worker/core_worker.cc @@ -2854,7 +2854,6 @@ Status CoreWorker::ExecuteTask( /*generator_backpressure_num_objects=*/ task_spec.GeneratorBackpressureNumObjects(), /*tensor_transport=*/task_spec.TensorTransport()); - RAY_LOG(INFO) << "Task execution done"; // Get the reference counts for any IDs that we borrowed during this task, // remove the local reference for these IDs, and return the ref count info to From dad6e54bfad15bb9ebe2b3a8699d0c6f74cd1073 Mon Sep 17 00:00:00 2001 From: Jiajun Yao Date: Mon, 24 Nov 2025 13:08:55 -0800 Subject: [PATCH 17/20] up Signed-off-by: Jiajun Yao --- .buildkite/core.rayci.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.buildkite/core.rayci.yml b/.buildkite/core.rayci.yml index 03fd22cd934c..ff4b444968e2 100644 --- a/.buildkite/core.rayci.yml +++ b/.buildkite/core.rayci.yml @@ -367,7 +367,8 @@ steps: - bazel run //ci/ray_ci:test_in_docker -- //python/ray/... //doc/... core --install-mask all-ray-libraries --run-flaky-tests - --except-tags multi_gpu,cgroup + --except-tags multi_gpu,cgroup || true + - sleep 10000000 depends_on: - corebuild From b37eb8af71e56fbd91624d3464fcb8e702868053 Mon Sep 17 00:00:00 2001 From: Jiajun Yao Date: Mon, 24 Nov 2025 15:41:36 -0800 Subject: [PATCH 18/20] up Signed-off-by: Jiajun Yao --- .buildkite/core.rayci.yml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/.buildkite/core.rayci.yml b/.buildkite/core.rayci.yml index ff4b444968e2..fb446ed5d5ad 100644 --- a/.buildkite/core.rayci.yml +++ b/.buildkite/core.rayci.yml @@ -355,6 +355,19 @@ steps: - block-core-cpp-sanitizer-tests - corebuild + - label: ":ray: core: jjyao tests" + key: core_jjyao_flaky_tests + tags: + - python + instance_type: large + commands: + - bazel run //ci/ray_ci:test_in_docker -- //python/ray/tests:test_out_of_disk_space core + --install-mask all-ray-libraries + --except-tags multi_gpu,cgroup || true + - sleep 10000000 + depends_on: + - corebuild + - label: ":ray: core: flaky tests" key: core_flaky_tests tags: From 4ed73ddcb2d5749acf1b1c3e7ccc7a17115b7ade Mon Sep 17 00:00:00 2001 From: Jiajun Yao Date: Tue, 25 Nov 2025 10:54:50 -0800 Subject: [PATCH 19/20] Deflake test_put_out_of_disk Signed-off-by: Jiajun Yao --- .buildkite/core.rayci.yml | 16 +--------------- python/ray/tests/test_out_of_disk_space.py | 7 +++---- 2 files changed, 4 insertions(+), 19 deletions(-) diff --git a/.buildkite/core.rayci.yml b/.buildkite/core.rayci.yml index fb446ed5d5ad..03fd22cd934c 100644 --- a/.buildkite/core.rayci.yml +++ b/.buildkite/core.rayci.yml @@ -355,19 +355,6 @@ steps: - block-core-cpp-sanitizer-tests - corebuild - - label: ":ray: core: jjyao tests" - key: core_jjyao_flaky_tests - tags: - - python - instance_type: large - commands: - - bazel run //ci/ray_ci:test_in_docker -- //python/ray/tests:test_out_of_disk_space core - --install-mask all-ray-libraries - --except-tags multi_gpu,cgroup || true - - sleep 10000000 - depends_on: - - corebuild - - label: ":ray: core: flaky tests" key: core_flaky_tests tags: @@ -380,8 +367,7 @@ steps: - bazel run //ci/ray_ci:test_in_docker -- //python/ray/... //doc/... core --install-mask all-ray-libraries --run-flaky-tests - --except-tags multi_gpu,cgroup || true - - sleep 10000000 + --except-tags multi_gpu,cgroup depends_on: - corebuild diff --git a/python/ray/tests/test_out_of_disk_space.py b/python/ray/tests/test_out_of_disk_space.py index 3f29fc8757bf..5e290040521c 100644 --- a/python/ray/tests/test_out_of_disk_space.py +++ b/python/ray/tests/test_out_of_disk_space.py @@ -10,6 +10,7 @@ import pytest import ray +from ray._common.test_utils import wait_for_condition from ray.util.state import list_cluster_events @@ -57,14 +58,12 @@ def test_put_out_of_disk(shutdown_only): # ray.put doesn't work is that fallback allocation uses mmaped file # that doesn't neccssary allocate disk spaces. with create_tmp_file(250 * 1024 * 1024): - assert get_current_usage() > local_fs_capacity_threshold - time.sleep(1) + wait_for_condition(lambda: get_current_usage() > local_fs_capacity_threshold) with pytest.raises(ray.exceptions.OutOfDiskError): ray.put(np.random.rand(20 * 1024 * 1024)) # delete tmp file to reclaim space back. - assert get_current_usage() < local_fs_capacity_threshold - time.sleep(1) + wait_for_condition(lambda: get_current_usage() < local_fs_capacity_threshold) ray.put(np.random.rand(20 * 1024 * 1024)) From 2a709b358ad87ba42008f128854129cdbbc5b7cb Mon Sep 17 00:00:00 2001 From: Jiajun Yao Date: Tue, 25 Nov 2025 11:35:23 -0800 Subject: [PATCH 20/20] up Signed-off-by: Jiajun Yao --- python/ray/tests/test_out_of_disk_space.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/ray/tests/test_out_of_disk_space.py b/python/ray/tests/test_out_of_disk_space.py index 5e290040521c..cf882bbee50b 100644 --- a/python/ray/tests/test_out_of_disk_space.py +++ b/python/ray/tests/test_out_of_disk_space.py @@ -63,6 +63,9 @@ def test_put_out_of_disk(shutdown_only): ray.put(np.random.rand(20 * 1024 * 1024)) # delete tmp file to reclaim space back. + # Ideally get_current_usage() should immediately reflect the latest disk usage + # after the tmp file is deleted, but somehow there is some delays on CI machines + # so I use wait_for_condition here. wait_for_condition(lambda: get_current_usage() < local_fs_capacity_threshold) ray.put(np.random.rand(20 * 1024 * 1024))