diff --git a/python/ray/util/spark/cluster_init.py b/python/ray/util/spark/cluster_init.py
index 129f1677b6b8..ee0bcadf1ecc 100644
--- a/python/ray/util/spark/cluster_init.py
+++ b/python/ray/util/spark/cluster_init.py
@@ -703,6 +703,7 @@ def _setup_ray_cluster(
         extra_env={
             RAY_ON_SPARK_COLLECT_LOG_TO_PATH: collect_log_to_path or "",
             RAY_ON_SPARK_START_RAY_PARENT_PID: str(os.getpid()),
+            **start_hook.custom_environment_variables(),
         },
     )
     spark_job_server = None
@@ -1511,10 +1512,13 @@ def ray_cluster_job_mapper(_):
         if ray_temp_dir is not None:
             ray_worker_node_cmd.append(f"--temp-dir={ray_temp_dir}")
 
+        hook_entry = _create_hook_entry(is_global=(ray_temp_dir is None))
+
         ray_worker_node_extra_envs = {
             RAY_ON_SPARK_COLLECT_LOG_TO_PATH: collect_log_to_path or "",
             RAY_ON_SPARK_START_RAY_PARENT_PID: str(os.getpid()),
             "RAY_ENABLE_WINDOWS_OR_OSX_CLUSTER": "1",
+            **hook_entry.custom_environment_variables(),
         }
 
         if num_gpus_per_node > 0:
@@ -1780,10 +1784,13 @@ def start(
         )
         ray_head_node_cmd.extend(_convert_ray_node_options(head_node_options))
 
+        hook_entry = _create_hook_entry(is_global=(ray_temp_dir is None))
+
         extra_env = {
             "AUTOSCALER_UPDATE_INTERVAL_S": "1",
             RAY_ON_SPARK_COLLECT_LOG_TO_PATH: collect_log_to_path or "",
             RAY_ON_SPARK_START_RAY_PARENT_PID: str(os.getpid()),
+            **hook_entry.custom_environment_variables(),
         }
 
         self.ray_head_node_cmd = ray_head_node_cmd
diff --git a/python/ray/util/spark/databricks_hook.py b/python/ray/util/spark/databricks_hook.py
index 750ffb13d9b4..97aa120ac7a0 100644
--- a/python/ray/util/spark/databricks_hook.py
+++ b/python/ray/util/spark/databricks_hook.py
@@ -167,3 +167,13 @@ def auto_shutdown_watcher():
     def on_spark_job_created(self, job_group_id):
         db_api_entry = get_db_entry_point()
         db_api_entry.registerBackgroundSparkJobGroup("job_group_id")
+
+    def custom_environment_variables(self):
+        """Hardcode `GLOO_SOCKET_IFNAME` to `eth0` for Databricks runtime.
+
+        Torch on DBR does not reliably detect the correct interface to use,
+        and ends up selecting the loopback interface, breaking cross-node
+        communication."""
+        return {
+            "GLOO_SOCKET_IFNAME": "eth0",
+        }
diff --git a/python/ray/util/spark/start_hook_base.py b/python/ray/util/spark/start_hook_base.py
index a45c216d6659..d51dbec3a02b 100644
--- a/python/ray/util/spark/start_hook_base.py
+++ b/python/ray/util/spark/start_hook_base.py
@@ -13,3 +13,6 @@ def on_cluster_created(self, ray_cluster_handler):
 
     def on_spark_job_created(self, job_group_id):
         pass
+
+    def custom_environment_variables(self):
+        return {}
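
For illustration, here is a minimal sketch of the extension point this diff introduces: a start hook overrides `custom_environment_variables()`, and the cluster launcher merges the returned dict into the environment of the head and worker nodes it starts. The subclass name `EthOnlyStartHook` and the standalone merge snippet are hypothetical; only `RayOnSparkStartHook`, `custom_environment_variables()`, and the `**...custom_environment_variables()` merge pattern come from the diff above.

```python
from ray.util.spark.start_hook_base import RayOnSparkStartHook


class EthOnlyStartHook(RayOnSparkStartHook):
    """Hypothetical hook subclass, mirroring the Databricks hook in this diff."""

    def custom_environment_variables(self):
        # Pin Gloo to eth0 so torch collective ops don't pick the loopback
        # interface (same override the Databricks hook returns above).
        return {"GLOO_SOCKET_IFNAME": "eth0"}


# Merge pattern as used in cluster_init.py above; constructing the hook
# directly here assumes the base class takes no constructor arguments.
hook = EthOnlyStartHook()
extra_env = {
    "AUTOSCALER_UPDATE_INTERVAL_S": "1",
    **hook.custom_environment_variables(),
}
```

Because the hook's entries are spliced in last, a hook can add new variables or override the defaults already present in the node's `extra_env` dict.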