From 42880e81423f411194d568c211586348d05f9473 Mon Sep 17 00:00:00 2001 From: Fangchi Wang Date: Fri, 8 Sep 2023 02:55:45 +0800 Subject: [PATCH] [Doc] Add vSphere cluster configuration reference with examples (#39379) Similar to other providers, we add example-minimal.yaml and example-full.yaml to vSphere autoscaler. And we add and refine vSphere related references in the Getting Started guide as well as the cluster configuration reference page, based on the newly added examples. Why are these changes needed? In PR #37815 we've added vSphere platform support to Ray Autoscaler. However, the related documents are not sufficient. This follow-up change adds related examples similar to other platforms. The related documents including the getting-started guide as well as the cluster configuration reference also need to be updated to include descriptions specific for vSphere. We will do another follow-up PR to add a "Launching Ray Clusters on vSphere" user guide at https://docs.ray.io/en/latest/cluster/vms/user-guides/launching-clusters/index.html Signed-off-by: Fangchi Wang --- doc/source/cluster/vms/getting-started.rst | 26 +-- .../references/ray-cluster-configuration.rst | 206 ++++++++++++++++++ .../vms/user-guides/community/index.rst | 4 +- python/ray/autoscaler/vsphere/defaults.yaml | 23 +- .../ray/autoscaler/vsphere/example-full.yaml | 132 +++++++++++ .../autoscaler/vsphere/example-minimal.yaml | 6 + 6 files changed, 365 insertions(+), 32 deletions(-) create mode 100644 python/ray/autoscaler/vsphere/example-full.yaml create mode 100755 python/ray/autoscaler/vsphere/example-minimal.yaml diff --git a/doc/source/cluster/vms/getting-started.rst b/doc/source/cluster/vms/getting-started.rst index 71b2898f9522c..09df4ef0ebe55 100644 --- a/doc/source/cluster/vms/getting-started.rst +++ b/doc/source/cluster/vms/getting-started.rst @@ -109,7 +109,7 @@ Next, if you're not set up to use your cloud provider from the command line, you .. 
code-block:: shell - $ export VSPHERE_SERVER=192.168.0.1 # Enter your vSphere IP + $ export VSPHERE_SERVER=192.168.0.1 # Enter your vSphere vCenter Address $ export VSPHERE_USER=user # Enter your username $ export VSPHERE_PASSWORD=password # Enter your password @@ -262,28 +262,8 @@ A minimal sample cluster configuration file looks as follows: .. tab:: vSphere - .. code-block:: yaml - - # A unique identifier for the head node and workers of this cluster. - cluster_name: minimal - - # Cloud-provider specific configuration. - provider: - type: vsphere - - auth: - ssh_user: ray # The VMs are initialised with an user called ray. - - available_node_types: - ray.head.default: - node_config: - resource_pool: ray # Resource pool where the Ray cluster will get created - library_item: ray-head-debian # OVF file name from which the head will be created - - worker: - node_config: - clone: True # If True, all the workers will be instant-cloned from a frozen VM - library_item: ray-frozen-debian # The OVF file from which a frozen VM will be created + .. literalinclude:: ../../../../python/ray/autoscaler/vsphere/example-minimal.yaml + :language: yaml Save this configuration file as ``config.yaml``. You can specify a lot more details in the configuration file: instance types to use, minimum and maximum number of workers to start, autoscaling strategy, files to sync, and more. For a full reference on the available configuration properties, please refer to the :ref:`cluster YAML configuration options reference `. diff --git a/doc/source/cluster/vms/references/ray-cluster-configuration.rst b/doc/source/cluster/vms/references/ray-cluster-configuration.rst index dffe95234140e..d821db8e2b765 100644 --- a/doc/source/cluster/vms/references/ray-cluster-configuration.rst +++ b/doc/source/cluster/vms/references/ray-cluster-configuration.rst @@ -96,6 +96,12 @@ Auth :ref:`ssh_user `: str :ref:`ssh_private_key `: str + .. tab-item:: vSphere + + .. parsed-literal:: + + :ref:`ssh_user `: str + .. 
_cluster-configuration-provider-type: Provider @@ -137,6 +143,14 @@ Provider :ref:`cache_stopped_nodes `: bool :ref:`use_internal_ips `: bool + .. tab-item:: vSphere + + .. parsed-literal:: + + :ref:`type `: str + :ref:`vsphere_config `: + :ref:`vSphere Config ` + .. _cluster-configuration-security-group-type: Security Group @@ -152,6 +166,35 @@ Security Group :ref:`IpPermissions `: - `IpPermission `_ +.. _cluster-configuration-vsphere-config-type: + +vSphere Config +~~~~~~~~~~~~~~ + +.. tab-set:: + + .. tab-item:: vSphere + + .. parsed-literal:: + + :ref:`credentials `: + :ref:`vSphere Credentials ` + +.. _cluster-configuration-vsphere-credentials-type: + +vSphere Credentials +~~~~~~~~~~~~~~~~~~~ + +.. tab-set:: + + .. tab-item:: vSphere + + .. parsed-literal:: + + :ref:`user `: str + :ref:`password `: str + :ref:`server `: str + .. _cluster-configuration-node-types-type: Node types @@ -204,6 +247,20 @@ nodes with the newly applied ``node_config`` will then be created according to c A YAML object as defined in `the GCP docs `_. + .. tab-item:: vSphere + + .. parsed-literal:: + + # The resource pool where the head node should live, if unset, will be + # the frozen VM's resource pool. + resource_pool: str + # Mandatory: The frozen VM name from which the head node will be instant-cloned. + frozen_vm_name: str + # The datastore to store the vmdk of the head node vm, if unset, will be + # the frozen VM's datastore. + datastore: str + + .. _cluster-configuration-node-docker-type: Node Docker @@ -738,6 +795,10 @@ The user that Ray will authenticate with when launching new nodes. * **Importance:** Low * **Type:** String + .. tab-item:: vSphere + + Not available. The vSphere provider expects the key to be located at a fixed path ``~/ray-bootstrap-key.pem`` and will automatically generate one if not found. + .. _cluster-configuration-ssh-public-key: ``auth.ssh_public_key`` @@ -761,6 +822,10 @@ The user that Ray will authenticate with when launching new nodes. Not available. 
+ .. tab-item:: vSphere + + Not available. + .. _cluster-configuration-type: ``provider.type`` @@ -792,6 +857,14 @@ The user that Ray will authenticate with when launching new nodes. * **Importance:** High * **Type:** String + .. tab-item:: vSphere + + The cloud service provider. For vSphere and VCF, this must be set to ``vsphere``. + + * **Required:** Yes + * **Importance:** High + * **Type:** String + .. _cluster-configuration-region: ``provider.region`` @@ -821,6 +894,10 @@ The user that Ray will authenticate with when launching new nodes. * **Type:** String * **Default:** us-west1 + .. tab-item:: vSphere + + Not available. + .. _cluster-configuration-availability-zone: ``provider.availability_zone`` @@ -852,6 +929,10 @@ The user that Ray will authenticate with when launching new nodes. * **Type:** String * **Default:** us-west1-a + .. tab-item:: vSphere + + Not available. + .. _cluster-configuration-location: ``provider.location`` @@ -876,6 +957,10 @@ The user that Ray will authenticate with when launching new nodes. Not available. + .. tab-item:: vSphere + + Not available. + .. _cluster-configuration-resource-group: ``provider.resource_group`` @@ -900,6 +985,10 @@ The user that Ray will authenticate with when launching new nodes. Not available. + .. tab-item:: vSphere + + Not available. + .. _cluster-configuration-subscription-id: ``provider.subscription_id`` @@ -924,6 +1013,10 @@ The user that Ray will authenticate with when launching new nodes. Not available. + .. tab-item:: vSphere + + Not available. + .. _cluster-configuration-project-id: ``provider.project_id`` @@ -948,6 +1041,10 @@ The user that Ray will authenticate with when launching new nodes. * **Type:** String * **Default:** ``null`` + .. tab-item:: vSphere + + Not available. + .. _cluster-configuration-cache-stopped-nodes: ``provider.cache_stopped_nodes`` @@ -1005,6 +1102,37 @@ controlled by your cloud provider's configuration. Not available. + .. tab-item:: vSphere + + Not available. + +.. 
_cluster-configuration-vsphere-config: + +``provider.vsphere_config`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. tab-set:: + + .. tab-item:: AWS + + Not available. + + .. tab-item:: Azure + + Not available. + + .. tab-item:: GCP + + Not available. + + .. tab-item:: vSphere + + vSphere configurations used to connect to vCenter Server. If not configured, + the VSPHERE_* environment variables will be used. + + * **Required:** No + * **Importance:** Low + * **Type:** :ref:`vSphere Config ` .. _cluster-configuration-group-name: @@ -1029,6 +1157,50 @@ The inbound rules associated with the security group. * **Importance:** Medium * **Type:** `IpPermission `_ +.. _cluster-configuration-vsphere-credentials: + +``vsphere_config.credentials`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The credential to connect to the vSphere vCenter Server. + +* **Required:** No +* **Importance:** Low +* **Type:** :ref:`vSphere Credentials ` + +.. _cluster-configuration-vsphere-user: + +``vsphere_config.credentials.user`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Username to connect to vCenter Server. + +* **Required:** No +* **Importance:** Low +* **Type:** String + +.. _cluster-configuration-vsphere-password: + +``vsphere_config.credentials.password`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Password of the user to connect to vCenter Server. + +* **Required:** No +* **Importance:** Low +* **Type:** String + +.. _cluster-configuration-vsphere-server: + +``vsphere_config.credentials.server`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The vSphere vCenter Server address. + +* **Required:** No +* **Importance:** Low +* **Type:** String + .. _cluster-configuration-node-config: ``available_node_types..node_type.node_config`` @@ -1127,6 +1299,14 @@ A list of commands to run to set up worker nodes of this type. These commands wi * **Importance:** High * **Type:** Integer + .. tab-item:: vSphere + + The number of CPUs made available by this node. If not configured, the nodes will use the same settings as the frozen VM. 
+ + * **Required:** No + * **Importance:** High + * **Type:** Integer + .. _cluster-configuration-gpu: @@ -1193,6 +1373,14 @@ A list of commands to run to set up worker nodes of this type. These commands wi * **Importance:** High * **Type:** Integer + .. tab-item:: vSphere + + The memory in bytes allocated for python worker heap memory on the node. + If not configured, the node will use the same memory settings as the frozen VM. + + * **Required:** No + * **Importance:** High + * **Type:** Integer .. _cluster-configuration-object-store-memory: @@ -1225,6 +1413,14 @@ A list of commands to run to set up worker nodes of this type. These commands wi * **Importance:** High * **Type:** Integer + .. tab-item:: vSphere + + The memory in bytes allocated for the object store on the node. + + * **Required:** No + * **Importance:** High + * **Type:** Integer + .. _cluster-configuration-node-docker: ``available_node_types..docker`` @@ -1260,6 +1456,11 @@ Minimal configuration .. literalinclude:: ../../../../../python/ray/autoscaler/gcp/example-minimal.yaml :language: yaml + .. tab-item:: vSphere + + .. literalinclude:: ../../../../../python/ray/autoscaler/vsphere/example-minimal.yaml + :language: yaml + Full configuration ~~~~~~~~~~~~~~~~~~ @@ -1280,6 +1481,11 @@ Full configuration .. literalinclude:: ../../../../../python/ray/autoscaler/gcp/example-full.yaml :language: yaml + .. tab-item:: vSphere + + .. literalinclude:: ../../../../../python/ray/autoscaler/vsphere/example-full.yaml + :language: yaml + TPU Configuration ~~~~~~~~~~~~~~~~~ diff --git a/doc/source/cluster/vms/user-guides/community/index.rst b/doc/source/cluster/vms/user-guides/community/index.rst index 5f72eedd6db99..2c5c25feb9f94 100644 --- a/doc/source/cluster/vms/user-guides/community/index.rst +++ b/doc/source/cluster/vms/user-guides/community/index.rst @@ -5,7 +5,7 @@ Community Supported Cluster Managers .. 
note:: - If you're using AWS, Azure or GCP you can use the :ref:`Ray cluster launcher ` to simplify the cluster setup process. + If you're using AWS, Azure, GCP or vSphere you can use the :ref:`Ray cluster launcher ` to simplify the cluster setup process. The following is a list of community supported cluster managers. @@ -22,7 +22,7 @@ The following is a list of community supported cluster managers. Using a custom cloud or cluster manager ======================================= -The Ray cluster launcher currently supports AWS, Azure, GCP, Aliyun and Kuberay out of the box. To use the Ray cluster launcher and Autoscaler on other cloud providers or cluster managers, you can implement the `node_provider.py `_ interface (100 LOC). +The Ray cluster launcher currently supports AWS, Azure, GCP, Aliyun, vSphere and Kuberay out of the box. To use the Ray cluster launcher and Autoscaler on other cloud providers or cluster managers, you can implement the `node_provider.py `_ interface (100 LOC). Once the node provider is implemented, you can register it in the `provider section `_ of the cluster launcher config. .. code-block:: yaml diff --git a/python/ray/autoscaler/vsphere/defaults.yaml b/python/ray/autoscaler/vsphere/defaults.yaml index 88f7eb7151f10..5f74d0afe093f 100755 --- a/python/ray/autoscaler/vsphere/defaults.yaml +++ b/python/ray/autoscaler/vsphere/defaults.yaml @@ -14,7 +14,15 @@ upscaling_speed: 1.0 # This executes all commands on all nodes in the docker container, # and opens all the necessary ports to support the Ray cluster. # Empty string means disabled. -docker: {} +docker: + image: "rayproject/ray-ml:latest" + # image: rayproject/ray:latest # use this one if you don't need ML dependencies, it's faster to pull + container_name: "ray_container" + # If true, pulls latest version of image. Otherwise, `docker run` will only pull the image + # if no cached version is present. 
+ pull_before_run: True + run_options: # Extra options to pass into "docker run" + - --ulimit nofile=65536:65536 # If a node is idle for this many minutes, it will be removed. idle_timeout_minutes: 5 @@ -23,12 +31,13 @@ idle_timeout_minutes: 5 provider: type: vsphere - vsphere_config: - credentials: - admin_user: administrator@vsphere.local - admin_password: vc_password - server: 10.186.231.56 - datacenter: Datacenter +# Credentials configured here will take precedence over credentials set in the +# environment variables. +# vsphere_config: +# credentials: +# user: vc_username +# password: vc_password +# server: vc_address # How Ray will authenticate with newly launched nodes. auth: diff --git a/python/ray/autoscaler/vsphere/example-full.yaml b/python/ray/autoscaler/vsphere/example-full.yaml new file mode 100644 index 0000000000000..c4044ee9576af --- /dev/null +++ b/python/ray/autoscaler/vsphere/example-full.yaml @@ -0,0 +1,132 @@ +# A unique identifier for the head node and workers of this cluster. +cluster_name: default + +# The maximum number of worker nodes to launch in addition to the head +# node. +max_workers: 2 + +# The autoscaler will scale up the cluster faster with higher upscaling speed. +# E.g., if the task requires adding more nodes then autoscaler will gradually +# scale up the cluster in chunks of upscaling_speed*currently_running_nodes. +# This number should be > 0. +upscaling_speed: 1.0 + +# This executes all commands on all nodes in the docker container, +# and opens all the necessary ports to support the Ray cluster. +# Empty string means disabled. +docker: + image: "rayproject/ray-ml:latest" + # image: rayproject/ray:latest # use this one if you don't need ML dependencies, it's faster to pull + container_name: "ray_container" + # If true, pulls latest version of image. Otherwise, `docker run` will only pull the image + # if no cached version is present. 
+ pull_before_run: True + run_options: # Extra options to pass into "docker run" + - --ulimit nofile=65536:65536 + +# If a node is idle for this many minutes, it will be removed. +idle_timeout_minutes: 5 + +# Cloud-provider specific configuration. +provider: + type: vsphere + +# Credentials configured here will take precedence over credentials set in the +# environment variables. +# vsphere_config: +# credentials: +# user: vc_username +# password: vc_password +# server: vc_address + +# How Ray will authenticate with newly launched nodes. +auth: + ssh_user: ray +# By default Ray creates a new private keypair, but you can also use your own. +# If you do so, make sure to also set "KeyName" in the head and worker node +# configurations below. +# ssh_private_key: /path/to/your/key.pem + +# Tell the autoscaler the allowed node types and the resources they provide. +# The key is the name of the node type, which is just for debugging purposes. +# The node config specifies the launch config and physical instance type. +available_node_types: + ray.head.default: + # For example, {"CPU": 4, "Memory": 8192} + resources: {} + node_config: + # The resource pool where the head node should live, if unset, will be + # the frozen VM's resource pool. + resource_pool: + # Mandatory: The frozen VM name from which the head node will be instant-cloned. + frozen_vm_name: frozen-vm + # The datastore to store the vmdk of the head node vm, if unset, will be + # the frozen VM's datastore. + datastore: + worker: + # The minimum number of nodes of this type to launch. + # This number should be >= 0. + min_workers: 1 + # For example, {"CPU": 4, "Memory": 8192} + resources: {} + node_config: + # The resource pool where the worker node should live, if unset, will be + # the frozen VM's resource pool. + resource_pool: + # Mandatory: The frozen VM name from which the worker node will be instant-cloned. 
+ frozen_vm_name: frozen-vm + # The datastore to store the vmdk(s) of the worker node vm(s), if unset, will be + # the frozen VM's datastore. + datastore: + +# Specify the node type of the head node (as configured above). +head_node_type: ray.head.default + +# Files or directories to copy to the head and worker nodes. The format is a +# dictionary from REMOTE_PATH: LOCAL_PATH, e.g. +file_mounts: { +# "/path1/on/remote/machine": "/path1/on/local/machine", +# "/path2/on/remote/machine": "/path2/on/local/machine", +} + +# Files or directories to copy from the head node to the worker nodes. The format is a +# list of paths. The same path on the head node will be copied to the worker node. +# This behavior is a subset of the file_mounts behavior. In the vast majority of cases +# you should just use file_mounts. Only use this if you know what you're doing! +cluster_synced_files: [] + +# Whether changes to directories in file_mounts or cluster_synced_files in the head node +# should sync to the worker node continuously +file_mounts_sync_continuously: False + +# Patterns for files to exclude when running rsync up or rsync down +rsync_exclude: [] + +# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for +# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided +# as a value, the behavior will match git's behavior for finding and using .gitignore files. +rsync_filter: [] + +# List of commands that will be run before `setup_commands`. If docker is +# enabled, these commands will run outside the container and before docker +# is setup. +initialization_commands: [] + +# List of shell commands to run to set up nodes. +setup_commands: [] + +# Custom commands that will be run on the head node after common setup. +head_setup_commands: [] + +# Custom commands that will be run on worker nodes after common setup. 
+worker_setup_commands: [] + +# Command to start ray on the head node. You don't need to change this. +head_start_ray_commands: + - ray stop + - ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host=0.0.0.0 + +# Command to start ray on worker nodes. You don't need to change this. +worker_start_ray_commands: + - ray stop + - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 diff --git a/python/ray/autoscaler/vsphere/example-minimal.yaml b/python/ray/autoscaler/vsphere/example-minimal.yaml new file mode 100755 index 0000000000000..2f6952628553b --- /dev/null +++ b/python/ray/autoscaler/vsphere/example-minimal.yaml @@ -0,0 +1,6 @@ +# A unique identifier for the head node and workers of this cluster. +cluster_name: minimal + +# Cloud-provider specific configuration. +provider: + type: vsphere