# A unique identifier for the head node and workers of this cluster.
cluster_name: minimal-i3
# The maximum number of worker nodes to launch in addition to the head
# node. min_workers defaults to 0; here both bounds are set to {num_workers}
# so the cluster launches at a fixed size.
max_workers: {num_workers}
min_workers: {num_workers}
# Cloud-provider specific configuration.
provider:
type: aws
region: us-west-2
availability_zone: us-west-2a
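# Optional provider field (not in the original template, shown for illustration):
# setting cache_stopped_nodes to False terminates nodes on teardown instead of
# stopping them for later reuse.
# cache_stopped_nodes: False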
# How Ray will authenticate with newly launched nodes.
auth:
ssh_user: ubuntu
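# Optionally, authenticate with an existing key pair instead of the one Ray
# generates (illustrative path, not part of the original template; if you set
# this, also set KeyName in the node_config sections below).
# ssh_private_key: /path/to/your/key.pem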
# Tell the autoscaler the allowed node types and the resources they provide.
# The key is the name of the node type, which is just for debugging purposes.
# The node config specifies the launch config and physical instance type.
available_node_types:
ray.head.default:
# The minimum number of worker nodes of this type to launch.
# This number should be >= 0.
min_workers: 0
# The maximum number of worker nodes of this type to launch.
# This takes precedence over min_workers.
max_workers: 0
# The node type's CPU and GPU resources are auto-detected based on AWS instance type.
# If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
# You can also set custom resources.
# For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
# resources: {"CPU": 1, "GPU": 1, "custom": 5}
resources: {}
# Provider-specific config for this node type, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
node_config:
InstanceType: i3.8xlarge
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
# You can provision additional disk space with a block-device configuration like the following:
BlockDeviceMappings:
- DeviceName: /dev/sda1
Ebs:
VolumeSize: 1000 # EBS volume size in GiB
# Additional options in the boto docs.
ray.worker.default:
# The minimum number of worker nodes of this type to launch.
# This number should be >= 0.
min_workers: {num_workers}
# The maximum number of worker nodes of this type to launch.
# This takes precedence over min_workers.
max_workers: {num_workers}
# The node type's CPU and GPU resources are auto-detected based on AWS instance type.
# If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
# You can also set custom resources.
# For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
# resources: {"CPU": 1, "GPU": 1, "custom": 5}
resources: {}
# Provider-specific config for this node type, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
node_config:
InstanceType: i3.8xlarge
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
# Run workers on spot by default. Comment this out to use on-demand.
#InstanceMarketOptions:
# MarketType: spot
# Additional options can be found in the boto docs, e.g.
# SpotOptions:
# MaxPrice: MAX_HOURLY_PRICE
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
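# (illustrative placeholder paths, not part of the original template):
#     "/path/on/remote/machine": "/path/on/local/machine"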
file_mounts: {}
# Shell commands run on each node to set it up. The first few commands create
# (or reuse) a Python 3.8 conda environment named dask-38 and put it on PATH,
# then install a Ray nightly wheel plus pinned Dask/Distributed dependencies.
setup_commands:
- source activate dask-38 || (conda create -n dask-38 -y python=3.8)
- echo 'export PATH="$HOME/anaconda3/envs/dask-38/bin:$PATH"' >> ~/.bashrc
- echo 'export PATH="$HOME/anaconda3/envs/dask-38/bin:$PATH"' >> ~/.profile
- $HOME/anaconda3/envs/dask-38/bin/pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/master/b986938f0ff22f86cb87aab9f3e0d4fa36457b8f/ray-2.0.0.dev0-cp38-cp38-manylinux2014_x86_64.whl --ignore-installed
- $HOME/anaconda3/envs/dask-38/bin/pip install --upgrade dask==2021.3.0 distributed==2021.3.0 s3fs boto3 click==7 pyarrow fastparquet --ignore-installed
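# The commands below format and mount the four local NVMe SSDs of the
# i3.8xlarge (/dev/nvme0n1 .. /dev/nvme3n1) at /data0 .. /data3. These mount
# points are reused as object-spilling directories in head_start_ray_commands.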
- if ! sudo mountpoint -q /data0; then sudo mkfs -t ext4 /dev/nvme0n1 && sudo mkdir -p /data0 && sudo mount /dev/nvme0n1 /data0 && sudo chown -R ubuntu:ubuntu /data0 && sudo chmod 777 /data0; fi
- sudo chown -R ubuntu:ubuntu /data0 && sudo chmod 777 /data0
- if ! sudo mountpoint -q /data1; then sudo mkfs -t ext4 /dev/nvme1n1 && sudo mkdir -p /data1 && sudo mount /dev/nvme1n1 /data1 && sudo chown -R ubuntu:ubuntu /data1 && sudo chmod 777 /data1; fi
- sudo chown -R ubuntu:ubuntu /data1 && sudo chmod 777 /data1
- if ! sudo mountpoint -q /data2; then sudo mkfs -t ext4 /dev/nvme2n1 && sudo mkdir -p /data2 && sudo mount /dev/nvme2n1 /data2 && sudo chown -R ubuntu:ubuntu /data2 && sudo chmod 777 /data2; fi
- sudo chown -R ubuntu:ubuntu /data2 && sudo chmod 777 /data2
- if ! sudo mountpoint -q /data3; then sudo mkfs -t ext4 /dev/nvme3n1 && sudo mkdir -p /data3 && sudo mount /dev/nvme3n1 /data3 && sudo chown -R ubuntu:ubuntu /data3 && sudo chmod 777 /data3; fi
- sudo chown -R ubuntu:ubuntu /data3 && sudo chmod 777 /data3
- rm -rf dask-benchmarks && git clone https://github.com/stephanie-wang/dask-on-ray-blog.git dask-benchmarks
# Commands to start Ray on the head node; object spilling is directed at the
# NVMe mounts created in setup_commands.
head_start_ray_commands:
- ray stop -f
- ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config ~/ray_bootstrap_config.yaml --system-config='{"object_spilling_config":"{\"type\":\"filesystem\",\"params\":{\"directory_path\":[\"/data0/spill\",\"/data1/spill\",\"/data2/spill\",\"/data3/spill\"]}}","plasma_unlimited":true}'
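# For readability, the --system-config JSON embedded above is equivalent to:
#   object_spilling_config:
#     type: filesystem
#     params:
#       directory_path: ["/data0/spill", "/data1/spill", "/data2/spill", "/data3/spill"]
#   plasma_unlimited: true
# so objects that overflow the object store are spilled to the NVMe mounts.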
# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
- ray stop -f
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
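# Usage sketch (assumption: {num_workers} is filled in by plain string
# substitution before the file is handed to the Ray cluster launcher):
#   sed 's/{num_workers}/32/g' cluster.yaml.template > cluster.yaml
#   ray up -y cluster.yaml       # launch the head node and workers
#   ray attach cluster.yaml      # ssh into the head node
#   ray down -y cluster.yaml     # tear the cluster down when done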