# A unique identifier for the head node and workers of this cluster.
cluster_name: minimal-i3
# The maximum number of worker nodes to launch in addition to the head
# node. min_workers defaults to 0; here both bounds are set to {num_workers}
# so the cluster launches at a fixed size.
max_workers: {num_workers}
min_workers: {num_workers}
# Cloud-provider specific configuration.
provider:
type: aws
region: us-west-2
availability_zone: us-west-2a
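# Optional provider field (not in the original template, shown for illustration):
# setting cache_stopped_nodes to False terminates nodes on teardown instead of
# stopping them for later reuse.
# cache_stopped_nodes: False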
# How Ray will authenticate with newly launched nodes.
auth:
ssh_user: ubuntu
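# Optionally, authenticate with an existing key pair instead of the one Ray
# generates (illustrative path, not part of the original template; if you set
# this, also set KeyName in the node_config sections below).
# ssh_private_key: /path/to/your/key.pem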
# Tell the autoscaler the allowed node types and the resources they provide.
# The key is the name of the node type, which is just for debugging purposes.
# The node config specifies the launch config and physical instance type.
available_node_types:
ray.head.default:
# The minimum number of worker nodes of this type to launch.
# This number should be >= 0.
min_workers: 0
# The maximum number of worker nodes of this type to launch.
# This takes precedence over min_workers.
max_workers: 0
# The node type's CPU and GPU resources are auto-detected based on AWS instance type.
# If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
# You can also set custom resources.
# For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
# resources: {"CPU": 1, "GPU": 1, "custom": 5}
resources: {}
# Provider-specific config for this node type, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
node_config:
InstanceType: i3.8xlarge
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
# You can provision additional disk space with a block-device configuration like the following:
BlockDeviceMappings:
- DeviceName: /dev/sda1
Ebs:
VolumeSize: 1000 # EBS volume size in GiB
# Additional options in the boto docs.
ray.worker.default:
# The minimum number of worker nodes of this type to launch.
# This number should be >= 0.
min_workers: {num_workers}
# The maximum number of worker nodes of this type to launch.
# This takes precedence over min_workers.
max_workers: {num_workers}
# The node type's CPU and GPU resources are auto-detected based on AWS instance type.
# If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
# You can also set custom resources.
# For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
# resources: {"CPU": 1, "GPU": 1, "custom": 5}
resources: {}
# Provider-specific config for this node type, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
node_config:
InstanceType: i3.8xlarge
ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
# Run workers on spot by default. Comment this out to use on-demand.
#InstanceMarketOptions:
# MarketType: spot
# Additional options can be found in the boto docs, e.g.
# SpotOptions:
# MaxPrice: MAX_HOURLY_PRICE
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
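# (illustrative placeholder paths, not part of the original template):
#     "/path/on/remote/machine": "/path/on/local/machine"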
file_mounts: {}
# Shell commands run on each node to set it up. The first few commands create
# (or reuse) a Python 3.8 conda environment named dask-38 and put it on PATH,
# then install a Ray nightly wheel plus pinned Dask/Distributed dependencies.
setup_commands:
- source activate dask-38 || (conda create -n dask-38 -y python=3.8)
- echo 'export PATH="$HOME/anaconda3/envs/dask-38/bin:$PATH"' >> ~/.bashrc
- echo 'export PATH="$HOME/anaconda3/envs/dask-38/bin:$PATH"' >> ~/.profile
- $HOME/anaconda3/envs/dask-38/bin/pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/master/b986938f0ff22f86cb87aab9f3e0d4fa36457b8f/ray-2.0.0.dev0-cp38-cp38-manylinux2014_x86_64.whl --ignore-installed
- $HOME/anaconda3/envs/dask-38/bin/pip install --upgrade dask==2021.3.0 distributed==2021.3.0 s3fs boto3 click==7 pyarrow fastparquet --ignore-installed
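# The commands below format and mount the four local NVMe SSDs of the
# i3.8xlarge (/dev/nvme0n1 .. /dev/nvme3n1) at /data0 .. /data3. These mount
# points are reused as object-spilling directories in head_start_ray_commands.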
- if ! sudo mountpoint -q /data0; then sudo mkfs -t ext4 /dev/nvme0n1 && sudo mkdir -p /data0 && sudo mount /dev/nvme0n1 /data0 && sudo chown -R ubuntu:ubuntu /data0 && sudo chmod 777 /data0; fi
- sudo chown -R ubuntu:ubuntu /data0 && sudo chmod 777 /data0
- if ! sudo mountpoint -q /data1; then sudo mkfs -t ext4 /dev/nvme1n1 && sudo mkdir -p /data1 && sudo mount /dev/nvme1n1 /data1 && sudo chown -R ubuntu:ubuntu /data1 && sudo chmod 777 /data1; fi
- sudo chown -R ubuntu:ubuntu /data1 && sudo chmod 777 /data1
- if ! sudo mountpoint -q /data2; then sudo mkfs -t ext4 /dev/nvme2n1 && sudo mkdir -p /data2 && sudo mount /dev/nvme2n1 /data2 && sudo chown -R ubuntu:ubuntu /data2 && sudo chmod 777 /data2; fi
- sudo chown -R ubuntu:ubuntu /data2 && sudo chmod 777 /data2
- if ! sudo mountpoint -q /data3; then sudo mkfs -t ext4 /dev/nvme3n1 && sudo mkdir -p /data3 && sudo mount /dev/nvme3n1 /data3 && sudo chown -R ubuntu:ubuntu /data3 && sudo chmod 777 /data3; fi
- sudo chown -R ubuntu:ubuntu /data3 && sudo chmod 777 /data3
- rm -rf dask-benchmarks && git clone https://github.com/stephanie-wang/dask-on-ray-blog.git dask-benchmarks
# Commands to start Ray on the head node; object spilling is directed at the
# NVMe mounts created in setup_commands.
head_start_ray_commands:
- ray stop -f
- ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config ~/ray_bootstrap_config.yaml --system-config='{"object_spilling_config":"{\"type\":\"filesystem\",\"params\":{\"directory_path\":[\"/data0/spill\",\"/data1/spill\",\"/data2/spill\",\"/data3/spill\"]}}","plasma_unlimited":true}'
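# For readability, the --system-config JSON embedded above is equivalent to:
#   object_spilling_config:
#     type: filesystem
#     params:
#       directory_path: ["/data0/spill", "/data1/spill", "/data2/spill", "/data3/spill"]
#   plasma_unlimited: true
# so objects that overflow the object store are spilled to the NVMe mounts.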
# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
- ray stop -f
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
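# Usage sketch (assumption: {num_workers} is filled in by plain string
# substitution before the file is handed to the Ray cluster launcher):
#   sed 's/{num_workers}/32/g' cluster.yaml.template > cluster.yaml
#   ray up -y cluster.yaml       # launch the head node and workers
#   ray attach cluster.yaml      # ssh into the head node
#   ray down -y cluster.yaml     # tear the cluster down when done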