-
Notifications
You must be signed in to change notification settings - Fork 27
/
Copy path: multi_node_gpu.yaml
60 lines (58 loc) · 1.36 KB
/
multi_node_gpu.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
---
# Training configuration for ResNet trained on ImageNet on GPUs.
# Reaches > 76.1% within 90 epochs.
# Note: This configuration uses a scaled per-replica batch size based on the number of devices.

# Distributed-runtime settings: four workers of 8 GPUs each, synchronized
# with MultiWorkerMirroredStrategy over NCCL all-reduce.
runtime:
  distribution_strategy: 'multi_worker_mirrored'
  worker_hosts: '10.11.0.2:11111,10.11.0.3:11111,10.11.0.4:11111,10.11.0.5:11111'
  num_gpus: 8
  # Index of this worker within worker_hosts; set per machine (0-3).
  task_index: 0
  all_reduce_alg: 'nccl'
  batchnorm_spatial_persistent: false

# Training data: ImageNet (ILSVRC 2012) read from TFRecords.
# batch_size is per replica (see use_per_replica_batch_size).
train_dataset:
  name: 'imagenet2012'
  # null: data_dir must be supplied at launch time.
  data_dir: null
  builder: 'records'
  split: 'train'
  image_size: 224
  num_classes: 1000
  num_examples: 640512
  batch_size: 128
  use_per_replica_batch_size: true
  dtype: 'float32'
  mean_subtract: true
  standardize: true

# Validation data: same preprocessing as training, validation split.
validation_dataset:
  name: 'imagenet2012'
  data_dir: null
  builder: 'records'
  split: 'validation'
  image_size: 224
  num_classes: 1000
  num_examples: 50000
  batch_size: 128
  use_per_replica_batch_size: true
  dtype: 'float32'
  mean_subtract: true
  standardize: true

# Model, optimizer, LR schedule, and loss.
model:
  name: 'resnet'
  model_params:
    rescale_inputs: false
  optimizer:
    name: 'momentum'
    momentum: 0.875
    decay: 0.9
    epsilon: 0.001
  learning_rate:
    initial_lr: 1.024
    name: 'piecewise_constant_with_warmup'
    examples_per_epoch: 640512
    warmup_epochs: 1
  loss:
    label_smoothing: 0.1

# Training loop controls.
# NOTE(review): the header claims >76.1% within 90 epochs, but epochs/steps
# here (1 epoch, 600 steps) look like a short smoke-test override — confirm
# the intended run length before a full training job.
train:
  resume_checkpoint: false
  epochs: 1
  steps: 600
evaluation:
  epochs_between_evals: 10