-
Notifications
You must be signed in to change notification settings - Fork 27
/
Copy path: multi_node_gpu.yaml
60 lines (58 loc) · 1.36 KB
/
multi_node_gpu.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
---
# Training configuration for ResNet trained on ImageNet on GPUs.
# Reaches > 76.1% within 90 epochs.
# Note: This configuration uses a scaled per-replica batch size based on the number of devices.

# Distributed-runtime settings: four workers of 8 GPUs each, synchronized
# with MultiWorkerMirroredStrategy over NCCL all-reduce.
runtime:
  distribution_strategy: 'multi_worker_mirrored'
  worker_hosts: '10.11.0.2:11111,10.11.0.3:11111,10.11.0.4:11111,10.11.0.5:11111'
  num_gpus: 8
  # Index of this worker within worker_hosts; set per machine (0-3).
  task_index: 0
  all_reduce_alg: 'nccl'
  batchnorm_spatial_persistent: false

# Training data: ImageNet (ILSVRC 2012) read from TFRecords.
# batch_size is per replica (see use_per_replica_batch_size).
train_dataset:
  name: 'imagenet2012'
  # null: data_dir must be supplied at launch time.
  data_dir: null
  builder: 'records'
  split: 'train'
  image_size: 224
  num_classes: 1000
  num_examples: 640512
  batch_size: 128
  use_per_replica_batch_size: true
  dtype: 'float32'
  mean_subtract: true
  standardize: true

# Validation data: same preprocessing as training, validation split.
validation_dataset:
  name: 'imagenet2012'
  data_dir: null
  builder: 'records'
  split: 'validation'
  image_size: 224
  num_classes: 1000
  num_examples: 50000
  batch_size: 128
  use_per_replica_batch_size: true
  dtype: 'float32'
  mean_subtract: true
  standardize: true

# Model, optimizer, LR schedule, and loss.
model:
  name: 'resnet'
  model_params:
    rescale_inputs: false
  optimizer:
    name: 'momentum'
    momentum: 0.875
    decay: 0.9
    epsilon: 0.001
  learning_rate:
    initial_lr: 1.024
    name: 'piecewise_constant_with_warmup'
    examples_per_epoch: 640512
    warmup_epochs: 1
  loss:
    label_smoothing: 0.1

# Training loop controls.
# NOTE(review): the header claims >76.1% within 90 epochs, but epochs/steps
# here (1 epoch, 600 steps) look like a short smoke-test override — confirm
# the intended run length before a full training job.
train:
  resume_checkpoint: false
  epochs: 1
  steps: 600
evaluation:
  epochs_between_evals: 10