update dockerfile and examples to v1alpha2 (#130)
rongou authored and k8s-ci-robot committed Jul 9, 2019
1 parent 6f627a8 commit 635e145
Showing 13 changed files with 174 additions and 89 deletions.
4 changes: 2 additions & 2 deletions Dockerfile
@@ -1,8 +1,8 @@
FROM golang:1.12.6-alpine3.10 AS build
FROM golang:1.12.7-alpine3.10 AS build

WORKDIR /go/src/github.com/kubeflow/mpi-operator/
COPY . /go/src/github.com/kubeflow/mpi-operator/
RUN go build -o /bin/mpi-operator github.com/kubeflow/mpi-operator/cmd/mpi-operator.v1alpha1
RUN go build -o /bin/mpi-operator github.com/kubeflow/mpi-operator/cmd/mpi-operator.v1alpha2

FROM alpine:3.10
COPY --from=build /bin/mpi-operator /bin/mpi-operator
142 changes: 100 additions & 42 deletions README.md
@@ -39,66 +39,127 @@ ks apply ${ENVIRONMENT} -c mpi-operator
Alternatively, you can deploy the operator with default settings without using ksonnet by running the following from the repo:

```shell
kubectl create -f deploy/crd/crd-v1alpha2.yaml
kubectl create -f deploy/
```
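
If you want to confirm that the deployment worked before creating any jobs, two quick checks are enough. This is a sketch: the CRD name follows from the `kubeflow.org` group and the `mpijobs` plural, while the operator pod's name and namespace depend on `deploy/3-mpi-operator.yaml` in your checkout.

```shell
# The v1alpha2 CRD should be registered under the kubeflow.org group.
kubectl get crd mpijobs.kubeflow.org

# The operator pod should be running; its name and namespace come from deploy/,
# so grep rather than assuming a specific namespace.
kubectl get pods --all-namespaces | grep mpi-operator
```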

## Creating an MPI Job

You can create an MPI job by defining an `MPIJob` config file. See the [TensorFlow benchmark example](https://github.com/kubeflow/mpi-operator/blob/master/examples/tensorflow-benchmarks.yaml) config file for launching a multi-node TensorFlow benchmark training job. You may change the config file based on your requirements.
You can create an MPI job by defining an `MPIJob` config file. See the [TensorFlow benchmark example](https://github.com/kubeflow/mpi-operator/blob/master/examples/v1alpha2/tensorflow-benchmarks.yaml) config file for launching a multi-node TensorFlow benchmark training job. You may change the config file based on your requirements.

```
cat examples/tensorflow-benchmarks.yaml
cat examples/v1alpha2/tensorflow-benchmarks.yaml
```
Deploy the `MPIJob` resource to start training:

```
kubectl create -f examples/tensorflow-benchmarks.yaml
kubectl create -f examples/v1alpha2/tensorflow-benchmarks.yaml
```
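
The operator will now create the launcher and worker pods for this job. To watch them come up, you can filter on the job-name label that also appears in the log-access example below (a sketch, assuming all of the job's pods carry the `mpi_job_name` label):

```shell
# Lists the launcher and worker pods belonging to this MPIJob.
kubectl get pods -l mpi_job_name=tensorflow-benchmarks --watch
```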

## Monitoring an MPI Job

Once the `MPIJob` resource is created, you should now be able to see the created pods matching the specified number of GPUs. You can also monitor the job status from the status section. Here is sample output when the job is successfully completed.

```
kubectl get -o yaml mpijobs tensorflow-benchmarks-16
kubectl get -o yaml mpijobs tensorflow-benchmarks
```

```
apiVersion: kubeflow.org/v1alpha1
apiVersion: kubeflow.org/v1alpha2
kind: MPIJob
metadata:
clusterName: ""
creationTimestamp: 2019-01-07T20:32:12Z
creationTimestamp: "2019-07-09T22:15:51Z"
generation: 1
name: tensorflow-benchmarks-16
name: tensorflow-benchmarks
namespace: default
resourceVersion: "185051397"
selfLink: /apis/kubeflow.org/v1alpha1/namespaces/default/mpijobs/tensorflow-benchmarks-16
uid: 8dc8c044-127d-11e9-a419-02420bbe29f3
resourceVersion: "5645868"
selfLink: /apis/kubeflow.org/v1alpha2/namespaces/default/mpijobs/tensorflow-benchmarks
uid: 1c5b470f-a297-11e9-964d-88d7f67c6e6d
spec:
gpus: 16
template:
metadata:
creationTimestamp: null
spec:
containers:
- image: mpioperator/tensorflow-benchmarks:latest
name: tensorflow-benchmarks
resources: {}
cleanPodPolicy: Running
mpiReplicaSpecs:
Launcher:
replicas: 1
template:
spec:
containers:
- command:
- mpirun
- --allow-run-as-root
- -np
- "2"
- -bind-to
- none
- -map-by
- slot
- -x
- NCCL_DEBUG=INFO
- -x
- LD_LIBRARY_PATH
- -x
- PATH
- -mca
- pml
- ob1
- -mca
- btl
- ^openib
- python
- scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py
- --model=resnet101
- --batch_size=64
- --variable_update=horovod
image: mpioperator/tensorflow-benchmarks:latest
name: tensorflow-benchmarks
Worker:
replicas: 1
template:
spec:
containers:
- image: mpioperator/tensorflow-benchmarks:latest
name: tensorflow-benchmarks
resources:
limits:
nvidia.com/gpu: 2
slotsPerWorker: 2
status:
launcherStatus: Succeeded
completionTime: "2019-07-09T22:17:06Z"
conditions:
- lastTransitionTime: "2019-07-09T22:15:51Z"
lastUpdateTime: "2019-07-09T22:15:51Z"
message: MPIJob default/tensorflow-benchmarks is created.
reason: MPIJobCreated
status: "True"
type: Created
- lastTransitionTime: "2019-07-09T22:15:54Z"
lastUpdateTime: "2019-07-09T22:15:54Z"
message: MPIJob default/tensorflow-benchmarks is running.
reason: MPIJobRunning
status: "False"
type: Running
- lastTransitionTime: "2019-07-09T22:17:06Z"
lastUpdateTime: "2019-07-09T22:17:06Z"
message: MPIJob default/tensorflow-benchmarks successfully completed.
reason: MPIJobSucceeded
status: "True"
type: Succeeded
replicaStatuses:
Launcher:
succeeded: 1
Worker: {}
startTime: "2019-07-09T22:15:51Z"
```
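
If you only need the terminal state rather than the whole object, a jsonpath query against the `conditions` list shown above does the job (sketch):

```shell
# Prints "True" once the Succeeded condition from the status block above is set.
kubectl get mpijobs tensorflow-benchmarks \
  -o jsonpath='{.status.conditions[?(@.type=="Succeeded")].status}'
```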


Training should run for 100 steps and take a few minutes on a GPU cluster. You can inspect the logs to see the training progress. When the job starts, access the logs from the `launcher` pod:

```
PODNAME=$(kubectl get pods -l mpi_job_name=tensorflow-benchmarks-16,mpi_role_type=launcher -o name)
PODNAME=$(kubectl get pods -l mpi_job_name=tensorflow-benchmarks,mpi_role_type=launcher -o name)
kubectl logs -f ${PODNAME}
```

```
TensorFlow: 1.10
TensorFlow: 1.14
Model: resnet101
Dataset: imagenet (synthetic)
Mode: training
@@ -108,32 +169,29 @@ Batch size: 128 global
Num batches: 100
Num epochs: 0.01
Devices: ['horovod/gpu:0', 'horovod/gpu:1']
NUMA bind: False
Data format: NCHW
Optimizer: sgd
Variables: horovod
...
40 images/sec: 132.1 +/- 0.0 (jitter = 0.2) 9.146
40 images/sec: 132.1 +/- 0.0 (jitter = 0.1) 9.182
50 images/sec: 132.1 +/- 0.0 (jitter = 0.2) 9.071
50 images/sec: 132.1 +/- 0.0 (jitter = 0.2) 9.210
60 images/sec: 132.2 +/- 0.0 (jitter = 0.2) 9.180
60 images/sec: 132.2 +/- 0.0 (jitter = 0.2) 9.055
70 images/sec: 132.1 +/- 0.0 (jitter = 0.2) 9.005
70 images/sec: 132.1 +/- 0.0 (jitter = 0.2) 9.096
80 images/sec: 132.1 +/- 0.0 (jitter = 0.2) 9.231
80 images/sec: 132.1 +/- 0.0 (jitter = 0.2) 9.197
90 images/sec: 132.1 +/- 0.0 (jitter = 0.2) 9.201
90 images/sec: 132.1 +/- 0.0 (jitter = 0.2) 9.089
100 images/sec: 132.1 +/- 0.0 (jitter = 0.2) 9.183
----------------------------------------------------------------
total images/sec: 264.26
----------------------------------------------------------------
100 images/sec: 132.1 +/- 0.0 (jitter = 0.2) 9.044
----------------------------------------------------------------
total images/sec: 264.26
40 images/sec: 154.4 +/- 0.7 (jitter = 4.0) 8.280
40 images/sec: 154.4 +/- 0.7 (jitter = 4.1) 8.482
50 images/sec: 154.8 +/- 0.6 (jitter = 4.0) 8.397
50 images/sec: 154.8 +/- 0.6 (jitter = 4.2) 8.450
60 images/sec: 154.5 +/- 0.5 (jitter = 4.1) 8.321
60 images/sec: 154.5 +/- 0.5 (jitter = 4.4) 8.349
70 images/sec: 154.5 +/- 0.5 (jitter = 4.0) 8.433
70 images/sec: 154.5 +/- 0.5 (jitter = 4.4) 8.430
80 images/sec: 154.8 +/- 0.4 (jitter = 3.6) 8.199
80 images/sec: 154.8 +/- 0.4 (jitter = 3.8) 8.404
90 images/sec: 154.6 +/- 0.4 (jitter = 3.7) 8.418
90 images/sec: 154.6 +/- 0.4 (jitter = 3.6) 8.459
100 images/sec: 154.2 +/- 0.4 (jitter = 4.0) 8.372
100 images/sec: 154.2 +/- 0.4 (jitter = 4.0) 8.542
----------------------------------------------------------------
total images/sec: 308.27
```
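
Once you are done with the run, delete the `MPIJob` the same way it was created:

```shell
kubectl delete -f examples/v1alpha2/tensorflow-benchmarks.yaml
```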

# Docker Images
6 changes: 3 additions & 3 deletions cmd/kubectl-delivery/Dockerfile
@@ -1,13 +1,13 @@
FROM alpine:3.8 AS build
FROM alpine:3.10 AS build

# Install kubectl.
ENV K8S_VERSION v1.13.2
ENV K8S_VERSION v1.15.0
RUN apk add --no-cache wget
RUN wget -q https://storage.googleapis.com/kubernetes-release/release/${K8S_VERSION}/bin/linux/amd64/kubectl
RUN chmod +x ./kubectl
RUN mv ./kubectl /bin/kubectl

FROM alpine:3.8
FROM alpine:3.10
COPY --from=build /bin/kubectl /bin/kubectl
COPY deliver_kubectl.sh .
ENTRYPOINT ["./deliver_kubectl.sh"]
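
If you need to rebuild this helper image locally, the build context is just this directory; the tag below mirrors the one referenced by `deploy/3-mpi-operator.yaml` and is otherwise arbitrary (sketch):

```shell
docker build -t mpioperator/kubectl-delivery:latest cmd/kubectl-delivery
```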
4 changes: 2 additions & 2 deletions cmd/mpi-operator.v1alpha2/app/options/options.go
@@ -50,8 +50,8 @@ func (s *ServerOption) AddFlags(fs *flag.FlagSet) {
"The container image used to deliver the kubectl binary.")

fs.StringVar(&s.Namespace, "namespace", v1.NamespaceAll,
`The namespace to monitor tfjobs. If unset, it monitors all namespaces cluster-wide.
If set, it only monitors tfjobs in the given namespace.`)
`The namespace to monitor mpijobs. If unset, it monitors all namespaces cluster-wide.
If set, it only monitors mpijobs in the given namespace.`)

fs.IntVar(&s.Threadiness, "threadiness", 2,
`How many threads to process the main logic`)
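
Combining the flags defined in this file with the args already passed in `deploy/3-mpi-operator.yaml`, a local run of the operator might look like the sketch below. The binary name, the `default` namespace, and the threadiness value are illustrative; omitting `--namespace` keeps the cluster-wide default (`v1.NamespaceAll`).

```shell
# Built via: go build -o mpi-operator github.com/kubeflow/mpi-operator/cmd/mpi-operator.v1alpha2
./mpi-operator \
  -alsologtostderr \
  --namespace default \
  --threadiness 4 \
  --kubectl-delivery-image mpioperator/kubectl-delivery:latest
```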
9 changes: 9 additions & 0 deletions deploy/2-rbac.yaml
@@ -26,6 +26,14 @@ rules:
- pods/exec
verbs:
- create
- apiGroups:
- ""
resources:
- endpoints
verbs:
- create
- get
- update
- apiGroups:
- ""
resources:
@@ -80,6 +88,7 @@ rules:
- kubeflow.org
resources:
- mpijobs
- mpijobs/status
verbs:
- "*"
---
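
Once the role is bound, the new permissions can be spot-checked with `kubectl auth can-i`. The `default:mpi-operator` namespace/service-account pair below is an assumption; substitute whatever `deploy/2-rbac.yaml` actually binds.

```shell
# Both should print "yes" for the operator's service account.
kubectl auth can-i create endpoints --as=system:serviceaccount:default:mpi-operator
kubectl auth can-i update mpijobs/status --as=system:serviceaccount:default:mpi-operator
```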
1 change: 0 additions & 1 deletion deploy/3-mpi-operator.yaml
@@ -21,7 +21,6 @@ spec:
image: mpioperator/mpi-operator:latest
args: [
"-alsologtostderr",
"--gpus-per-node", "8",
"--kubectl-delivery-image",
"mpioperator/kubectl-delivery:latest"
]
14 changes: 3 additions & 11 deletions examples/tensorflow-benchmarks/Dockerfile
@@ -1,16 +1,8 @@
FROM uber/horovod:0.15.2-tf1.12.0-torch1.0.0-py2.7

# Temporary fix until Horovod pushes out a new release.
# See https://github.com/uber/horovod/pull/700
RUN sed -i '/^NCCL_SOCKET_IFNAME.*/d' /etc/nccl.conf
FROM horovod/horovod:0.16.4-tf1.14.0-torch1.1.0-mxnet1.4.1-py3.6

RUN mkdir /tensorflow
WORKDIR "/tensorflow"
RUN git clone -b cnn_tf_v1.12_compatible https://github.com/tensorflow/benchmarks
RUN git clone https://github.com/tensorflow/benchmarks
WORKDIR "/tensorflow/benchmarks"

CMD mpirun \
python scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py \
--model resnet101 \
--batch_size 64 \
--variable_update horovod
CMD ["/bin/bash"]
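
With the benchmark command moved out of the image's `CMD` and into the example manifests, the image only needs the cloned benchmarks repo; rebuilding it locally is a one-liner (sketch; the tag matches the one used by the examples):

```shell
docker build -t mpioperator/tensorflow-benchmarks:latest examples/tensorflow-benchmarks
```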
File renamed without changes.
File renamed without changes.
26 changes: 0 additions & 26 deletions examples/v1alpha2/tensorflow-benchmarks-custom.yaml

This file was deleted.

24 changes: 24 additions & 0 deletions examples/v1alpha2/tensorflow-benchmarks-imagenet.yaml
@@ -8,6 +8,8 @@ kind: MPIJob
metadata:
name: tensorflow-benchmarks-imagenet
spec:
slotsPerWorker: 8
cleanPodPolicy: Running
mpiReplicaSpecs:
Launcher:
replicas: 1
@@ -18,6 +20,25 @@ spec:
name: tensorflow-benchmarks
command:
- mpirun
- --allow-run-as-root
- -np
- "16"
- -bind-to
- none
- -map-by
- slot
- -x
- NCCL_DEBUG=INFO
- -x
- LD_LIBRARY_PATH
- -x
- PATH
- -mca
- pml
- ob1
- -mca
- btl
- ^openib
- python
- scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py
- --data_format=NCHW
@@ -39,6 +60,9 @@ spec:
containers:
- image: mpioperator/tensorflow-benchmarks:latest
name: tensorflow-benchmarks
resources:
limits:
nvidia.com/gpu: 8
volumeMounts:
- mountPath: /efs
name: efs
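
Submitting this example follows the same pattern as the smaller benchmark job. With `slotsPerWorker: 8`, `-np 16`, and an 8-GPU limit per worker, it assumes two 8-GPU workers plus the `/efs` volume mounted above (the worker replica count is inferred here, not shown in this excerpt):

```shell
kubectl create -f examples/v1alpha2/tensorflow-benchmarks-imagenet.yaml
```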