I1211 00:33:29.538427 1 main.go:69] Loading controller config from /etc/config/controller_config_file.yaml. I1211 00:33:29.539012 1 main.go:81] ControllerConfig: { "Accelerators": { "alpha.kubernetes.io/nvidia-gpu": { "Volumes": [ { "Name": "nvidia-libraries", "HostPath": "/home/kubernetes/bin/nvidia/lib", "MountPath": "/usr/local/nvidia/lib64" }, { "Name": "nvidia-debug-tools", "HostPath": "/home/kubernetes/bin/nvidia/bin", "MountPath": "/usr/local/bin/nvidia" } ], "EnvVars": null } }, "GrpcServerFilePath": "/opt/mlkube/grpc_tensorflow_server/grpc_tensorflow_server.py" } I1211 00:33:29.539679 1 main.go:113] tf_operator Version: 0.3.0+git I1211 00:33:29.539696 1 main.go:114] Git SHA: Not provided. I1211 00:33:29.539715 1 main.go:115] Go Version: go1.8.2 I1211 00:33:29.539718 1 main.go:116] Go OS/Arch: linux/amd64 E1211 00:33:29.541912 1 election.go:226] error retrieving resource lock default/tf-operator: Get https://10.27.240.1:443/api/v1/namespaces/default/endpoints/tf-operator: dial tcp 10.27.240.1:443: getsockopt: connection refused E1211 00:34:04.719773 1 election.go:226] error retrieving resource lock default/tf-operator: Get https://10.27.240.1:443/api/v1/namespaces/default/endpoints/tf-operator: dial tcp 10.27.240.1:443: i/o timeout E1211 00:34:26.129687 1 election.go:226] error retrieving resource lock default/tf-operator: Get https://10.27.240.1:443/api/v1/namespaces/default/endpoints/tf-operator: dial tcp 10.27.240.1:443: getsockopt: connection refused E1211 00:34:31.523776 1 election.go:226] error retrieving resource lock default/tf-operator: Get https://10.27.240.1:443/api/v1/namespaces/default/endpoints/tf-operator: dial tcp 10.27.240.1:443: getsockopt: connection refused E1211 00:34:36.100788 1 election.go:226] error retrieving resource lock default/tf-operator: Get https://10.27.240.1:443/api/v1/namespaces/default/endpoints/tf-operator: dial tcp 10.27.240.1:443: getsockopt: connection refused E1211 00:34:40.630832 1 election.go:226] error retrieving resource lock default/tf-operator: Get https://10.27.240.1:443/api/v1/namespaces/default/endpoints/tf-operator: dial tcp 10.27.240.1:443: getsockopt: connection refused E1211 00:34:46.104993 1 election.go:226] error retrieving resource lock default/tf-operator: Get https://10.27.240.1:443/api/v1/namespaces/default/endpoints/tf-operator: dial tcp 10.27.240.1:443: getsockopt: connection refused E1211 00:34:49.342716 1 election.go:226] error retrieving resource lock default/tf-operator: Get https://10.27.240.1:443/api/v1/namespaces/default/endpoints/tf-operator: dial tcp 10.27.240.1:443: getsockopt: connection refused I1211 00:34:52.983025 1 election.go:186] successfully acquired lease default/tf-operator I1211 00:34:53.498879 1 controller.go:173] finding existing jobs... I1211 00:34:53.507131 1 controller.go:89] Starting watch at version %v3124039 I1211 00:34:53.507147 1 controller.go:98] starts running from watch version: 3124039 I1211 00:34:53.507207 1 training.go:414] start running... I1211 00:34:53.508168 1 controller.go:312] start watching at 3124039 I1211 00:35:01.507536 1 replicas.go:176] Creating Service: train-dv-master-zbax-0 I1211 00:35:01.524084 1 replicas.go:182] Service train-dv-master-zbax-0 already exists. I1211 00:35:01.524251 1 replicas.go:257] Creating Job: train-dv-master-zbax-0 I1211 00:35:01.599059 1 replicas.go:263] train-dv-master-zbax-0 already exists. I1211 00:35:01.599085 1 replicas.go:176] Creating Service: train-dv-worker-zbax-0 I1211 00:35:01.635674 1 replicas.go:182] Service train-dv-worker-zbax-0 already exists. I1211 00:35:01.635742 1 replicas.go:257] Creating Job: train-dv-worker-zbax-0 I1211 00:35:01.639108 1 replicas.go:263] train-dv-worker-zbax-0 already exists. I1211 00:35:01.639132 1 replicas.go:176] Creating Service: train-dv-ps-zbax-0 I1211 00:35:01.648081 1 replicas.go:182] Service train-dv-ps-zbax-0 already exists. I1211 00:35:01.648128 1 replicas.go:257] Creating Job: train-dv-ps-zbax-0 I1211 00:35:01.651103 1 replicas.go:263] train-dv-ps-zbax-0 already exists. I1211 00:35:01.651121 1 tensorboard.go:72] Creating Service: train-dv-tensorboard-zbax I1211 00:35:01.660007 1 tensorboard.go:78] Service train-dv-tensorboard-zbax already exists. I1211 00:35:01.660033 1 tensorboard.go:101] Creating Deployment: train-dv-tensorboard-zbax I1211 00:35:01.670346 1 tensorboard.go:106] train-dv-tensorboard-zbax already exists. I1211 00:35:01.676943 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124063" }, "items": [] } I1211 00:35:01.711694 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124063" }, "items": [] } I1211 00:35:01.716911 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124063" }, "items": [] } I1211 00:35:01.716953 1 training.go:477] Job train-dv status={ "phase": "Running", "reason": "", "controlPaused": false, "conditions": null, "state": "", "replicaStatuses": [ { "tf_replica_type": "MASTER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "WORKER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "PS", "state": "Running", "ReplicasStates": { "Running": 1 } } ] } I1211 00:35:09.717126 1 replicas.go:176] Creating Service: train-dv-master-zbax-0 I1211 00:35:09.724907 1 replicas.go:182] Service train-dv-master-zbax-0 already exists. I1211 00:35:09.724977 1 replicas.go:257] Creating Job: train-dv-master-zbax-0 I1211 00:35:09.745889 1 replicas.go:263] train-dv-master-zbax-0 already exists. I1211 00:35:09.745933 1 replicas.go:176] Creating Service: train-dv-worker-zbax-0 I1211 00:35:09.751075 1 replicas.go:182] Service train-dv-worker-zbax-0 already exists. I1211 00:35:09.751174 1 replicas.go:257] Creating Job: train-dv-worker-zbax-0 I1211 00:35:09.772035 1 replicas.go:263] train-dv-worker-zbax-0 already exists. I1211 00:35:09.772057 1 replicas.go:176] Creating Service: train-dv-ps-zbax-0 I1211 00:35:09.779265 1 replicas.go:182] Service train-dv-ps-zbax-0 already exists. I1211 00:35:09.779338 1 replicas.go:257] Creating Job: train-dv-ps-zbax-0 I1211 00:35:09.783042 1 replicas.go:263] train-dv-ps-zbax-0 already exists. I1211 00:35:09.783063 1 tensorboard.go:72] Creating Service: train-dv-tensorboard-zbax I1211 00:35:09.790986 1 tensorboard.go:78] Service train-dv-tensorboard-zbax already exists. I1211 00:35:09.791030 1 tensorboard.go:101] Creating Deployment: train-dv-tensorboard-zbax I1211 00:35:09.794353 1 tensorboard.go:106] train-dv-tensorboard-zbax already exists. I1211 00:35:09.800631 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124101" }, "items": [ { "metadata": { "name": "train-dv-master-zbax-0-wtd7b", "generateName": "train-dv-master-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-master-zbax-0-wtd7b", "uid": "22be3644-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124088", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "761cd4fe-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-master-zbax-0", "job_type": "MASTER", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-master-zbax-0\",\"uid\":\"761cd4fe-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123873\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-master-zbax-0", "uid": "761cd4fe-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"master\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-cgjd", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:08Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.48", "podIP": "10.24.1.46", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T00:35:07Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://983b909d56f9a202a40edf72dc738e7c4296f471031f7b2414836d8419f16709" } ], "qosClass": "Burstable" } } ] } I1211 00:35:09.807269 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124101" }, "items": [ { "metadata": { "name": "train-dv-worker-zbax-0-bq6bx", "generateName": "train-dv-worker-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-worker-zbax-0-bq6bx", "uid": "22be7857-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124084", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "7629ceca-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-worker-zbax-0", "job_type": "WORKER", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-worker-zbax-0\",\"uid\":\"7629ceca-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123872\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-worker-zbax-0", "uid": "7629ceca-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"worker\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-fqf1", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Pending", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "False", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z", "reason": "ContainersNotReady", "message": "containers with unready status: [tensorflow]" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.24", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "waiting": { "reason": "ContainerCreating" } }, "lastState": {}, "ready": false, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "" } ], "qosClass": "Burstable" } } ] } I1211 00:35:09.844011 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124101" }, "items": [ { "metadata": { "name": "train-dv-ps-zbax-0-xc7gx", "generateName": "train-dv-ps-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-ps-zbax-0-xc7gx", "uid": "22be98f0-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124083", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "762de417-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-ps-zbax-0", "job_type": "PS", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-ps-zbax-0\",\"uid\":\"762de417-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123875\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-ps-zbax-0", "uid": "762de417-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"ps\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "requests": { "cpu": "100m" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-sgb3", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Pending", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "False", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z", "reason": "ContainersNotReady", "message": "containers with unready status: [tensorflow]" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.46", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "waiting": { "reason": "ContainerCreating" } }, "lastState": {}, "ready": false, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "imageID": "" } ], "qosClass": "Burstable" } } ] } I1211 00:35:09.844074 1 training.go:477] Job train-dv status={ "phase": "Running", "reason": "", "controlPaused": false, "conditions": null, "state": "", "replicaStatuses": [ { "tf_replica_type": "MASTER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "WORKER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "PS", "state": "Running", "ReplicasStates": { "Running": 1 } } ] } I1211 00:35:17.912723 1 replicas.go:176] Creating Service: train-dv-master-zbax-0 I1211 00:35:19.082398 1 replicas.go:182] Service train-dv-master-zbax-0 already exists. I1211 00:35:19.130863 1 replicas.go:257] Creating Job: train-dv-master-zbax-0 I1211 00:35:19.150122 1 replicas.go:263] train-dv-master-zbax-0 already exists. I1211 00:35:19.150146 1 replicas.go:176] Creating Service: train-dv-worker-zbax-0 I1211 00:35:19.159304 1 replicas.go:182] Service train-dv-worker-zbax-0 already exists. I1211 00:35:19.159366 1 replicas.go:257] Creating Job: train-dv-worker-zbax-0 I1211 00:35:19.162323 1 replicas.go:263] train-dv-worker-zbax-0 already exists. I1211 00:35:19.162341 1 replicas.go:176] Creating Service: train-dv-ps-zbax-0 I1211 00:35:19.170137 1 replicas.go:182] Service train-dv-ps-zbax-0 already exists. I1211 00:35:19.170187 1 replicas.go:257] Creating Job: train-dv-ps-zbax-0 I1211 00:35:19.173242 1 replicas.go:263] train-dv-ps-zbax-0 already exists. I1211 00:35:19.173256 1 tensorboard.go:72] Creating Service: train-dv-tensorboard-zbax I1211 00:35:19.183850 1 tensorboard.go:78] Service train-dv-tensorboard-zbax already exists. I1211 00:35:19.183878 1 tensorboard.go:101] Creating Deployment: train-dv-tensorboard-zbax I1211 00:35:19.220620 1 tensorboard.go:106] train-dv-tensorboard-zbax already exists. I1211 00:35:19.400633 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124126" }, "items": [ { "metadata": { "name": "train-dv-master-zbax-0-wtd7b", "generateName": "train-dv-master-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-master-zbax-0-wtd7b", "uid": "22be3644-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124088", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "761cd4fe-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-master-zbax-0", "job_type": "MASTER", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-master-zbax-0\",\"uid\":\"761cd4fe-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123873\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-master-zbax-0", "uid": "761cd4fe-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"master\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-cgjd", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:08Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.48", "podIP": "10.24.1.46", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T00:35:07Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://983b909d56f9a202a40edf72dc738e7c4296f471031f7b2414836d8419f16709" } ], "qosClass": "Burstable" } } ] } I1211 00:35:19.447799 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124126" }, "items": [ { "metadata": { "name": "train-dv-worker-zbax-0-bq6bx", "generateName": "train-dv-worker-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-worker-zbax-0-bq6bx", "uid": "22be7857-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124084", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "7629ceca-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-worker-zbax-0", "job_type": "WORKER", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-worker-zbax-0\",\"uid\":\"7629ceca-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123872\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-worker-zbax-0", "uid": "7629ceca-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"worker\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-fqf1", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Pending", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "False", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z", "reason": "ContainersNotReady", "message": "containers with unready status: [tensorflow]" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.24", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "waiting": { "reason": "ContainerCreating" } }, "lastState": {}, "ready": false, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "" } ], "qosClass": "Burstable" } } ] } I1211 00:35:19.455617 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124126" }, "items": [ { "metadata": { "name": "train-dv-ps-zbax-0-xc7gx", "generateName": "train-dv-ps-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-ps-zbax-0-xc7gx", "uid": "22be98f0-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124083", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "762de417-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-ps-zbax-0", "job_type": "PS", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-ps-zbax-0\",\"uid\":\"762de417-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123875\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-ps-zbax-0", "uid": "762de417-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"ps\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "requests": { "cpu": "100m" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-sgb3", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Pending", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "False", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z", "reason": "ContainersNotReady", "message": "containers with unready status: [tensorflow]" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.46", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "waiting": { "reason": "ContainerCreating" } }, "lastState": {}, "ready": false, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "imageID": "" } ], "qosClass": "Burstable" } } ] } I1211 00:35:19.455695 1 training.go:477] Job train-dv status={ "phase": "Running", "reason": "", "controlPaused": false, "conditions": null, "state": "", "replicaStatuses": [ { "tf_replica_type": "MASTER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "WORKER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "PS", "state": "Running", "ReplicasStates": { "Running": 1 } } ] } I1211 00:35:27.455884 1 replicas.go:176] Creating Service: train-dv-master-zbax-0 I1211 00:35:27.461504 1 replicas.go:182] Service train-dv-master-zbax-0 already exists. I1211 00:35:27.461572 1 replicas.go:257] Creating Job: train-dv-master-zbax-0 I1211 00:35:27.464449 1 replicas.go:263] train-dv-master-zbax-0 already exists. I1211 00:35:27.464472 1 replicas.go:176] Creating Service: train-dv-worker-zbax-0 I1211 00:35:27.470145 1 replicas.go:182] Service train-dv-worker-zbax-0 already exists. I1211 00:35:27.470241 1 replicas.go:257] Creating Job: train-dv-worker-zbax-0 I1211 00:35:27.472676 1 replicas.go:263] train-dv-worker-zbax-0 already exists. I1211 00:35:27.472697 1 replicas.go:176] Creating Service: train-dv-ps-zbax-0 I1211 00:35:27.478258 1 replicas.go:182] Service train-dv-ps-zbax-0 already exists. I1211 00:35:27.478299 1 replicas.go:257] Creating Job: train-dv-ps-zbax-0 I1211 00:35:27.480742 1 replicas.go:263] train-dv-ps-zbax-0 already exists. I1211 00:35:27.480760 1 tensorboard.go:72] Creating Service: train-dv-tensorboard-zbax I1211 00:35:27.485132 1 tensorboard.go:78] Service train-dv-tensorboard-zbax already exists. I1211 00:35:27.485163 1 tensorboard.go:101] Creating Deployment: train-dv-tensorboard-zbax I1211 00:35:27.487797 1 tensorboard.go:106] train-dv-tensorboard-zbax already exists. I1211 00:35:27.493458 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124148" }, "items": [ { "metadata": { "name": "train-dv-master-zbax-0-wtd7b", "generateName": "train-dv-master-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-master-zbax-0-wtd7b", "uid": "22be3644-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124088", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "761cd4fe-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-master-zbax-0", "job_type": "MASTER", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-master-zbax-0\",\"uid\":\"761cd4fe-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123873\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-master-zbax-0", "uid": "761cd4fe-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"master\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-cgjd", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:08Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.48", "podIP": "10.24.1.46", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T00:35:07Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://983b909d56f9a202a40edf72dc738e7c4296f471031f7b2414836d8419f16709" } ], "qosClass": "Burstable" } } ] } I1211 00:35:27.498472 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124148" }, "items": [ { "metadata": { "name": "train-dv-worker-zbax-0-bq6bx", "generateName": "train-dv-worker-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-worker-zbax-0-bq6bx", "uid": "22be7857-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124084", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "7629ceca-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-worker-zbax-0", "job_type": "WORKER", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-worker-zbax-0\",\"uid\":\"7629ceca-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123872\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-worker-zbax-0", "uid": "7629ceca-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"worker\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-fqf1", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Pending", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "False", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z", "reason": "ContainersNotReady", "message": "containers with unready status: [tensorflow]" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.24", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "waiting": { "reason": "ContainerCreating" } }, "lastState": {}, "ready": false, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "" } ], "qosClass": "Burstable" } } ] } I1211 00:35:27.515638 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124148" }, "items": [ { "metadata": { "name": "train-dv-ps-zbax-0-xc7gx", "generateName": "train-dv-ps-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-ps-zbax-0-xc7gx", "uid": "22be98f0-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124083", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "762de417-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-ps-zbax-0", "job_type": "PS", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-ps-zbax-0\",\"uid\":\"762de417-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123875\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-ps-zbax-0", "uid": "762de417-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"ps\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "requests": { "cpu": "100m" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-sgb3", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Pending", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "False", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z", "reason": "ContainersNotReady", "message": "containers with unready status: [tensorflow]" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.46", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "waiting": { "reason": "ContainerCreating" } }, "lastState": {}, "ready": false, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "imageID": "" } ], "qosClass": "Burstable" } } ] } I1211 00:35:27.515689 1 training.go:477] Job train-dv status={ "phase": "Running", "reason": "", "controlPaused": false, "conditions": null, "state": "", "replicaStatuses": [ { "tf_replica_type": "MASTER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "WORKER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "PS", "state": "Running", "ReplicasStates": { "Running": 1 } } ] } I1211 00:35:35.515911 1 replicas.go:176] Creating Service: train-dv-master-zbax-0 I1211 00:35:35.523892 1 replicas.go:182] Service train-dv-master-zbax-0 already exists. I1211 00:35:35.523952 1 replicas.go:257] Creating Job: train-dv-master-zbax-0 I1211 00:35:35.526590 1 replicas.go:263] train-dv-master-zbax-0 already exists. I1211 00:35:35.526611 1 replicas.go:176] Creating Service: train-dv-worker-zbax-0 I1211 00:35:35.531971 1 replicas.go:182] Service train-dv-worker-zbax-0 already exists. I1211 00:35:35.532014 1 replicas.go:257] Creating Job: train-dv-worker-zbax-0 I1211 00:35:35.534563 1 replicas.go:263] train-dv-worker-zbax-0 already exists. I1211 00:35:35.534580 1 replicas.go:176] Creating Service: train-dv-ps-zbax-0 I1211 00:35:35.540041 1 replicas.go:182] Service train-dv-ps-zbax-0 already exists. I1211 00:35:35.540102 1 replicas.go:257] Creating Job: train-dv-ps-zbax-0 I1211 00:35:35.542613 1 replicas.go:263] train-dv-ps-zbax-0 already exists. I1211 00:35:35.542636 1 tensorboard.go:72] Creating Service: train-dv-tensorboard-zbax I1211 00:35:35.547131 1 tensorboard.go:78] Service train-dv-tensorboard-zbax already exists. I1211 00:35:35.547160 1 tensorboard.go:101] Creating Deployment: train-dv-tensorboard-zbax I1211 00:35:35.549842 1 tensorboard.go:106] train-dv-tensorboard-zbax already exists. I1211 00:35:35.639611 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124174" }, "items": [ { "metadata": { "name": "train-dv-master-zbax-0-wtd7b", "generateName": "train-dv-master-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-master-zbax-0-wtd7b", "uid": "22be3644-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124088", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "761cd4fe-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-master-zbax-0", "job_type": "MASTER", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-master-zbax-0\",\"uid\":\"761cd4fe-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123873\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-master-zbax-0", "uid": "761cd4fe-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"master\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-cgjd", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:08Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.48", "podIP": "10.24.1.46", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T00:35:07Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://983b909d56f9a202a40edf72dc738e7c4296f471031f7b2414836d8419f16709" } ], "qosClass": "Burstable" } } ] } I1211 00:35:35.645463 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124174" }, "items": [ { "metadata": { "name": "train-dv-worker-zbax-0-bq6bx", "generateName": "train-dv-worker-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-worker-zbax-0-bq6bx", "uid": "22be7857-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124084", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "7629ceca-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-worker-zbax-0", "job_type": "WORKER", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-worker-zbax-0\",\"uid\":\"7629ceca-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123872\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-worker-zbax-0", "uid": "7629ceca-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"worker\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-fqf1", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Pending", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "False", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z", "reason": "ContainersNotReady", "message": "containers with unready status: [tensorflow]" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.24", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "waiting": { "reason": "ContainerCreating" } }, "lastState": {}, "ready": false, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "" } ], "qosClass": "Burstable" } } ] } I1211 00:35:35.651060 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124174" }, "items": [ { "metadata": { "name": "train-dv-ps-zbax-0-xc7gx", "generateName": "train-dv-ps-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-ps-zbax-0-xc7gx", "uid": "22be98f0-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124157", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "762de417-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-ps-zbax-0", "job_type": "PS", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-ps-zbax-0\",\"uid\":\"762de417-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123875\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-ps-zbax-0", "uid": "762de417-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"ps\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "requests": { "cpu": "100m" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-sgb3", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:32Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.46", "podIP": "10.24.2.19", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T00:35:32Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant@sha256:72d3bd936dfbfbb707e648d7e6f0f8fb4318eb115aad0bfde9b43ff05fef8f19", "containerID": "docker://ccb669a02dc34c3a2f5e82611b2faa7c556c3485fccee96e5d99661ed276dc2a" } ], "qosClass": "Burstable" } } ] } I1211 00:35:35.651102 1 training.go:477] Job train-dv status={ "phase": "Running", "reason": "", "controlPaused": false, "conditions": null, "state": "", "replicaStatuses": [ { "tf_replica_type": "MASTER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "WORKER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "PS", "state": "Running", "ReplicasStates": { "Running": 1 } } ] } I1211 00:35:43.651258 1 replicas.go:176] Creating Service: train-dv-master-zbax-0 I1211 00:35:43.656451 1 replicas.go:182] Service train-dv-master-zbax-0 already exists. I1211 00:35:43.656506 1 replicas.go:257] Creating Job: train-dv-master-zbax-0 I1211 00:35:43.659073 1 replicas.go:263] train-dv-master-zbax-0 already exists. I1211 00:35:43.659095 1 replicas.go:176] Creating Service: train-dv-worker-zbax-0 I1211 00:35:43.663492 1 replicas.go:182] Service train-dv-worker-zbax-0 already exists. I1211 00:35:43.663538 1 replicas.go:257] Creating Job: train-dv-worker-zbax-0 I1211 00:35:43.666266 1 replicas.go:263] train-dv-worker-zbax-0 already exists. I1211 00:35:43.666287 1 replicas.go:176] Creating Service: train-dv-ps-zbax-0 I1211 00:35:43.670477 1 replicas.go:182] Service train-dv-ps-zbax-0 already exists. I1211 00:35:43.670528 1 replicas.go:257] Creating Job: train-dv-ps-zbax-0 I1211 00:35:43.672827 1 replicas.go:263] train-dv-ps-zbax-0 already exists. I1211 00:35:43.672848 1 tensorboard.go:72] Creating Service: train-dv-tensorboard-zbax I1211 00:35:43.678072 1 tensorboard.go:78] Service train-dv-tensorboard-zbax already exists. I1211 00:35:43.678099 1 tensorboard.go:101] Creating Deployment: train-dv-tensorboard-zbax I1211 00:35:43.680915 1 tensorboard.go:106] train-dv-tensorboard-zbax already exists. I1211 00:35:43.687253 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124198" }, "items": [ { "metadata": { "name": "train-dv-master-zbax-0-wtd7b", "generateName": "train-dv-master-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-master-zbax-0-wtd7b", "uid": "22be3644-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124088", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "761cd4fe-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-master-zbax-0", "job_type": "MASTER", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-master-zbax-0\",\"uid\":\"761cd4fe-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123873\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-master-zbax-0", "uid": "761cd4fe-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"master\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-cgjd", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:08Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.48", "podIP": "10.24.1.46", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T00:35:07Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://983b909d56f9a202a40edf72dc738e7c4296f471031f7b2414836d8419f16709" } ], "qosClass": "Burstable" } } ] } I1211 00:35:43.692538 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124198" }, "items": [ { "metadata": { "name": "train-dv-worker-zbax-0-bq6bx", "generateName": "train-dv-worker-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-worker-zbax-0-bq6bx", "uid": "22be7857-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124084", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "7629ceca-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-worker-zbax-0", "job_type": "WORKER", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-worker-zbax-0\",\"uid\":\"7629ceca-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123872\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-worker-zbax-0", "uid": "7629ceca-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"worker\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-fqf1", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Pending", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "False", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z", "reason": "ContainersNotReady", "message": "containers with unready status: [tensorflow]" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.24", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "waiting": { "reason": "ContainerCreating" } }, "lastState": {}, "ready": false, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "" } ], "qosClass": "Burstable" } } ] } I1211 00:35:43.698023 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124198" }, "items": [ { "metadata": { "name": "train-dv-ps-zbax-0-xc7gx", "generateName": "train-dv-ps-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-ps-zbax-0-xc7gx", "uid": "22be98f0-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124157", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "762de417-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-ps-zbax-0", "job_type": "PS", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-ps-zbax-0\",\"uid\":\"762de417-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123875\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-ps-zbax-0", "uid": "762de417-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"ps\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "requests": { "cpu": "100m" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-sgb3", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:32Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.46", "podIP": "10.24.2.19", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T00:35:32Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant@sha256:72d3bd936dfbfbb707e648d7e6f0f8fb4318eb115aad0bfde9b43ff05fef8f19", "containerID": "docker://ccb669a02dc34c3a2f5e82611b2faa7c556c3485fccee96e5d99661ed276dc2a" } ], "qosClass": "Burstable" } } ] } I1211 00:35:43.698073 1 training.go:477] Job train-dv status={ "phase": "Running", "reason": "", "controlPaused": false, "conditions": null, "state": "", "replicaStatuses": [ { "tf_replica_type": "MASTER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "WORKER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "PS", "state": "Running", "ReplicasStates": { "Running": 1 } } ] } I1211 00:35:51.698263 1 replicas.go:176] Creating Service: train-dv-master-zbax-0 I1211 00:35:51.705313 1 replicas.go:182] Service train-dv-master-zbax-0 already exists. I1211 00:35:51.705382 1 replicas.go:257] Creating Job: train-dv-master-zbax-0 I1211 00:35:51.708077 1 replicas.go:263] train-dv-master-zbax-0 already exists. I1211 00:35:51.708098 1 replicas.go:176] Creating Service: train-dv-worker-zbax-0 I1211 00:35:51.713605 1 replicas.go:182] Service train-dv-worker-zbax-0 already exists. I1211 00:35:51.713657 1 replicas.go:257] Creating Job: train-dv-worker-zbax-0 I1211 00:35:51.716054 1 replicas.go:263] train-dv-worker-zbax-0 already exists. I1211 00:35:51.716080 1 replicas.go:176] Creating Service: train-dv-ps-zbax-0 I1211 00:35:51.721406 1 replicas.go:182] Service train-dv-ps-zbax-0 already exists. I1211 00:35:51.721468 1 replicas.go:257] Creating Job: train-dv-ps-zbax-0 I1211 00:35:51.723872 1 replicas.go:263] train-dv-ps-zbax-0 already exists. I1211 00:35:51.723895 1 tensorboard.go:72] Creating Service: train-dv-tensorboard-zbax I1211 00:35:51.728768 1 tensorboard.go:78] Service train-dv-tensorboard-zbax already exists. I1211 00:35:51.728795 1 tensorboard.go:101] Creating Deployment: train-dv-tensorboard-zbax I1211 00:35:51.731275 1 tensorboard.go:106] train-dv-tensorboard-zbax already exists. I1211 00:35:51.736769 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124218" }, "items": [ { "metadata": { "name": "train-dv-master-zbax-0-wtd7b", "generateName": "train-dv-master-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-master-zbax-0-wtd7b", "uid": "22be3644-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124088", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "761cd4fe-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-master-zbax-0", "job_type": "MASTER", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-master-zbax-0\",\"uid\":\"761cd4fe-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123873\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-master-zbax-0", "uid": "761cd4fe-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"master\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-cgjd", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:08Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.48", "podIP": "10.24.1.46", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T00:35:07Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://983b909d56f9a202a40edf72dc738e7c4296f471031f7b2414836d8419f16709" } ], "qosClass": "Burstable" } } ] } I1211 00:35:51.742197 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124218" }, "items": [ { "metadata": { "name": "train-dv-worker-zbax-0-bq6bx", "generateName": "train-dv-worker-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-worker-zbax-0-bq6bx", "uid": "22be7857-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124084", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "7629ceca-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-worker-zbax-0", "job_type": "WORKER", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-worker-zbax-0\",\"uid\":\"7629ceca-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123872\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-worker-zbax-0", "uid": "7629ceca-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"worker\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-fqf1", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Pending", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "False", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z", "reason": "ContainersNotReady", "message": "containers with unready status: [tensorflow]" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.24", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "waiting": { "reason": "ContainerCreating" } }, "lastState": {}, "ready": false, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "" } ], "qosClass": "Burstable" } } ] } I1211 00:35:51.747196 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124218" }, "items": [ { "metadata": { "name": "train-dv-ps-zbax-0-xc7gx", "generateName": "train-dv-ps-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-ps-zbax-0-xc7gx", "uid": "22be98f0-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124157", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "762de417-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-ps-zbax-0", "job_type": "PS", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-ps-zbax-0\",\"uid\":\"762de417-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123875\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-ps-zbax-0", "uid": "762de417-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"ps\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "requests": { "cpu": "100m" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-sgb3", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:32Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.46", "podIP": "10.24.2.19", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T00:35:32Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant@sha256:72d3bd936dfbfbb707e648d7e6f0f8fb4318eb115aad0bfde9b43ff05fef8f19", "containerID": "docker://ccb669a02dc34c3a2f5e82611b2faa7c556c3485fccee96e5d99661ed276dc2a" } ], "qosClass": "Burstable" } } ] } I1211 00:35:51.747261 1 training.go:477] Job train-dv status={ "phase": "Running", "reason": "", "controlPaused": false, "conditions": null, "state": "", "replicaStatuses": [ { "tf_replica_type": "MASTER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "WORKER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "PS", "state": "Running", "ReplicasStates": { "Running": 1 } } ] } I1211 00:35:59.747468 1 replicas.go:176] Creating Service: train-dv-master-zbax-0 I1211 00:35:59.755040 1 replicas.go:182] Service train-dv-master-zbax-0 already exists. I1211 00:35:59.755131 1 replicas.go:257] Creating Job: train-dv-master-zbax-0 I1211 00:35:59.758874 1 replicas.go:263] train-dv-master-zbax-0 already exists. I1211 00:35:59.758898 1 replicas.go:176] Creating Service: train-dv-worker-zbax-0 I1211 00:35:59.765166 1 replicas.go:182] Service train-dv-worker-zbax-0 already exists. I1211 00:35:59.765259 1 replicas.go:257] Creating Job: train-dv-worker-zbax-0 I1211 00:35:59.770844 1 replicas.go:263] train-dv-worker-zbax-0 already exists. I1211 00:35:59.770872 1 replicas.go:176] Creating Service: train-dv-ps-zbax-0 I1211 00:35:59.776999 1 replicas.go:182] Service train-dv-ps-zbax-0 already exists. I1211 00:35:59.777083 1 replicas.go:257] Creating Job: train-dv-ps-zbax-0 I1211 00:35:59.785957 1 replicas.go:263] train-dv-ps-zbax-0 already exists. I1211 00:35:59.786004 1 tensorboard.go:72] Creating Service: train-dv-tensorboard-zbax I1211 00:35:59.798931 1 tensorboard.go:78] Service train-dv-tensorboard-zbax already exists. I1211 00:35:59.798961 1 tensorboard.go:101] Creating Deployment: train-dv-tensorboard-zbax I1211 00:35:59.803056 1 tensorboard.go:106] train-dv-tensorboard-zbax already exists. I1211 00:35:59.809770 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124246" }, "items": [ { "metadata": { "name": "train-dv-master-zbax-0-wtd7b", "generateName": "train-dv-master-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-master-zbax-0-wtd7b", "uid": "22be3644-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124088", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "761cd4fe-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-master-zbax-0", "job_type": "MASTER", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-master-zbax-0\",\"uid\":\"761cd4fe-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123873\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-master-zbax-0", "uid": "761cd4fe-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"master\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-cgjd", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:08Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.48", "podIP": "10.24.1.46", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T00:35:07Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://983b909d56f9a202a40edf72dc738e7c4296f471031f7b2414836d8419f16709" } ], "qosClass": "Burstable" } } ] } I1211 00:35:59.886782 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124246" }, "items": [ { "metadata": { "name": "train-dv-worker-zbax-0-bq6bx", "generateName": "train-dv-worker-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-worker-zbax-0-bq6bx", "uid": "22be7857-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124084", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "7629ceca-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-worker-zbax-0", "job_type": "WORKER", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-worker-zbax-0\",\"uid\":\"7629ceca-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123872\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-worker-zbax-0", "uid": "7629ceca-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"worker\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-fqf1", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Pending", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "False", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z", "reason": "ContainersNotReady", "message": "containers with unready status: [tensorflow]" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.24", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "waiting": { "reason": "ContainerCreating" } }, "lastState": {}, "ready": false, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "" } ], "qosClass": "Burstable" } } ] } I1211 00:35:59.893618 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124246" }, "items": [ { "metadata": { "name": "train-dv-ps-zbax-0-xc7gx", "generateName": "train-dv-ps-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-ps-zbax-0-xc7gx", "uid": "22be98f0-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124157", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "762de417-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-ps-zbax-0", "job_type": "PS", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-ps-zbax-0\",\"uid\":\"762de417-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123875\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-ps-zbax-0", "uid": "762de417-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"ps\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "requests": { "cpu": "100m" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-sgb3", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:32Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.46", "podIP": "10.24.2.19", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T00:35:32Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant@sha256:72d3bd936dfbfbb707e648d7e6f0f8fb4318eb115aad0bfde9b43ff05fef8f19", "containerID": "docker://ccb669a02dc34c3a2f5e82611b2faa7c556c3485fccee96e5d99661ed276dc2a" } ], "qosClass": "Burstable" } } ] } I1211 00:35:59.893797 1 training.go:477] Job train-dv status={ "phase": "Running", "reason": "", "controlPaused": false, "conditions": null, "state": "", "replicaStatuses": [ { "tf_replica_type": "MASTER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "WORKER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "PS", "state": "Running", "ReplicasStates": { "Running": 1 } } ] } I1211 00:36:07.893984 1 replicas.go:176] Creating Service: train-dv-master-zbax-0 I1211 00:36:07.904089 1 replicas.go:182] Service train-dv-master-zbax-0 already exists. I1211 00:36:07.904144 1 replicas.go:257] Creating Job: train-dv-master-zbax-0 I1211 00:36:07.906571 1 replicas.go:263] train-dv-master-zbax-0 already exists. I1211 00:36:07.906596 1 replicas.go:176] Creating Service: train-dv-worker-zbax-0 I1211 00:36:07.913186 1 replicas.go:182] Service train-dv-worker-zbax-0 already exists. I1211 00:36:07.913246 1 replicas.go:257] Creating Job: train-dv-worker-zbax-0 I1211 00:36:07.915601 1 replicas.go:263] train-dv-worker-zbax-0 already exists. I1211 00:36:07.915621 1 replicas.go:176] Creating Service: train-dv-ps-zbax-0 I1211 00:36:07.920617 1 replicas.go:182] Service train-dv-ps-zbax-0 already exists. I1211 00:36:07.920665 1 replicas.go:257] Creating Job: train-dv-ps-zbax-0 I1211 00:36:07.922985 1 replicas.go:263] train-dv-ps-zbax-0 already exists. I1211 00:36:07.923008 1 tensorboard.go:72] Creating Service: train-dv-tensorboard-zbax I1211 00:36:07.927159 1 tensorboard.go:78] Service train-dv-tensorboard-zbax already exists. I1211 00:36:07.927189 1 tensorboard.go:101] Creating Deployment: train-dv-tensorboard-zbax I1211 00:36:07.930141 1 tensorboard.go:106] train-dv-tensorboard-zbax already exists. I1211 00:36:07.935631 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124269" }, "items": [ { "metadata": { "name": "train-dv-master-zbax-0-wtd7b", "generateName": "train-dv-master-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-master-zbax-0-wtd7b", "uid": "22be3644-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124088", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "761cd4fe-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-master-zbax-0", "job_type": "MASTER", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-master-zbax-0\",\"uid\":\"761cd4fe-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123873\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-master-zbax-0", "uid": "761cd4fe-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"master\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-cgjd", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:08Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.48", "podIP": "10.24.1.46", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T00:35:07Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://983b909d56f9a202a40edf72dc738e7c4296f471031f7b2414836d8419f16709" } ], "qosClass": "Burstable" } } ] } I1211 00:36:07.940724 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124269" }, "items": [ { "metadata": { "name": "train-dv-worker-zbax-0-bq6bx", "generateName": "train-dv-worker-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-worker-zbax-0-bq6bx", "uid": "22be7857-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124084", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "7629ceca-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-worker-zbax-0", "job_type": "WORKER", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-worker-zbax-0\",\"uid\":\"7629ceca-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123872\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-worker-zbax-0", "uid": "7629ceca-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"worker\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-fqf1", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Pending", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "False", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z", "reason": "ContainersNotReady", "message": "containers with unready status: [tensorflow]" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.24", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "waiting": { "reason": "ContainerCreating" } }, "lastState": {}, "ready": false, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "" } ], "qosClass": "Burstable" } } ] } I1211 00:36:07.945884 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124269" }, "items": [ { "metadata": { "name": "train-dv-ps-zbax-0-xc7gx", "generateName": "train-dv-ps-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-ps-zbax-0-xc7gx", "uid": "22be98f0-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124157", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "762de417-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-ps-zbax-0", "job_type": "PS", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-ps-zbax-0\",\"uid\":\"762de417-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123875\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-ps-zbax-0", "uid": "762de417-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"ps\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "requests": { "cpu": "100m" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-sgb3", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:32Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.46", "podIP": "10.24.2.19", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T00:35:32Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant@sha256:72d3bd936dfbfbb707e648d7e6f0f8fb4318eb115aad0bfde9b43ff05fef8f19", "containerID": "docker://ccb669a02dc34c3a2f5e82611b2faa7c556c3485fccee96e5d99661ed276dc2a" } ], "qosClass": "Burstable" } } ] } I1211 00:36:07.945936 1 training.go:477] Job train-dv status={ "phase": "Running", "reason": "", "controlPaused": false, "conditions": null, "state": "", "replicaStatuses": [ { "tf_replica_type": "MASTER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "WORKER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "PS", "state": "Running", "ReplicasStates": { "Running": 1 } } ] } I1211 00:36:15.946055 1 replicas.go:176] Creating Service: train-dv-master-zbax-0 I1211 00:36:15.953289 1 replicas.go:182] Service train-dv-master-zbax-0 already exists. I1211 00:36:15.953359 1 replicas.go:257] Creating Job: train-dv-master-zbax-0 I1211 00:36:15.956247 1 replicas.go:263] train-dv-master-zbax-0 already exists. I1211 00:36:15.956265 1 replicas.go:176] Creating Service: train-dv-worker-zbax-0 I1211 00:36:15.963437 1 replicas.go:182] Service train-dv-worker-zbax-0 already exists. I1211 00:36:15.963487 1 replicas.go:257] Creating Job: train-dv-worker-zbax-0 I1211 00:36:15.966258 1 replicas.go:263] train-dv-worker-zbax-0 already exists. I1211 00:36:15.966281 1 replicas.go:176] Creating Service: train-dv-ps-zbax-0 I1211 00:36:15.971957 1 replicas.go:182] Service train-dv-ps-zbax-0 already exists. I1211 00:36:15.972013 1 replicas.go:257] Creating Job: train-dv-ps-zbax-0 I1211 00:36:15.974504 1 replicas.go:263] train-dv-ps-zbax-0 already exists. I1211 00:36:15.974536 1 tensorboard.go:72] Creating Service: train-dv-tensorboard-zbax I1211 00:36:15.979333 1 tensorboard.go:78] Service train-dv-tensorboard-zbax already exists. I1211 00:36:15.979356 1 tensorboard.go:101] Creating Deployment: train-dv-tensorboard-zbax I1211 00:36:15.982496 1 tensorboard.go:106] train-dv-tensorboard-zbax already exists. I1211 00:36:15.988387 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124292" }, "items": [ { "metadata": { "name": "train-dv-master-zbax-0-wtd7b", "generateName": "train-dv-master-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-master-zbax-0-wtd7b", "uid": "22be3644-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124088", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "761cd4fe-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-master-zbax-0", "job_type": "MASTER", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-master-zbax-0\",\"uid\":\"761cd4fe-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123873\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-master-zbax-0", "uid": "761cd4fe-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"master\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-cgjd", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:08Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.48", "podIP": "10.24.1.46", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T00:35:07Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://983b909d56f9a202a40edf72dc738e7c4296f471031f7b2414836d8419f16709" } ], "qosClass": "Burstable" } } ] } I1211 00:36:15.995801 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124292" }, "items": [ { "metadata": { "name": "train-dv-worker-zbax-0-bq6bx", "generateName": "train-dv-worker-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-worker-zbax-0-bq6bx", "uid": "22be7857-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124084", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "7629ceca-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-worker-zbax-0", "job_type": "WORKER", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-worker-zbax-0\",\"uid\":\"7629ceca-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123872\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-worker-zbax-0", "uid": "7629ceca-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"worker\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-fqf1", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Pending", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "False", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z", "reason": "ContainersNotReady", "message": "containers with unready status: [tensorflow]" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.24", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "waiting": { "reason": "ContainerCreating" } }, "lastState": {}, "ready": false, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "" } ], "qosClass": "Burstable" } } ] } I1211 00:36:16.019787 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124292" }, "items": [ { "metadata": { "name": "train-dv-ps-zbax-0-xc7gx", "generateName": "train-dv-ps-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-ps-zbax-0-xc7gx", "uid": "22be98f0-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124157", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "762de417-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-ps-zbax-0", "job_type": "PS", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-ps-zbax-0\",\"uid\":\"762de417-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123875\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-ps-zbax-0", "uid": "762de417-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"ps\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "requests": { "cpu": "100m" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-sgb3", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:32Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.46", "podIP": "10.24.2.19", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T00:35:32Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant@sha256:72d3bd936dfbfbb707e648d7e6f0f8fb4318eb115aad0bfde9b43ff05fef8f19", "containerID": "docker://ccb669a02dc34c3a2f5e82611b2faa7c556c3485fccee96e5d99661ed276dc2a" } ], "qosClass": "Burstable" } } ] } I1211 00:36:16.019862 1 training.go:477] Job train-dv status={ "phase": "Running", "reason": "", "controlPaused": false, "conditions": null, "state": "", "replicaStatuses": [ { "tf_replica_type": "MASTER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "WORKER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "PS", "state": "Running", "ReplicasStates": { "Running": 1 } } ] } I1211 00:36:24.020128 1 replicas.go:176] Creating Service: train-dv-master-zbax-0 I1211 00:36:24.026772 1 replicas.go:182] Service train-dv-master-zbax-0 already exists. I1211 00:36:24.026836 1 replicas.go:257] Creating Job: train-dv-master-zbax-0 I1211 00:36:24.029508 1 replicas.go:263] train-dv-master-zbax-0 already exists. I1211 00:36:24.029531 1 replicas.go:176] Creating Service: train-dv-worker-zbax-0 I1211 00:36:24.036190 1 replicas.go:182] Service train-dv-worker-zbax-0 already exists. I1211 00:36:24.036260 1 replicas.go:257] Creating Job: train-dv-worker-zbax-0 I1211 00:36:24.039373 1 replicas.go:263] train-dv-worker-zbax-0 already exists. I1211 00:36:24.039396 1 replicas.go:176] Creating Service: train-dv-ps-zbax-0 I1211 00:36:24.098133 1 replicas.go:182] Service train-dv-ps-zbax-0 already exists. I1211 00:36:24.098181 1 replicas.go:257] Creating Job: train-dv-ps-zbax-0 I1211 00:36:24.102437 1 replicas.go:263] train-dv-ps-zbax-0 already exists. I1211 00:36:24.102465 1 tensorboard.go:72] Creating Service: train-dv-tensorboard-zbax I1211 00:36:24.109709 1 tensorboard.go:78] Service train-dv-tensorboard-zbax already exists. I1211 00:36:24.109736 1 tensorboard.go:101] Creating Deployment: train-dv-tensorboard-zbax I1211 00:36:24.113051 1 tensorboard.go:106] train-dv-tensorboard-zbax already exists. I1211 00:36:24.119935 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124316" }, "items": [ { "metadata": { "name": "train-dv-master-zbax-0-wtd7b", "generateName": "train-dv-master-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-master-zbax-0-wtd7b", "uid": "22be3644-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124088", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "761cd4fe-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-master-zbax-0", "job_type": "MASTER", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-master-zbax-0\",\"uid\":\"761cd4fe-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123873\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-master-zbax-0", "uid": "761cd4fe-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"master\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-cgjd", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:08Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.48", "podIP": "10.24.1.46", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T00:35:07Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://983b909d56f9a202a40edf72dc738e7c4296f471031f7b2414836d8419f16709" } ], "qosClass": "Burstable" } } ] } I1211 00:36:24.126630 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124316" }, "items": [ { "metadata": { "name": "train-dv-worker-zbax-0-bq6bx", "generateName": "train-dv-worker-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-worker-zbax-0-bq6bx", "uid": "22be7857-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124084", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "7629ceca-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-worker-zbax-0", "job_type": "WORKER", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-worker-zbax-0\",\"uid\":\"7629ceca-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123872\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-worker-zbax-0", "uid": "7629ceca-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"worker\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-fqf1", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Pending", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "False", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z", "reason": "ContainersNotReady", "message": "containers with unready status: [tensorflow]" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.24", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "waiting": { "reason": "ContainerCreating" } }, "lastState": {}, "ready": false, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "" } ], "qosClass": "Burstable" } } ] } I1211 00:36:24.133247 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124316" }, "items": [ { "metadata": { "name": "train-dv-ps-zbax-0-xc7gx", "generateName": "train-dv-ps-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-ps-zbax-0-xc7gx", "uid": "22be98f0-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124157", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "762de417-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-ps-zbax-0", "job_type": "PS", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-ps-zbax-0\",\"uid\":\"762de417-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123875\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-ps-zbax-0", "uid": "762de417-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"ps\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "requests": { "cpu": "100m" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-sgb3", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:32Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.46", "podIP": "10.24.2.19", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T00:35:32Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant@sha256:72d3bd936dfbfbb707e648d7e6f0f8fb4318eb115aad0bfde9b43ff05fef8f19", "containerID": "docker://ccb669a02dc34c3a2f5e82611b2faa7c556c3485fccee96e5d99661ed276dc2a" } ], "qosClass": "Burstable" } } ] } I1211 00:36:24.133298 1 training.go:477] Job train-dv status={ "phase": "Running", "reason": "", "controlPaused": false, "conditions": null, "state": "", "replicaStatuses": [ { "tf_replica_type": "MASTER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "WORKER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "PS", "state": "Running", "ReplicasStates": { "Running": 1 } } ] } I1211 00:36:32.133544 1 replicas.go:176] Creating Service: train-dv-master-zbax-0 I1211 00:36:32.139200 1 replicas.go:182] Service train-dv-master-zbax-0 already exists. I1211 00:36:32.139302 1 replicas.go:257] Creating Job: train-dv-master-zbax-0 I1211 00:36:32.141919 1 replicas.go:263] train-dv-master-zbax-0 already exists. I1211 00:36:32.141944 1 replicas.go:176] Creating Service: train-dv-worker-zbax-0 I1211 00:36:32.146379 1 replicas.go:182] Service train-dv-worker-zbax-0 already exists. I1211 00:36:32.146429 1 replicas.go:257] Creating Job: train-dv-worker-zbax-0 I1211 00:36:32.149043 1 replicas.go:263] train-dv-worker-zbax-0 already exists. I1211 00:36:32.149070 1 replicas.go:176] Creating Service: train-dv-ps-zbax-0 I1211 00:36:32.153729 1 replicas.go:182] Service train-dv-ps-zbax-0 already exists. I1211 00:36:32.153778 1 replicas.go:257] Creating Job: train-dv-ps-zbax-0 I1211 00:36:32.156335 1 replicas.go:263] train-dv-ps-zbax-0 already exists. I1211 00:36:32.156360 1 tensorboard.go:72] Creating Service: train-dv-tensorboard-zbax I1211 00:36:32.160638 1 tensorboard.go:78] Service train-dv-tensorboard-zbax already exists. I1211 00:36:32.160663 1 tensorboard.go:101] Creating Deployment: train-dv-tensorboard-zbax I1211 00:36:32.163167 1 tensorboard.go:106] train-dv-tensorboard-zbax already exists. I1211 00:36:32.168576 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124337" }, "items": [ { "metadata": { "name": "train-dv-master-zbax-0-wtd7b", "generateName": "train-dv-master-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-master-zbax-0-wtd7b", "uid": "22be3644-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124088", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "761cd4fe-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-master-zbax-0", "job_type": "MASTER", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-master-zbax-0\",\"uid\":\"761cd4fe-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123873\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-master-zbax-0", "uid": "761cd4fe-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"master\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-cgjd", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:08Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.48", "podIP": "10.24.1.46", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T00:35:07Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://983b909d56f9a202a40edf72dc738e7c4296f471031f7b2414836d8419f16709" } ], "qosClass": "Burstable" } } ] } I1211 00:36:32.232042 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124337" }, "items": [ { "metadata": { "name": "train-dv-worker-zbax-0-bq6bx", "generateName": "train-dv-worker-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-worker-zbax-0-bq6bx", "uid": "22be7857-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124084", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "7629ceca-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-worker-zbax-0", "job_type": "WORKER", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-worker-zbax-0\",\"uid\":\"7629ceca-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123872\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-worker-zbax-0", "uid": "7629ceca-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"worker\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-fqf1", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Pending", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "False", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z", "reason": "ContainersNotReady", "message": "containers with unready status: [tensorflow]" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.24", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "waiting": { "reason": "ContainerCreating" } }, "lastState": {}, "ready": false, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "" } ], "qosClass": "Burstable" } } ] } I1211 00:36:32.275534 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124337" }, "items": [ { "metadata": { "name": "train-dv-ps-zbax-0-xc7gx", "generateName": "train-dv-ps-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-ps-zbax-0-xc7gx", "uid": "22be98f0-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124157", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "762de417-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-ps-zbax-0", "job_type": "PS", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-ps-zbax-0\",\"uid\":\"762de417-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123875\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-ps-zbax-0", "uid": "762de417-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"ps\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "requests": { "cpu": "100m" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-sgb3", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:32Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.46", "podIP": "10.24.2.19", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T00:35:32Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant@sha256:72d3bd936dfbfbb707e648d7e6f0f8fb4318eb115aad0bfde9b43ff05fef8f19", "containerID": "docker://ccb669a02dc34c3a2f5e82611b2faa7c556c3485fccee96e5d99661ed276dc2a" } ], "qosClass": "Burstable" } } ] } I1211 00:36:32.275585 1 training.go:477] Job train-dv status={ "phase": "Running", "reason": "", "controlPaused": false, "conditions": null, "state": "", "replicaStatuses": [ { "tf_replica_type": "MASTER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "WORKER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "PS", "state": "Running", "ReplicasStates": { "Running": 1 } } ] } I1211 00:36:39.955778 1 controller.go:319] apiserver closed stream I1211 00:36:39.980704 1 controller.go:312] start watching at 3124039 I1211 00:36:40.275889 1 replicas.go:176] Creating Service: train-dv-master-zbax-0 I1211 00:36:40.284246 1 replicas.go:182] Service train-dv-master-zbax-0 already exists. I1211 00:36:40.284358 1 replicas.go:257] Creating Job: train-dv-master-zbax-0 I1211 00:36:40.287144 1 replicas.go:263] train-dv-master-zbax-0 already exists. I1211 00:36:40.287185 1 replicas.go:176] Creating Service: train-dv-worker-zbax-0 I1211 00:36:40.291770 1 replicas.go:182] Service train-dv-worker-zbax-0 already exists. I1211 00:36:40.291843 1 replicas.go:257] Creating Job: train-dv-worker-zbax-0 I1211 00:36:40.324244 1 replicas.go:263] train-dv-worker-zbax-0 already exists. I1211 00:36:40.324265 1 replicas.go:176] Creating Service: train-dv-ps-zbax-0 I1211 00:36:40.329084 1 replicas.go:182] Service train-dv-ps-zbax-0 already exists. I1211 00:36:40.329125 1 replicas.go:257] Creating Job: train-dv-ps-zbax-0 I1211 00:36:40.331590 1 replicas.go:263] train-dv-ps-zbax-0 already exists. I1211 00:36:40.331627 1 tensorboard.go:72] Creating Service: train-dv-tensorboard-zbax I1211 00:36:40.336105 1 tensorboard.go:78] Service train-dv-tensorboard-zbax already exists. I1211 00:36:40.336130 1 tensorboard.go:101] Creating Deployment: train-dv-tensorboard-zbax I1211 00:36:40.338735 1 tensorboard.go:106] train-dv-tensorboard-zbax already exists. I1211 00:36:40.344102 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124363" }, "items": [ { "metadata": { "name": "train-dv-master-zbax-0-wtd7b", "generateName": "train-dv-master-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-master-zbax-0-wtd7b", "uid": "22be3644-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124088", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "761cd4fe-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-master-zbax-0", "job_type": "MASTER", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-master-zbax-0\",\"uid\":\"761cd4fe-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123873\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-master-zbax-0", "uid": "761cd4fe-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"master\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-cgjd", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:08Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.48", "podIP": "10.24.1.46", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T00:35:07Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://983b909d56f9a202a40edf72dc738e7c4296f471031f7b2414836d8419f16709" } ], "qosClass": "Burstable" } } ] } I1211 00:36:40.349688 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124363" }, "items": [ { "metadata": { "name": "train-dv-worker-zbax-0-bq6bx", "generateName": "train-dv-worker-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-worker-zbax-0-bq6bx", "uid": "22be7857-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124344", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "7629ceca-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-worker-zbax-0", "job_type": "WORKER", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-worker-zbax-0\",\"uid\":\"7629ceca-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123872\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-worker-zbax-0", "uid": "7629ceca-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"worker\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-fqf1", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:36:34Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.24", "podIP": "10.24.3.22", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T00:36:34Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://027ca514ae0d8a5d8165951e015638746218526c6737a122554df6fa88054e1e" } ], "qosClass": "Burstable" } } ] } I1211 00:36:40.354725 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124363" }, "items": [ { "metadata": { "name": "train-dv-ps-zbax-0-xc7gx", "generateName": "train-dv-ps-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-ps-zbax-0-xc7gx", "uid": "22be98f0-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124157", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "762de417-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-ps-zbax-0", "job_type": "PS", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-ps-zbax-0\",\"uid\":\"762de417-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123875\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-ps-zbax-0", "uid": "762de417-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"ps\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "requests": { "cpu": "100m" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-sgb3", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:32Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.46", "podIP": "10.24.2.19", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T00:35:32Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant@sha256:72d3bd936dfbfbb707e648d7e6f0f8fb4318eb115aad0bfde9b43ff05fef8f19", "containerID": "docker://ccb669a02dc34c3a2f5e82611b2faa7c556c3485fccee96e5d99661ed276dc2a" } ], "qosClass": "Burstable" } } ] } I1211 00:36:40.354790 1 training.go:477] Job train-dv status={ "phase": "Running", "reason": "", "controlPaused": false, "conditions": null, "state": "", "replicaStatuses": [ { "tf_replica_type": "MASTER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "WORKER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "PS", "state": "Running", "ReplicasStates": { "Running": 1 } } ] } I1211 00:36:48.536042 1 replicas.go:176] Creating Service: train-dv-master-zbax-0 I1211 00:36:48.542357 1 replicas.go:182] Service train-dv-master-zbax-0 already exists. I1211 00:36:48.542431 1 replicas.go:257] Creating Job: train-dv-master-zbax-0 I1211 00:36:48.757022 1 replicas.go:263] train-dv-master-zbax-0 already exists. I1211 00:36:48.757058 1 replicas.go:176] Creating Service: train-dv-worker-zbax-0 I1211 00:36:48.761590 1 replicas.go:182] Service train-dv-worker-zbax-0 already exists. I1211 00:36:48.761648 1 replicas.go:257] Creating Job: train-dv-worker-zbax-0 I1211 00:36:48.764292 1 replicas.go:263] train-dv-worker-zbax-0 already exists. I1211 00:36:48.764317 1 replicas.go:176] Creating Service: train-dv-ps-zbax-0 I1211 00:36:48.768458 1 replicas.go:182] Service train-dv-ps-zbax-0 already exists. I1211 00:36:48.768511 1 replicas.go:257] Creating Job: train-dv-ps-zbax-0 I1211 00:36:48.771115 1 replicas.go:263] train-dv-ps-zbax-0 already exists. I1211 00:36:48.771137 1 tensorboard.go:72] Creating Service: train-dv-tensorboard-zbax I1211 00:36:48.775959 1 tensorboard.go:78] Service train-dv-tensorboard-zbax already exists. I1211 00:36:48.775988 1 tensorboard.go:101] Creating Deployment: train-dv-tensorboard-zbax I1211 00:36:48.778810 1 tensorboard.go:106] train-dv-tensorboard-zbax already exists. I1211 00:36:49.032960 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124384" }, "items": [ { "metadata": { "name": "train-dv-master-zbax-0-wtd7b", "generateName": "train-dv-master-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-master-zbax-0-wtd7b", "uid": "22be3644-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124088", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "761cd4fe-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-master-zbax-0", "job_type": "MASTER", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-master-zbax-0\",\"uid\":\"761cd4fe-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123873\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-master-zbax-0", "uid": "761cd4fe-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"master\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-cgjd", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:08Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.48", "podIP": "10.24.1.46", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T00:35:07Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://983b909d56f9a202a40edf72dc738e7c4296f471031f7b2414836d8419f16709" } ], "qosClass": "Burstable" } } ] } I1211 00:36:49.074451 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124385" }, "items": [ { "metadata": { "name": "train-dv-worker-zbax-0-bq6bx", "generateName": "train-dv-worker-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-worker-zbax-0-bq6bx", "uid": "22be7857-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124344", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "7629ceca-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-worker-zbax-0", "job_type": "WORKER", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-worker-zbax-0\",\"uid\":\"7629ceca-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123872\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-worker-zbax-0", "uid": "7629ceca-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"worker\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-fqf1", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:36:34Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.24", "podIP": "10.24.3.22", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T00:36:34Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://027ca514ae0d8a5d8165951e015638746218526c6737a122554df6fa88054e1e" } ], "qosClass": "Burstable" } } ] } I1211 00:36:49.099755 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124385" }, "items": [ { "metadata": { "name": "train-dv-ps-zbax-0-xc7gx", "generateName": "train-dv-ps-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-ps-zbax-0-xc7gx", "uid": "22be98f0-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124157", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "762de417-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-ps-zbax-0", "job_type": "PS", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-ps-zbax-0\",\"uid\":\"762de417-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123875\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-ps-zbax-0", "uid": "762de417-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"ps\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "requests": { "cpu": "100m" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-sgb3", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:32Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.46", "podIP": "10.24.2.19", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T00:35:32Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant@sha256:72d3bd936dfbfbb707e648d7e6f0f8fb4318eb115aad0bfde9b43ff05fef8f19", "containerID": "docker://ccb669a02dc34c3a2f5e82611b2faa7c556c3485fccee96e5d99661ed276dc2a" } ], "qosClass": "Burstable" } } ] } I1211 00:36:49.099860 1 training.go:477] Job train-dv status={ "phase": "Running", "reason": "", "controlPaused": false, "conditions": null, "state": "", "replicaStatuses": [ { "tf_replica_type": "MASTER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "WORKER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "PS", "state": "Running", "ReplicasStates": { "Running": 1 } } ] } I1211 00:36:57.100102 1 replicas.go:176] Creating Service: train-dv-master-zbax-0 I1211 00:36:57.106018 1 replicas.go:182] Service train-dv-master-zbax-0 already exists. I1211 00:36:57.106085 1 replicas.go:257] Creating Job: train-dv-master-zbax-0 I1211 00:36:57.108725 1 replicas.go:263] train-dv-master-zbax-0 already exists. I1211 00:36:57.108746 1 replicas.go:176] Creating Service: train-dv-worker-zbax-0 I1211 00:36:57.113026 1 replicas.go:182] Service train-dv-worker-zbax-0 already exists. I1211 00:36:57.113072 1 replicas.go:257] Creating Job: train-dv-worker-zbax-0 I1211 00:36:57.115368 1 replicas.go:263] train-dv-worker-zbax-0 already exists. I1211 00:36:57.115397 1 replicas.go:176] Creating Service: train-dv-ps-zbax-0 I1211 00:36:57.119868 1 replicas.go:182] Service train-dv-ps-zbax-0 already exists. I1211 00:36:57.119953 1 replicas.go:257] Creating Job: train-dv-ps-zbax-0 I1211 00:36:57.122503 1 replicas.go:263] train-dv-ps-zbax-0 already exists. I1211 00:36:57.122526 1 tensorboard.go:72] Creating Service: train-dv-tensorboard-zbax I1211 00:36:57.128030 1 tensorboard.go:78] Service train-dv-tensorboard-zbax already exists. I1211 00:36:57.128072 1 tensorboard.go:101] Creating Deployment: train-dv-tensorboard-zbax I1211 00:36:57.130607 1 tensorboard.go:106] train-dv-tensorboard-zbax already exists. I1211 00:36:57.135825 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124411" }, "items": [ { "metadata": { "name": "train-dv-master-zbax-0-wtd7b", "generateName": "train-dv-master-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-master-zbax-0-wtd7b", "uid": "22be3644-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124088", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "761cd4fe-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-master-zbax-0", "job_type": "MASTER", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-master-zbax-0\",\"uid\":\"761cd4fe-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123873\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-master-zbax-0", "uid": "761cd4fe-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"master\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-cgjd", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:08Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.48", "podIP": "10.24.1.46", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T00:35:07Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://983b909d56f9a202a40edf72dc738e7c4296f471031f7b2414836d8419f16709" } ], "qosClass": "Burstable" } } ] } I1211 00:36:57.141110 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124411" }, "items": [ { "metadata": { "name": "train-dv-worker-zbax-0-bq6bx", "generateName": "train-dv-worker-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-worker-zbax-0-bq6bx", "uid": "22be7857-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124344", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "7629ceca-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-worker-zbax-0", "job_type": "WORKER", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-worker-zbax-0\",\"uid\":\"7629ceca-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123872\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-worker-zbax-0", "uid": "7629ceca-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"worker\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-fqf1", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:36:34Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.24", "podIP": "10.24.3.22", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T00:36:34Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://027ca514ae0d8a5d8165951e015638746218526c6737a122554df6fa88054e1e" } ], "qosClass": "Burstable" } } ] } I1211 00:36:57.146252 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124411" }, "items": [ { "metadata": { "name": "train-dv-ps-zbax-0-xc7gx", "generateName": "train-dv-ps-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-ps-zbax-0-xc7gx", "uid": "22be98f0-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124157", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "762de417-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-ps-zbax-0", "job_type": "PS", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-ps-zbax-0\",\"uid\":\"762de417-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123875\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-ps-zbax-0", "uid": "762de417-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"ps\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "requests": { "cpu": "100m" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-sgb3", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:32Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.46", "podIP": "10.24.2.19", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T00:35:32Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant@sha256:72d3bd936dfbfbb707e648d7e6f0f8fb4318eb115aad0bfde9b43ff05fef8f19", "containerID": "docker://ccb669a02dc34c3a2f5e82611b2faa7c556c3485fccee96e5d99661ed276dc2a" } ], "qosClass": "Burstable" } } ] } I1211 00:36:57.146306 1 training.go:477] Job train-dv status={ "phase": "Running", "reason": "", "controlPaused": false, "conditions": null, "state": "", "replicaStatuses": [ { "tf_replica_type": "MASTER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "WORKER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "PS", "state": "Running", "ReplicasStates": { "Running": 1 } } ] } I1211 00:37:05.146540 1 replicas.go:176] Creating Service: train-dv-master-zbax-0 I1211 00:37:05.152288 1 replicas.go:182] Service train-dv-master-zbax-0 already exists. I1211 00:37:05.152366 1 replicas.go:257] Creating Job: train-dv-master-zbax-0 I1211 00:37:05.155091 1 replicas.go:263] train-dv-master-zbax-0 already exists. I1211 00:37:05.155113 1 replicas.go:176] Creating Service: train-dv-worker-zbax-0 I1211 00:37:05.159597 1 replicas.go:182] Service train-dv-worker-zbax-0 already exists. I1211 00:37:05.159652 1 replicas.go:257] Creating Job: train-dv-worker-zbax-0 I1211 00:37:05.162080 1 replicas.go:263] train-dv-worker-zbax-0 already exists. I1211 00:37:05.162096 1 replicas.go:176] Creating Service: train-dv-ps-zbax-0 I1211 00:37:05.166360 1 replicas.go:182] Service train-dv-ps-zbax-0 already exists. I1211 00:37:05.166410 1 replicas.go:257] Creating Job: train-dv-ps-zbax-0 I1211 00:37:05.168745 1 replicas.go:263] train-dv-ps-zbax-0 already exists. I1211 00:37:05.168779 1 tensorboard.go:72] Creating Service: train-dv-tensorboard-zbax I1211 00:37:05.174123 1 tensorboard.go:78] Service train-dv-tensorboard-zbax already exists. I1211 00:37:05.174148 1 tensorboard.go:101] Creating Deployment: train-dv-tensorboard-zbax I1211 00:37:05.176680 1 tensorboard.go:106] train-dv-tensorboard-zbax already exists. I1211 00:37:05.182316 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124436" }, "items": [ { "metadata": { "name": "train-dv-master-zbax-0-wtd7b", "generateName": "train-dv-master-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-master-zbax-0-wtd7b", "uid": "22be3644-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124088", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "761cd4fe-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-master-zbax-0", "job_type": "MASTER", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-master-zbax-0\",\"uid\":\"761cd4fe-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123873\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-master-zbax-0", "uid": "761cd4fe-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"master\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-cgjd", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:08Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.48", "podIP": "10.24.1.46", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T00:35:07Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://983b909d56f9a202a40edf72dc738e7c4296f471031f7b2414836d8419f16709" } ], "qosClass": "Burstable" } } ] } I1211 00:37:05.187884 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124436" }, "items": [ { "metadata": { "name": "train-dv-worker-zbax-0-bq6bx", "generateName": "train-dv-worker-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-worker-zbax-0-bq6bx", "uid": "22be7857-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124344", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "7629ceca-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-worker-zbax-0", "job_type": "WORKER", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-worker-zbax-0\",\"uid\":\"7629ceca-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123872\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-worker-zbax-0", "uid": "7629ceca-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"worker\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-fqf1", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:36:34Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.24", "podIP": "10.24.3.22", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T00:36:34Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://027ca514ae0d8a5d8165951e015638746218526c6737a122554df6fa88054e1e" } ], "qosClass": "Burstable" } } ] } I1211 00:37:05.208692 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124436" }, "items": [ { "metadata": { "name": "train-dv-ps-zbax-0-xc7gx", "generateName": "train-dv-ps-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-ps-zbax-0-xc7gx", "uid": "22be98f0-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124157", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "762de417-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-ps-zbax-0", "job_type": "PS", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-ps-zbax-0\",\"uid\":\"762de417-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123875\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-ps-zbax-0", "uid": "762de417-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"ps\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "requests": { "cpu": "100m" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-sgb3", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:32Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.46", "podIP": "10.24.2.19", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T00:35:32Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant@sha256:72d3bd936dfbfbb707e648d7e6f0f8fb4318eb115aad0bfde9b43ff05fef8f19", "containerID": "docker://ccb669a02dc34c3a2f5e82611b2faa7c556c3485fccee96e5d99661ed276dc2a" } ], "qosClass": "Burstable" } } ] } I1211 00:37:05.208767 1 training.go:477] Job train-dv status={ "phase": "Running", "reason": "", "controlPaused": false, "conditions": null, "state": "", "replicaStatuses": [ { "tf_replica_type": "MASTER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "WORKER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "PS", "state": "Running", "ReplicasStates": { "Running": 1 } } ] } I1211 00:37:13.209050 1 replicas.go:176] Creating Service: train-dv-master-zbax-0 I1211 00:37:13.222477 1 replicas.go:182] Service train-dv-master-zbax-0 already exists. I1211 00:37:13.222543 1 replicas.go:257] Creating Job: train-dv-master-zbax-0 I1211 00:37:13.225866 1 replicas.go:263] train-dv-master-zbax-0 already exists. I1211 00:37:13.225898 1 replicas.go:176] Creating Service: train-dv-worker-zbax-0 I1211 00:37:13.232459 1 replicas.go:182] Service train-dv-worker-zbax-0 already exists. I1211 00:37:13.232617 1 replicas.go:257] Creating Job: train-dv-worker-zbax-0 I1211 00:37:13.236025 1 replicas.go:263] train-dv-worker-zbax-0 already exists. I1211 00:37:13.236045 1 replicas.go:176] Creating Service: train-dv-ps-zbax-0 I1211 00:37:13.253628 1 replicas.go:182] Service train-dv-ps-zbax-0 already exists. I1211 00:37:13.253674 1 replicas.go:257] Creating Job: train-dv-ps-zbax-0 I1211 00:37:13.258818 1 replicas.go:263] train-dv-ps-zbax-0 already exists. I1211 00:37:13.258839 1 tensorboard.go:72] Creating Service: train-dv-tensorboard-zbax I1211 00:37:13.264394 1 tensorboard.go:78] Service train-dv-tensorboard-zbax already exists. I1211 00:37:13.264434 1 tensorboard.go:101] Creating Deployment: train-dv-tensorboard-zbax I1211 00:37:13.267043 1 tensorboard.go:106] train-dv-tensorboard-zbax already exists. I1211 00:37:13.273010 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124458" }, "items": [ { "metadata": { "name": "train-dv-master-zbax-0-wtd7b", "generateName": "train-dv-master-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-master-zbax-0-wtd7b", "uid": "22be3644-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124088", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "761cd4fe-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-master-zbax-0", "job_type": "MASTER", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-master-zbax-0\",\"uid\":\"761cd4fe-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123873\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-master-zbax-0", "uid": "761cd4fe-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"master\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-cgjd", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:08Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.48", "podIP": "10.24.1.46", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T00:35:07Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://983b909d56f9a202a40edf72dc738e7c4296f471031f7b2414836d8419f16709" } ], "qosClass": "Burstable" } } ] } I1211 00:37:13.280515 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124458" }, "items": [ { "metadata": { "name": "train-dv-worker-zbax-0-bq6bx", "generateName": "train-dv-worker-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-worker-zbax-0-bq6bx", "uid": "22be7857-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124344", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "7629ceca-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-worker-zbax-0", "job_type": "WORKER", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-worker-zbax-0\",\"uid\":\"7629ceca-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123872\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-worker-zbax-0", "uid": "7629ceca-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"worker\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-fqf1", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:36:34Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.24", "podIP": "10.24.3.22", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T00:36:34Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://027ca514ae0d8a5d8165951e015638746218526c6737a122554df6fa88054e1e" } ], "qosClass": "Burstable" } } ] } I1211 00:37:13.285899 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124458" }, "items": [ { "metadata": { "name": "train-dv-ps-zbax-0-xc7gx", "generateName": "train-dv-ps-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-ps-zbax-0-xc7gx", "uid": "22be98f0-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124157", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "762de417-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-ps-zbax-0", "job_type": "PS", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-ps-zbax-0\",\"uid\":\"762de417-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123875\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-ps-zbax-0", "uid": "762de417-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"ps\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "requests": { "cpu": "100m" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-sgb3", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:32Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.46", "podIP": "10.24.2.19", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T00:35:32Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant@sha256:72d3bd936dfbfbb707e648d7e6f0f8fb4318eb115aad0bfde9b43ff05fef8f19", "containerID": "docker://ccb669a02dc34c3a2f5e82611b2faa7c556c3485fccee96e5d99661ed276dc2a" } ], "qosClass": "Burstable" } } ] } I1211 00:37:13.285946 1 training.go:477] Job train-dv status={ "phase": "Running", "reason": "", "controlPaused": false, "conditions": null, "state": "", "replicaStatuses": [ { "tf_replica_type": "MASTER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "WORKER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "PS", "state": "Running", "ReplicasStates": { "Running": 1 } } ] } I1211 00:37:21.286368 1 replicas.go:176] Creating Service: train-dv-master-zbax-0 I1211 00:37:21.292455 1 replicas.go:182] Service train-dv-master-zbax-0 already exists. I1211 00:37:21.292559 1 replicas.go:257] Creating Job: train-dv-master-zbax-0 I1211 00:37:21.296373 1 replicas.go:263] train-dv-master-zbax-0 already exists. I1211 00:37:21.296391 1 replicas.go:176] Creating Service: train-dv-worker-zbax-0 I1211 00:37:21.301407 1 replicas.go:182] Service train-dv-worker-zbax-0 already exists. I1211 00:37:21.301446 1 replicas.go:257] Creating Job: train-dv-worker-zbax-0 I1211 00:37:21.304369 1 replicas.go:263] train-dv-worker-zbax-0 already exists. I1211 00:37:21.304400 1 replicas.go:176] Creating Service: train-dv-ps-zbax-0 I1211 00:37:21.309395 1 replicas.go:182] Service train-dv-ps-zbax-0 already exists. I1211 00:37:21.309455 1 replicas.go:257] Creating Job: train-dv-ps-zbax-0 I1211 00:37:21.312333 1 replicas.go:263] train-dv-ps-zbax-0 already exists. I1211 00:37:21.312346 1 tensorboard.go:72] Creating Service: train-dv-tensorboard-zbax I1211 00:37:21.325387 1 tensorboard.go:78] Service train-dv-tensorboard-zbax already exists. I1211 00:37:21.325404 1 tensorboard.go:101] Creating Deployment: train-dv-tensorboard-zbax I1211 00:37:21.329324 1 tensorboard.go:106] train-dv-tensorboard-zbax already exists. I1211 00:37:21.335729 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124485" }, "items": [ { "metadata": { "name": "train-dv-master-zbax-0-wtd7b", "generateName": "train-dv-master-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-master-zbax-0-wtd7b", "uid": "22be3644-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124088", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "761cd4fe-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-master-zbax-0", "job_type": "MASTER", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-master-zbax-0\",\"uid\":\"761cd4fe-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123873\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-master-zbax-0", "uid": "761cd4fe-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"master\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-cgjd", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:08Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.48", "podIP": "10.24.1.46", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T00:35:07Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://983b909d56f9a202a40edf72dc738e7c4296f471031f7b2414836d8419f16709" } ], "qosClass": "Burstable" } } ] } I1211 00:37:21.342777 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124485" }, "items": [ { "metadata": { "name": "train-dv-worker-zbax-0-bq6bx", "generateName": "train-dv-worker-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-worker-zbax-0-bq6bx", "uid": "22be7857-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124475", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "7629ceca-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-worker-zbax-0", "job_type": "WORKER", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-worker-zbax-0\",\"uid\":\"7629ceca-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123872\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-worker-zbax-0", "uid": "7629ceca-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"worker\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-fqf1", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:37:20Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.24", "podIP": "10.24.3.22", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T00:37:20Z" } }, "lastState": { "terminated": { "exitCode": 1, "reason": "Error", "startedAt": "2017-12-11T00:36:34Z", "finishedAt": "2017-12-11T00:37:18Z", "containerID": "docker://027ca514ae0d8a5d8165951e015638746218526c6737a122554df6fa88054e1e" } }, "ready": true, "restartCount": 1, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://57a1db6c33983e55526e63503a9cc8206db2201b3e7ca3705c2fcc11dd7a052e" } ], "qosClass": "Burstable" } } ] } I1211 00:37:21.443806 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124485" }, "items": [ { "metadata": { "name": "train-dv-ps-zbax-0-xc7gx", "generateName": "train-dv-ps-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-ps-zbax-0-xc7gx", "uid": "22be98f0-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124157", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "762de417-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-ps-zbax-0", "job_type": "PS", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-ps-zbax-0\",\"uid\":\"762de417-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123875\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-ps-zbax-0", "uid": "762de417-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"ps\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "requests": { "cpu": "100m" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-sgb3", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:32Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.46", "podIP": "10.24.2.19", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T00:35:32Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant@sha256:72d3bd936dfbfbb707e648d7e6f0f8fb4318eb115aad0bfde9b43ff05fef8f19", "containerID": "docker://ccb669a02dc34c3a2f5e82611b2faa7c556c3485fccee96e5d99661ed276dc2a" } ], "qosClass": "Burstable" } } ] } I1211 00:37:21.443846 1 training.go:477] Job train-dv status={ "phase": "Running", "reason": "", "controlPaused": false, "conditions": null, "state": "", "replicaStatuses": [ { "tf_replica_type": "MASTER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "WORKER", "state": "Failed", "ReplicasStates": { "Failed": 1 } }, { "tf_replica_type": "PS", "state": "Running", "ReplicasStates": { "Running": 1 } } ] } I1211 00:37:21.472342 1 controller.go:349] event: MODIFIED { "RuntimeId": "zbax", "tensorboard": { "logDir": "gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "volumes": null, "volumeMounts": null, "serviceType": "" }, "replicaSpecs": [ { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": { "limits": { "nvidia.com/gpu": "1" } } } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "MASTER", "IsDefaultPS": false }, { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": { "limits": { "nvidia.com/gpu": "1" } } } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "WORKER", "IsDefaultPS": false }, { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": {} } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "PS", "IsDefaultPS": false } ], "tfImage": "tensorflow/tensorflow:1.3.0" } I1211 00:37:21.472444 1 controller.go:350] TfJob event: MODIFIED { "RuntimeId": "zbax", "tensorboard": { "logDir": "gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "volumes": null, "volumeMounts": null, "serviceType": "" }, "replicaSpecs": [ { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": { "limits": { "nvidia.com/gpu": "1" } } } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "MASTER", "IsDefaultPS": false }, { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": { "limits": { "nvidia.com/gpu": "1" } } } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "WORKER", "IsDefaultPS": false }, { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": {} } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "PS", "IsDefaultPS": false } ], "tfImage": "tensorflow/tensorflow:1.3.0" } I1211 00:37:29.454891 1 replicas.go:176] Creating Service: train-dv-master-zbax-0 I1211 00:37:29.519497 1 replicas.go:182] Service train-dv-master-zbax-0 already exists. I1211 00:37:29.519599 1 replicas.go:257] Creating Job: train-dv-master-zbax-0 I1211 00:37:29.528999 1 replicas.go:263] train-dv-master-zbax-0 already exists. I1211 00:37:29.529031 1 replicas.go:176] Creating Service: train-dv-worker-zbax-0 I1211 00:37:29.535963 1 replicas.go:182] Service train-dv-worker-zbax-0 already exists. I1211 00:37:29.536025 1 replicas.go:257] Creating Job: train-dv-worker-zbax-0 I1211 00:37:29.556404 1 replicas.go:263] train-dv-worker-zbax-0 already exists. I1211 00:37:29.556427 1 replicas.go:176] Creating Service: train-dv-ps-zbax-0 I1211 00:37:29.562128 1 replicas.go:182] Service train-dv-ps-zbax-0 already exists. I1211 00:37:29.562173 1 replicas.go:257] Creating Job: train-dv-ps-zbax-0 I1211 00:37:29.567143 1 replicas.go:263] train-dv-ps-zbax-0 already exists. I1211 00:37:29.567164 1 tensorboard.go:72] Creating Service: train-dv-tensorboard-zbax I1211 00:37:29.573342 1 tensorboard.go:78] Service train-dv-tensorboard-zbax already exists. I1211 00:37:29.573386 1 tensorboard.go:101] Creating Deployment: train-dv-tensorboard-zbax I1211 00:37:29.577858 1 tensorboard.go:106] train-dv-tensorboard-zbax already exists. I1211 00:37:29.586582 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124508" }, "items": [ { "metadata": { "name": "train-dv-master-zbax-0-wtd7b", "generateName": "train-dv-master-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-master-zbax-0-wtd7b", "uid": "22be3644-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124088", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "761cd4fe-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-master-zbax-0", "job_type": "MASTER", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-master-zbax-0\",\"uid\":\"761cd4fe-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123873\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-master-zbax-0", "uid": "761cd4fe-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"master\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-cgjd", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:08Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.48", "podIP": "10.24.1.46", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T00:35:07Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://983b909d56f9a202a40edf72dc738e7c4296f471031f7b2414836d8419f16709" } ], "qosClass": "Burstable" } } ] } I1211 00:37:29.593460 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124508" }, "items": [ { "metadata": { "name": "train-dv-worker-zbax-0-bq6bx", "generateName": "train-dv-worker-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-worker-zbax-0-bq6bx", "uid": "22be7857-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124475", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "7629ceca-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-worker-zbax-0", "job_type": "WORKER", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-worker-zbax-0\",\"uid\":\"7629ceca-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123872\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-worker-zbax-0", "uid": "7629ceca-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"worker\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-fqf1", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:37:20Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.24", "podIP": "10.24.3.22", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T00:37:20Z" } }, "lastState": { "terminated": { "exitCode": 1, "reason": "Error", "startedAt": "2017-12-11T00:36:34Z", "finishedAt": "2017-12-11T00:37:18Z", "containerID": "docker://027ca514ae0d8a5d8165951e015638746218526c6737a122554df6fa88054e1e" } }, "ready": true, "restartCount": 1, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://57a1db6c33983e55526e63503a9cc8206db2201b3e7ca3705c2fcc11dd7a052e" } ], "qosClass": "Burstable" } } ] } I1211 00:37:29.624633 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124508" }, "items": [ { "metadata": { "name": "train-dv-ps-zbax-0-xc7gx", "generateName": "train-dv-ps-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-ps-zbax-0-xc7gx", "uid": "22be98f0-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124157", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "762de417-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-ps-zbax-0", "job_type": "PS", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-ps-zbax-0\",\"uid\":\"762de417-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123875\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-ps-zbax-0", "uid": "762de417-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"ps\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "requests": { "cpu": "100m" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-sgb3", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:32Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.46", "podIP": "10.24.2.19", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T00:35:32Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant@sha256:72d3bd936dfbfbb707e648d7e6f0f8fb4318eb115aad0bfde9b43ff05fef8f19", "containerID": "docker://ccb669a02dc34c3a2f5e82611b2faa7c556c3485fccee96e5d99661ed276dc2a" } ], "qosClass": "Burstable" } } ] } I1211 00:37:29.624681 1 training.go:477] Job train-dv status={ "phase": "Running", "reason": "", "controlPaused": false, "conditions": null, "state": "", "replicaStatuses": [ { "tf_replica_type": "MASTER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "WORKER", "state": "Failed", "ReplicasStates": { "Failed": 1 } }, { "tf_replica_type": "PS", "state": "Running", "ReplicasStates": { "Running": 1 } } ] } I1211 00:37:37.624901 1 replicas.go:176] Creating Service: train-dv-master-zbax-0 I1211 00:37:37.630319 1 replicas.go:182] Service train-dv-master-zbax-0 already exists. I1211 00:37:37.630391 1 replicas.go:257] Creating Job: train-dv-master-zbax-0 I1211 00:37:37.633027 1 replicas.go:263] train-dv-master-zbax-0 already exists. I1211 00:37:37.633051 1 replicas.go:176] Creating Service: train-dv-worker-zbax-0 I1211 00:37:37.637517 1 replicas.go:182] Service train-dv-worker-zbax-0 already exists. I1211 00:37:37.637569 1 replicas.go:257] Creating Job: train-dv-worker-zbax-0 I1211 00:37:37.639928 1 replicas.go:263] train-dv-worker-zbax-0 already exists. I1211 00:37:37.639951 1 replicas.go:176] Creating Service: train-dv-ps-zbax-0 I1211 00:37:37.644271 1 replicas.go:182] Service train-dv-ps-zbax-0 already exists. I1211 00:37:37.644321 1 replicas.go:257] Creating Job: train-dv-ps-zbax-0 I1211 00:37:37.646630 1 replicas.go:263] train-dv-ps-zbax-0 already exists. I1211 00:37:37.646651 1 tensorboard.go:72] Creating Service: train-dv-tensorboard-zbax I1211 00:37:37.651863 1 tensorboard.go:78] Service train-dv-tensorboard-zbax already exists. I1211 00:37:37.651893 1 tensorboard.go:101] Creating Deployment: train-dv-tensorboard-zbax I1211 00:37:37.654298 1 tensorboard.go:106] train-dv-tensorboard-zbax already exists. I1211 00:37:37.684780 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124535" }, "items": [ { "metadata": { "name": "train-dv-master-zbax-0-wtd7b", "generateName": "train-dv-master-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-master-zbax-0-wtd7b", "uid": "22be3644-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124519", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "761cd4fe-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-master-zbax-0", "job_type": "MASTER", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-master-zbax-0\",\"uid\":\"761cd4fe-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123873\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-master-zbax-0", "uid": "761cd4fe-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"master\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-cgjd", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:37:33Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.48", "podIP": "10.24.1.46", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T00:37:33Z" } }, "lastState": { "terminated": { "exitCode": 1, "reason": "Error", "startedAt": "2017-12-11T00:35:07Z", "finishedAt": "2017-12-11T00:37:32Z", "containerID": "docker://983b909d56f9a202a40edf72dc738e7c4296f471031f7b2414836d8419f16709" } }, "ready": true, "restartCount": 1, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://1c99faf22ab7433a2ec5f961d3beb1526b20eca18379806155ab424eac800141" } ], "qosClass": "Burstable" } } ] } I1211 00:37:37.754829 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124535" }, "items": [ { "metadata": { "name": "train-dv-worker-zbax-0-bq6bx", "generateName": "train-dv-worker-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-worker-zbax-0-bq6bx", "uid": "22be7857-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124475", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "7629ceca-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-worker-zbax-0", "job_type": "WORKER", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-worker-zbax-0\",\"uid\":\"7629ceca-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123872\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-worker-zbax-0", "uid": "7629ceca-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"worker\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-fqf1", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:37:20Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.24", "podIP": "10.24.3.22", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T00:37:20Z" } }, "lastState": { "terminated": { "exitCode": 1, "reason": "Error", "startedAt": "2017-12-11T00:36:34Z", "finishedAt": "2017-12-11T00:37:18Z", "containerID": "docker://027ca514ae0d8a5d8165951e015638746218526c6737a122554df6fa88054e1e" } }, "ready": true, "restartCount": 1, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://57a1db6c33983e55526e63503a9cc8206db2201b3e7ca3705c2fcc11dd7a052e" } ], "qosClass": "Burstable" } } ] } I1211 00:37:37.770057 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3124535" }, "items": [ { "metadata": { "name": "train-dv-ps-zbax-0-xc7gx", "generateName": "train-dv-ps-zbax-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-ps-zbax-0-xc7gx", "uid": "22be98f0-de0b-11e7-b6b9-42010af0014d", "resourceVersion": "3124157", "creationTimestamp": "2017-12-11T00:35:06Z", "labels": { "controller-uid": "762de417-de09-11e7-b9be-42010af001e8", "job-name": "train-dv-ps-zbax-0", "job_type": "PS", "runtime_id": "zbax", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-ps-zbax-0\",\"uid\":\"762de417-de09-11e7-b9be-42010af001e8\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3123875\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-ps-zbax-0", "uid": "762de417-de09-11e7-b9be-42010af001e8", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-zbax-0:2222\"],\"ps\":[\"train-dv-ps-zbax-0:2222\"],\"worker\":[\"train-dv-worker-zbax-0:2222\"]},\"task\":{\"type\":\"ps\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "requests": { "cpu": "100m" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-sgb3", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:32Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T00:35:06Z" } ], "hostIP": "10.240.0.46", "podIP": "10.24.2.19", "startTime": "2017-12-11T00:35:06Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T00:35:32Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant@sha256:72d3bd936dfbfbb707e648d7e6f0f8fb4318eb115aad0bfde9b43ff05fef8f19", "containerID": "docker://ccb669a02dc34c3a2f5e82611b2faa7c556c3485fccee96e5d99661ed276dc2a" } ], "qosClass": "Burstable" } } ] } E1211 00:37:37.770108 1 training.go:469] Master failed Job: train-dv. I1211 00:37:37.779582 1 controller.go:349] event: MODIFIED { "RuntimeId": "zbax", "tensorboard": { "logDir": "gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "volumes": null, "volumeMounts": null, "serviceType": "" }, "replicaSpecs": [ { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": { "limits": { "nvidia.com/gpu": "1" } } } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "MASTER", "IsDefaultPS": false }, { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": { "limits": { "nvidia.com/gpu": "1" } } } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "WORKER", "IsDefaultPS": false }, { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": {} } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "PS", "IsDefaultPS": false } ], "tfImage": "tensorflow/tensorflow:1.3.0" } I1211 00:37:37.779730 1 controller.go:350] TfJob event: MODIFIED { "RuntimeId": "zbax", "tensorboard": { "logDir": "gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "volumes": null, "volumeMounts": null, "serviceType": "" }, "replicaSpecs": [ { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": { "limits": { "nvidia.com/gpu": "1" } } } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "MASTER", "IsDefaultPS": false }, { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": { "limits": { "nvidia.com/gpu": "1" } } } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "WORKER", "IsDefaultPS": false }, { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": {} } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "PS", "IsDefaultPS": false } ], "tfImage": "tensorflow/tensorflow:1.3.0" } W1211 00:37:37.780063 1 controller.go:115] fail to handle event: { "Type": "MODIFIED", "Object": { "kind": "TfJob", "apiVersion": "tensorflow.org/v1alpha1", "metadata": { "name": "train-dv", "namespace": "default", "selfLink": "/apis/tensorflow.org/v1alpha1/namespaces/default/tfjobs/train-dv", "uid": "4162dcc0-de09-11e7-b9be-42010af001e8", "resourceVersion": "3124536", "creationTimestamp": "2017-12-11T00:21:39Z" }, "spec": { "RuntimeId": "zbax", "tensorboard": { "logDir": "gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "volumes": null, "volumeMounts": null, "serviceType": "" }, "replicaSpecs": [ { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": { "limits": { "nvidia.com/gpu": "1" } } } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "MASTER", "IsDefaultPS": false }, { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": { "limits": { "nvidia.com/gpu": "1" } } } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "WORKER", "IsDefaultPS": false }, { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": {} } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "PS", "IsDefaultPS": false } ], "tfImage": "tensorflow/tensorflow:1.3.0" }, "status": { "phase": "Done", "reason": "", "controlPaused": false, "conditions": null, "state": "Failed", "replicaStatuses": [ { "tf_replica_type": "MASTER", "state": "Failed", "ReplicasStates": { "Failed": 1 } }, { "tf_replica_type": "WORKER", "state": "Failed", "ReplicasStates": { "Failed": 1 } }, { "tf_replica_type": "PS", "state": "Running", "ReplicasStates": { "Running": 1 } } ] } } }, error ignore failed TfJob (train-dv). Please delete its CRD I1211 00:38:17.256847 1 controller.go:319] apiserver closed stream I1211 00:38:17.259168 1 controller.go:312] start watching at 3124536 I1211 00:39:40.282000 1 controller.go:319] apiserver closed stream I1211 00:39:40.283292 1 controller.go:312] start watching at 3124536 I1211 00:41:17.569302 1 controller.go:319] apiserver closed stream I1211 00:41:17.574056 1 controller.go:312] start watching at 3124536 I1211 00:41:18.098112 1 controller.go:173] finding existing jobs... I1211 00:41:18.100460 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 00:41:18.100478 1 controller.go:89] Starting watch at version %v3124982 I1211 00:41:18.100482 1 controller.go:98] starts running from watch version: 3124982 I1211 00:41:18.101400 1 controller.go:312] start watching at 3124982 I1211 00:42:59.766537 1 controller.go:319] apiserver closed stream I1211 00:42:59.767819 1 controller.go:312] start watching at 3124982 I1211 00:44:58.726019 1 controller.go:319] apiserver closed stream I1211 00:44:58.730819 1 controller.go:312] start watching at 3124982 I1211 00:44:59.239194 1 controller.go:173] finding existing jobs... I1211 00:44:59.241544 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 00:44:59.241557 1 controller.go:89] Starting watch at version %v3125418 I1211 00:44:59.241560 1 controller.go:98] starts running from watch version: 3125418 I1211 00:44:59.242570 1 controller.go:312] start watching at 3125418 I1211 00:46:12.903034 1 controller.go:319] apiserver closed stream I1211 00:46:12.905066 1 controller.go:312] start watching at 3125418 I1211 00:47:58.273101 1 controller.go:319] apiserver closed stream I1211 00:47:58.274153 1 controller.go:312] start watching at 3125418 I1211 00:49:51.687025 1 controller.go:319] apiserver closed stream I1211 00:49:51.690956 1 controller.go:312] start watching at 3125418 I1211 00:49:52.200347 1 controller.go:173] finding existing jobs... I1211 00:49:52.202804 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 00:49:52.202819 1 controller.go:89] Starting watch at version %v3125988 I1211 00:49:52.202822 1 controller.go:98] starts running from watch version: 3125988 I1211 00:49:52.203769 1 controller.go:312] start watching at 3125988 I1211 00:51:47.190548 1 controller.go:319] apiserver closed stream I1211 00:51:47.191372 1 controller.go:312] start watching at 3125988 I1211 00:53:30.883790 1 controller.go:319] apiserver closed stream I1211 00:53:30.884992 1 controller.go:312] start watching at 3125988 I1211 00:55:13.338075 1 controller.go:319] apiserver closed stream I1211 00:55:13.358854 1 controller.go:312] start watching at 3125988 I1211 00:55:13.876710 1 controller.go:173] finding existing jobs... I1211 00:55:13.879234 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 00:55:13.879249 1 controller.go:89] Starting watch at version %v3126613 I1211 00:55:13.879253 1 controller.go:98] starts running from watch version: 3126613 I1211 00:55:13.880123 1 controller.go:312] start watching at 3126613 I1211 00:56:46.269240 1 controller.go:319] apiserver closed stream I1211 00:56:46.275331 1 controller.go:312] start watching at 3126613 I1211 00:57:56.401342 1 controller.go:319] apiserver closed stream I1211 00:57:56.402508 1 controller.go:312] start watching at 3126613 I1211 00:59:00.319543 1 controller.go:319] apiserver closed stream I1211 00:59:00.340150 1 controller.go:312] start watching at 3126613 I1211 01:00:28.844141 1 controller.go:349] event: DELETED { "RuntimeId": "zbax", "tensorboard": { "logDir": "gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "volumes": null, "volumeMounts": null, "serviceType": "" }, "replicaSpecs": [ { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": { "limits": { "nvidia.com/gpu": "1" } } } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "MASTER", "IsDefaultPS": false }, { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": { "limits": { "nvidia.com/gpu": "1" } } } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "WORKER", "IsDefaultPS": false }, { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": {} } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "PS", "IsDefaultPS": false } ], "tfImage": "tensorflow/tensorflow:1.3.0" } I1211 01:00:28.844292 1 controller.go:350] TfJob event: DELETED { "RuntimeId": "zbax", "tensorboard": { "logDir": "gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "volumes": null, "volumeMounts": null, "serviceType": "" }, "replicaSpecs": [ { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": { "limits": { "nvidia.com/gpu": "1" } } } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "MASTER", "IsDefaultPS": false }, { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": { "limits": { "nvidia.com/gpu": "1" } } } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "WORKER", "IsDefaultPS": false }, { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": {} } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "PS", "IsDefaultPS": false } ], "tfImage": "tensorflow/tensorflow:1.3.0" } I1211 01:00:31.718094 1 controller.go:349] event: ADDED { "RuntimeId": "", "tensorboard": { "logDir": "gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "volumes": null, "volumeMounts": null, "serviceType": "" }, "replicaSpecs": [ { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": { "limits": { "nvidia.com/gpu": "1" } } } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "MASTER", "IsDefaultPS": false }, { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": { "limits": { "nvidia.com/gpu": "1" } } } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "WORKER", "IsDefaultPS": false }, { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": {} } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "PS", "IsDefaultPS": false } ] } I1211 01:00:31.718242 1 controller.go:350] TfJob event: ADDED { "RuntimeId": "", "tensorboard": { "logDir": "gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "volumes": null, "volumeMounts": null, "serviceType": "" }, "replicaSpecs": [ { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": { "limits": { "nvidia.com/gpu": "1" } } } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "MASTER", "IsDefaultPS": false }, { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": { "limits": { "nvidia.com/gpu": "1" } } } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "WORKER", "IsDefaultPS": false }, { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": {} } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "PS", "IsDefaultPS": false } ] } I1211 01:00:31.749161 1 controller.go:349] event: MODIFIED { "RuntimeId": "in2d", "tensorboard": { "logDir": "gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "volumes": null, "volumeMounts": null, "serviceType": "" }, "replicaSpecs": [ { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": { "limits": { "nvidia.com/gpu": "1" } } } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "MASTER", "IsDefaultPS": false }, { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": { "limits": { "nvidia.com/gpu": "1" } } } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "WORKER", "IsDefaultPS": false }, { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": {} } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "PS", "IsDefaultPS": false } ], "tfImage": "tensorflow/tensorflow:1.3.0" } I1211 01:00:31.749484 1 controller.go:350] TfJob event: MODIFIED { "RuntimeId": "in2d", "tensorboard": { "logDir": "gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "volumes": null, "volumeMounts": null, "serviceType": "" }, "replicaSpecs": [ { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": { "limits": { "nvidia.com/gpu": "1" } } } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "MASTER", "IsDefaultPS": false }, { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": { "limits": { "nvidia.com/gpu": "1" } } } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "WORKER", "IsDefaultPS": false }, { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": {} } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "PS", "IsDefaultPS": false } ], "tfImage": "tensorflow/tensorflow:1.3.0" } I1211 01:00:31.749680 1 training.go:346] Creating job: train-dv with Spec (spec.TfJobSpec{RuntimeId:"in2d", TensorBoard:(*spec.TensorBoardSpec)(0xc4205f21e0), ReplicaSpecs:[]*spec.TfReplicaSpec{(*spec.TfReplicaSpec)(0xc4203b5e60), (*spec.TfReplicaSpec)(0xc4203b5f20), (*spec.TfReplicaSpec)(0xc42016e090)}, TfImage:"tensorflow/tensorflow:1.3.0"}), Status (spec.TfJobStatus{Phase:"Creating", Reason:"", ControlPaused:false, Conditions:[]spec.TfJobCondition(nil), State:"", ReplicaStatuses:[]*spec.TfReplicaStatus(nil)}) I1211 01:00:31.754397 1 controller.go:349] event: MODIFIED { "RuntimeId": "in2d", "tensorboard": { "logDir": "gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "volumes": null, "volumeMounts": null, "serviceType": "" }, "replicaSpecs": [ { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": { "limits": { "nvidia.com/gpu": "1" } } } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "MASTER", "IsDefaultPS": false }, { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": { "limits": { "nvidia.com/gpu": "1" } } } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "WORKER", "IsDefaultPS": false }, { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": {} } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "PS", "IsDefaultPS": false } ], "tfImage": "tensorflow/tensorflow:1.3.0" } I1211 01:00:31.754494 1 controller.go:350] TfJob event: MODIFIED { "RuntimeId": "in2d", "tensorboard": { "logDir": "gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "volumes": null, "volumeMounts": null, "serviceType": "" }, "replicaSpecs": [ { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": { "limits": { "nvidia.com/gpu": "1" } } } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "MASTER", "IsDefaultPS": false }, { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": { "limits": { "nvidia.com/gpu": "1" } } } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "WORKER", "IsDefaultPS": false }, { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": {} } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "PS", "IsDefaultPS": false } ], "tfImage": "tensorflow/tensorflow:1.3.0" } I1211 01:00:31.756552 1 training.go:414] start running... I1211 01:00:39.756775 1 replicas.go:176] Creating Service: train-dv-master-in2d-0 I1211 01:00:39.767402 1 replicas.go:257] Creating Job: train-dv-master-in2d-0 I1211 01:00:39.771836 1 replicas.go:176] Creating Service: train-dv-worker-in2d-0 I1211 01:00:39.780443 1 replicas.go:257] Creating Job: train-dv-worker-in2d-0 I1211 01:00:39.794558 1 replicas.go:176] Creating Service: train-dv-ps-in2d-0 I1211 01:00:39.820617 1 replicas.go:257] Creating Job: train-dv-ps-in2d-0 I1211 01:00:39.833763 1 tensorboard.go:72] Creating Service: train-dv-tensorboard-in2d I1211 01:00:39.866276 1 tensorboard.go:101] Creating Deployment: train-dv-tensorboard-in2d I1211 01:00:39.896669 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127307" }, "items": [ { "metadata": { "name": "train-dv-master-in2d-0-c9k62", "generateName": "train-dv-master-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-master-in2d-0-c9k62", "uid": "b4889545-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127297", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b4873457-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-master-in2d-0", "job_type": "MASTER", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-master-in2d-0\",\"uid\":\"b4873457-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127281\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-master-in2d-0", "uid": "b4873457-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"master\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-v9p6", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Pending", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "False", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z", "reason": "ContainersNotReady", "message": "containers with unready status: [tensorflow]" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.47", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "waiting": { "reason": "ContainerCreating" } }, "lastState": {}, "ready": false, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "" } ], "qosClass": "Burstable" } } ] } I1211 01:00:39.925969 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127312" }, "items": [ { "metadata": { "name": "train-dv-worker-in2d-0-764w4", "generateName": "train-dv-worker-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-worker-in2d-0-764w4", "uid": "b48b8d38-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127299", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b48a3b2a-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-worker-in2d-0", "job_type": "WORKER", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-worker-in2d-0\",\"uid\":\"b48a3b2a-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127286\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-worker-in2d-0", "uid": "b48a3b2a-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"worker\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-sgb3", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Pending", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "False", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z", "reason": "ContainersNotReady", "message": "containers with unready status: [tensorflow]" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.46", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "waiting": { "reason": "ContainerCreating" } }, "lastState": {}, "ready": false, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "" } ], "qosClass": "Burstable" } } ] } I1211 01:00:39.944908 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127315" }, "items": [ { "metadata": { "name": "train-dv-ps-in2d-0-ss2mp", "generateName": "train-dv-ps-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-ps-in2d-0-ss2mp", "uid": "b4919634-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127306", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b48fcdb3-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-ps-in2d-0", "job_type": "PS", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-ps-in2d-0\",\"uid\":\"b48fcdb3-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127296\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-ps-in2d-0", "uid": "b48fcdb3-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"ps\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "requests": { "cpu": "100m" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-v9p6", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Pending", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "False", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z", "reason": "ContainersNotReady", "message": "containers with unready status: [tensorflow]" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.47", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "waiting": { "reason": "ContainerCreating" } }, "lastState": {}, "ready": false, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "imageID": "" } ], "qosClass": "Burstable" } } ] } I1211 01:00:39.945001 1 training.go:477] Job train-dv status={ "phase": "Running", "reason": "", "controlPaused": false, "conditions": null, "state": "", "replicaStatuses": [ { "tf_replica_type": "MASTER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "WORKER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "PS", "state": "Running", "ReplicasStates": { "Running": 1 } } ] } I1211 01:00:39.951628 1 controller.go:349] event: MODIFIED { "RuntimeId": "in2d", "tensorboard": { "logDir": "gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "volumes": null, "volumeMounts": null, "serviceType": "" }, "replicaSpecs": [ { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": { "limits": { "nvidia.com/gpu": "1" } } } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "MASTER", "IsDefaultPS": false }, { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": { "limits": { "nvidia.com/gpu": "1" } } } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "WORKER", "IsDefaultPS": false }, { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": {} } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "PS", "IsDefaultPS": false } ], "tfImage": "tensorflow/tensorflow:1.3.0" } I1211 01:00:39.951823 1 controller.go:350] TfJob event: MODIFIED { "RuntimeId": "in2d", "tensorboard": { "logDir": "gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "volumes": null, "volumeMounts": null, "serviceType": "" }, "replicaSpecs": [ { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": { "limits": { "nvidia.com/gpu": "1" } } } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "MASTER", "IsDefaultPS": false }, { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": { "limits": { "nvidia.com/gpu": "1" } } } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "WORKER", "IsDefaultPS": false }, { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": {} } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "PS", "IsDefaultPS": false } ], "tfImage": "tensorflow/tensorflow:1.3.0" } I1211 01:00:47.952271 1 replicas.go:176] Creating Service: train-dv-master-in2d-0 I1211 01:00:47.959658 1 replicas.go:182] Service train-dv-master-in2d-0 already exists. I1211 01:00:47.959736 1 replicas.go:257] Creating Job: train-dv-master-in2d-0 I1211 01:00:47.962648 1 replicas.go:263] train-dv-master-in2d-0 already exists. I1211 01:00:47.962673 1 replicas.go:176] Creating Service: train-dv-worker-in2d-0 I1211 01:00:47.966984 1 replicas.go:182] Service train-dv-worker-in2d-0 already exists. I1211 01:00:47.967033 1 replicas.go:257] Creating Job: train-dv-worker-in2d-0 I1211 01:00:47.969954 1 replicas.go:263] train-dv-worker-in2d-0 already exists. I1211 01:00:47.969978 1 replicas.go:176] Creating Service: train-dv-ps-in2d-0 I1211 01:00:47.974551 1 replicas.go:182] Service train-dv-ps-in2d-0 already exists. I1211 01:00:47.974607 1 replicas.go:257] Creating Job: train-dv-ps-in2d-0 I1211 01:00:47.977111 1 replicas.go:263] train-dv-ps-in2d-0 already exists. I1211 01:00:47.977171 1 tensorboard.go:72] Creating Service: train-dv-tensorboard-in2d I1211 01:00:47.981691 1 tensorboard.go:78] Service train-dv-tensorboard-in2d already exists. I1211 01:00:47.981718 1 tensorboard.go:101] Creating Deployment: train-dv-tensorboard-in2d I1211 01:00:47.984721 1 tensorboard.go:106] train-dv-tensorboard-in2d already exists. I1211 01:00:47.991549 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127344" }, "items": [ { "metadata": { "name": "train-dv-master-in2d-0-c9k62", "generateName": "train-dv-master-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-master-in2d-0-c9k62", "uid": "b4889545-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127320", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b4873457-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-master-in2d-0", "job_type": "MASTER", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-master-in2d-0\",\"uid\":\"b4873457-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127281\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-master-in2d-0", "uid": "b4873457-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"master\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-v9p6", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:41Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.47", "podIP": "10.24.4.34", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T01:00:40Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://a16a8464b19b88016f36f16016d9870e59eb83ad211f2d697d99701bf8bbb659" } ], "qosClass": "Burstable" } } ] } I1211 01:00:47.997317 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127344" }, "items": [ { "metadata": { "name": "train-dv-worker-in2d-0-764w4", "generateName": "train-dv-worker-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-worker-in2d-0-764w4", "uid": "b48b8d38-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127299", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b48a3b2a-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-worker-in2d-0", "job_type": "WORKER", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-worker-in2d-0\",\"uid\":\"b48a3b2a-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127286\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-worker-in2d-0", "uid": "b48a3b2a-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"worker\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-sgb3", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Pending", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "False", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z", "reason": "ContainersNotReady", "message": "containers with unready status: [tensorflow]" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.46", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "waiting": { "reason": "ContainerCreating" } }, "lastState": {}, "ready": false, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "" } ], "qosClass": "Burstable" } } ] } I1211 01:00:48.002918 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127344" }, "items": [ { "metadata": { "name": "train-dv-ps-in2d-0-ss2mp", "generateName": "train-dv-ps-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-ps-in2d-0-ss2mp", "uid": "b4919634-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127322", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b48fcdb3-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-ps-in2d-0", "job_type": "PS", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-ps-in2d-0\",\"uid\":\"b48fcdb3-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127296\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-ps-in2d-0", "uid": "b48fcdb3-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"ps\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "requests": { "cpu": "100m" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-v9p6", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:41Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.47", "podIP": "10.24.4.35", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T01:00:40Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant@sha256:72d3bd936dfbfbb707e648d7e6f0f8fb4318eb115aad0bfde9b43ff05fef8f19", "containerID": "docker://e2600b49f187783c8f36ed0fb970984416027e873ed77203a2acebef2503bf3c" } ], "qosClass": "Burstable" } } ] } I1211 01:00:48.002970 1 training.go:477] Job train-dv status={ "phase": "Running", "reason": "", "controlPaused": false, "conditions": null, "state": "", "replicaStatuses": [ { "tf_replica_type": "MASTER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "WORKER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "PS", "state": "Running", "ReplicasStates": { "Running": 1 } } ] } I1211 01:00:49.294756 1 controller.go:319] apiserver closed stream I1211 01:00:49.295689 1 controller.go:312] start watching at 3127316 I1211 01:00:56.003247 1 replicas.go:176] Creating Service: train-dv-master-in2d-0 I1211 01:00:56.014236 1 replicas.go:182] Service train-dv-master-in2d-0 already exists. I1211 01:00:56.014302 1 replicas.go:257] Creating Job: train-dv-master-in2d-0 I1211 01:00:56.017698 1 replicas.go:263] train-dv-master-in2d-0 already exists. I1211 01:00:56.017720 1 replicas.go:176] Creating Service: train-dv-worker-in2d-0 I1211 01:00:56.026597 1 replicas.go:182] Service train-dv-worker-in2d-0 already exists. I1211 01:00:56.026647 1 replicas.go:257] Creating Job: train-dv-worker-in2d-0 I1211 01:00:56.029356 1 replicas.go:263] train-dv-worker-in2d-0 already exists. I1211 01:00:56.029379 1 replicas.go:176] Creating Service: train-dv-ps-in2d-0 I1211 01:00:56.035011 1 replicas.go:182] Service train-dv-ps-in2d-0 already exists. I1211 01:00:56.035063 1 replicas.go:257] Creating Job: train-dv-ps-in2d-0 I1211 01:00:56.037999 1 replicas.go:263] train-dv-ps-in2d-0 already exists. I1211 01:00:56.038023 1 tensorboard.go:72] Creating Service: train-dv-tensorboard-in2d I1211 01:00:56.093034 1 tensorboard.go:78] Service train-dv-tensorboard-in2d already exists. I1211 01:00:56.093060 1 tensorboard.go:101] Creating Deployment: train-dv-tensorboard-in2d I1211 01:00:56.096663 1 tensorboard.go:106] train-dv-tensorboard-in2d already exists. I1211 01:00:56.103453 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127364" }, "items": [ { "metadata": { "name": "train-dv-master-in2d-0-c9k62", "generateName": "train-dv-master-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-master-in2d-0-c9k62", "uid": "b4889545-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127320", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b4873457-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-master-in2d-0", "job_type": "MASTER", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-master-in2d-0\",\"uid\":\"b4873457-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127281\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-master-in2d-0", "uid": "b4873457-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"master\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-v9p6", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:41Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.47", "podIP": "10.24.4.34", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T01:00:40Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://a16a8464b19b88016f36f16016d9870e59eb83ad211f2d697d99701bf8bbb659" } ], "qosClass": "Burstable" } } ] } I1211 01:00:56.109010 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127364" }, "items": [ { "metadata": { "name": "train-dv-worker-in2d-0-764w4", "generateName": "train-dv-worker-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-worker-in2d-0-764w4", "uid": "b48b8d38-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127299", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b48a3b2a-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-worker-in2d-0", "job_type": "WORKER", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-worker-in2d-0\",\"uid\":\"b48a3b2a-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127286\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-worker-in2d-0", "uid": "b48a3b2a-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"worker\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-sgb3", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Pending", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "False", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z", "reason": "ContainersNotReady", "message": "containers with unready status: [tensorflow]" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.46", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "waiting": { "reason": "ContainerCreating" } }, "lastState": {}, "ready": false, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "" } ], "qosClass": "Burstable" } } ] } I1211 01:00:56.114934 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127364" }, "items": [ { "metadata": { "name": "train-dv-ps-in2d-0-ss2mp", "generateName": "train-dv-ps-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-ps-in2d-0-ss2mp", "uid": "b4919634-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127322", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b48fcdb3-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-ps-in2d-0", "job_type": "PS", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-ps-in2d-0\",\"uid\":\"b48fcdb3-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127296\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-ps-in2d-0", "uid": "b48fcdb3-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"ps\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "requests": { "cpu": "100m" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-v9p6", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:41Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.47", "podIP": "10.24.4.35", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T01:00:40Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant@sha256:72d3bd936dfbfbb707e648d7e6f0f8fb4318eb115aad0bfde9b43ff05fef8f19", "containerID": "docker://e2600b49f187783c8f36ed0fb970984416027e873ed77203a2acebef2503bf3c" } ], "qosClass": "Burstable" } } ] } I1211 01:00:56.114997 1 training.go:477] Job train-dv status={ "phase": "Running", "reason": "", "controlPaused": false, "conditions": null, "state": "", "replicaStatuses": [ { "tf_replica_type": "MASTER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "WORKER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "PS", "state": "Running", "ReplicasStates": { "Running": 1 } } ] } I1211 01:01:04.115278 1 replicas.go:176] Creating Service: train-dv-master-in2d-0 I1211 01:01:04.121684 1 replicas.go:182] Service train-dv-master-in2d-0 already exists. I1211 01:01:04.121759 1 replicas.go:257] Creating Job: train-dv-master-in2d-0 I1211 01:01:04.124349 1 replicas.go:263] train-dv-master-in2d-0 already exists. I1211 01:01:04.124374 1 replicas.go:176] Creating Service: train-dv-worker-in2d-0 I1211 01:01:04.129132 1 replicas.go:182] Service train-dv-worker-in2d-0 already exists. I1211 01:01:04.129183 1 replicas.go:257] Creating Job: train-dv-worker-in2d-0 I1211 01:01:04.131690 1 replicas.go:263] train-dv-worker-in2d-0 already exists. I1211 01:01:04.131713 1 replicas.go:176] Creating Service: train-dv-ps-in2d-0 I1211 01:01:04.136208 1 replicas.go:182] Service train-dv-ps-in2d-0 already exists. I1211 01:01:04.136272 1 replicas.go:257] Creating Job: train-dv-ps-in2d-0 I1211 01:01:04.138688 1 replicas.go:263] train-dv-ps-in2d-0 already exists. I1211 01:01:04.138709 1 tensorboard.go:72] Creating Service: train-dv-tensorboard-in2d I1211 01:01:04.143126 1 tensorboard.go:78] Service train-dv-tensorboard-in2d already exists. I1211 01:01:04.143151 1 tensorboard.go:101] Creating Deployment: train-dv-tensorboard-in2d I1211 01:01:04.145898 1 tensorboard.go:106] train-dv-tensorboard-in2d already exists. I1211 01:01:04.151802 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127396" }, "items": [ { "metadata": { "name": "train-dv-master-in2d-0-c9k62", "generateName": "train-dv-master-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-master-in2d-0-c9k62", "uid": "b4889545-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127320", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b4873457-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-master-in2d-0", "job_type": "MASTER", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-master-in2d-0\",\"uid\":\"b4873457-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127281\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-master-in2d-0", "uid": "b4873457-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"master\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-v9p6", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:41Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.47", "podIP": "10.24.4.34", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T01:00:40Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://a16a8464b19b88016f36f16016d9870e59eb83ad211f2d697d99701bf8bbb659" } ], "qosClass": "Burstable" } } ] } I1211 01:01:04.158778 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127396" }, "items": [ { "metadata": { "name": "train-dv-worker-in2d-0-764w4", "generateName": "train-dv-worker-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-worker-in2d-0-764w4", "uid": "b48b8d38-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127299", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b48a3b2a-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-worker-in2d-0", "job_type": "WORKER", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-worker-in2d-0\",\"uid\":\"b48a3b2a-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127286\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-worker-in2d-0", "uid": "b48a3b2a-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"worker\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-sgb3", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Pending", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "False", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z", "reason": "ContainersNotReady", "message": "containers with unready status: [tensorflow]" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.46", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "waiting": { "reason": "ContainerCreating" } }, "lastState": {}, "ready": false, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "" } ], "qosClass": "Burstable" } } ] } I1211 01:01:04.164146 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127396" }, "items": [ { "metadata": { "name": "train-dv-ps-in2d-0-ss2mp", "generateName": "train-dv-ps-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-ps-in2d-0-ss2mp", "uid": "b4919634-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127322", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b48fcdb3-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-ps-in2d-0", "job_type": "PS", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-ps-in2d-0\",\"uid\":\"b48fcdb3-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127296\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-ps-in2d-0", "uid": "b48fcdb3-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"ps\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "requests": { "cpu": "100m" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-v9p6", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:41Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.47", "podIP": "10.24.4.35", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T01:00:40Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant@sha256:72d3bd936dfbfbb707e648d7e6f0f8fb4318eb115aad0bfde9b43ff05fef8f19", "containerID": "docker://e2600b49f187783c8f36ed0fb970984416027e873ed77203a2acebef2503bf3c" } ], "qosClass": "Burstable" } } ] } I1211 01:01:04.164204 1 training.go:477] Job train-dv status={ "phase": "Running", "reason": "", "controlPaused": false, "conditions": null, "state": "", "replicaStatuses": [ { "tf_replica_type": "MASTER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "WORKER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "PS", "state": "Running", "ReplicasStates": { "Running": 1 } } ] } I1211 01:01:12.164462 1 replicas.go:176] Creating Service: train-dv-master-in2d-0 I1211 01:01:12.176350 1 replicas.go:182] Service train-dv-master-in2d-0 already exists. I1211 01:01:12.176452 1 replicas.go:257] Creating Job: train-dv-master-in2d-0 I1211 01:01:12.179753 1 replicas.go:263] train-dv-master-in2d-0 already exists. I1211 01:01:12.179792 1 replicas.go:176] Creating Service: train-dv-worker-in2d-0 I1211 01:01:12.185851 1 replicas.go:182] Service train-dv-worker-in2d-0 already exists. I1211 01:01:12.185925 1 replicas.go:257] Creating Job: train-dv-worker-in2d-0 I1211 01:01:12.188900 1 replicas.go:263] train-dv-worker-in2d-0 already exists. I1211 01:01:12.188926 1 replicas.go:176] Creating Service: train-dv-ps-in2d-0 I1211 01:01:12.195172 1 replicas.go:182] Service train-dv-ps-in2d-0 already exists. I1211 01:01:12.195244 1 replicas.go:257] Creating Job: train-dv-ps-in2d-0 I1211 01:01:12.198458 1 replicas.go:263] train-dv-ps-in2d-0 already exists. I1211 01:01:12.198480 1 tensorboard.go:72] Creating Service: train-dv-tensorboard-in2d I1211 01:01:12.204328 1 tensorboard.go:78] Service train-dv-tensorboard-in2d already exists. I1211 01:01:12.204363 1 tensorboard.go:101] Creating Deployment: train-dv-tensorboard-in2d I1211 01:01:12.207820 1 tensorboard.go:106] train-dv-tensorboard-in2d already exists. I1211 01:01:12.215239 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127423" }, "items": [ { "metadata": { "name": "train-dv-master-in2d-0-c9k62", "generateName": "train-dv-master-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-master-in2d-0-c9k62", "uid": "b4889545-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127320", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b4873457-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-master-in2d-0", "job_type": "MASTER", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-master-in2d-0\",\"uid\":\"b4873457-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127281\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-master-in2d-0", "uid": "b4873457-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"master\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-v9p6", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:41Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.47", "podIP": "10.24.4.34", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T01:00:40Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://a16a8464b19b88016f36f16016d9870e59eb83ad211f2d697d99701bf8bbb659" } ], "qosClass": "Burstable" } } ] } I1211 01:01:12.221388 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127423" }, "items": [ { "metadata": { "name": "train-dv-worker-in2d-0-764w4", "generateName": "train-dv-worker-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-worker-in2d-0-764w4", "uid": "b48b8d38-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127299", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b48a3b2a-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-worker-in2d-0", "job_type": "WORKER", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-worker-in2d-0\",\"uid\":\"b48a3b2a-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127286\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-worker-in2d-0", "uid": "b48a3b2a-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"worker\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-sgb3", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Pending", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "False", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z", "reason": "ContainersNotReady", "message": "containers with unready status: [tensorflow]" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.46", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "waiting": { "reason": "ContainerCreating" } }, "lastState": {}, "ready": false, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "" } ], "qosClass": "Burstable" } } ] } I1211 01:01:12.228619 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127423" }, "items": [ { "metadata": { "name": "train-dv-ps-in2d-0-ss2mp", "generateName": "train-dv-ps-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-ps-in2d-0-ss2mp", "uid": "b4919634-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127322", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b48fcdb3-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-ps-in2d-0", "job_type": "PS", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-ps-in2d-0\",\"uid\":\"b48fcdb3-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127296\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-ps-in2d-0", "uid": "b48fcdb3-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"ps\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "requests": { "cpu": "100m" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-v9p6", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:41Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.47", "podIP": "10.24.4.35", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T01:00:40Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant@sha256:72d3bd936dfbfbb707e648d7e6f0f8fb4318eb115aad0bfde9b43ff05fef8f19", "containerID": "docker://e2600b49f187783c8f36ed0fb970984416027e873ed77203a2acebef2503bf3c" } ], "qosClass": "Burstable" } } ] } I1211 01:01:12.228674 1 training.go:477] Job train-dv status={ "phase": "Running", "reason": "", "controlPaused": false, "conditions": null, "state": "", "replicaStatuses": [ { "tf_replica_type": "MASTER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "WORKER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "PS", "state": "Running", "ReplicasStates": { "Running": 1 } } ] } I1211 01:01:20.228960 1 replicas.go:176] Creating Service: train-dv-master-in2d-0 I1211 01:01:20.239782 1 replicas.go:182] Service train-dv-master-in2d-0 already exists. I1211 01:01:20.239897 1 replicas.go:257] Creating Job: train-dv-master-in2d-0 I1211 01:01:20.245513 1 replicas.go:263] train-dv-master-in2d-0 already exists. I1211 01:01:20.245537 1 replicas.go:176] Creating Service: train-dv-worker-in2d-0 I1211 01:01:20.254374 1 replicas.go:182] Service train-dv-worker-in2d-0 already exists. I1211 01:01:20.254426 1 replicas.go:257] Creating Job: train-dv-worker-in2d-0 I1211 01:01:20.260865 1 replicas.go:263] train-dv-worker-in2d-0 already exists. I1211 01:01:20.260905 1 replicas.go:176] Creating Service: train-dv-ps-in2d-0 I1211 01:01:20.267345 1 replicas.go:182] Service train-dv-ps-in2d-0 already exists. I1211 01:01:20.267406 1 replicas.go:257] Creating Job: train-dv-ps-in2d-0 I1211 01:01:20.270104 1 replicas.go:263] train-dv-ps-in2d-0 already exists. I1211 01:01:20.270129 1 tensorboard.go:72] Creating Service: train-dv-tensorboard-in2d I1211 01:01:20.274358 1 tensorboard.go:78] Service train-dv-tensorboard-in2d already exists. I1211 01:01:20.274397 1 tensorboard.go:101] Creating Deployment: train-dv-tensorboard-in2d I1211 01:01:20.276937 1 tensorboard.go:106] train-dv-tensorboard-in2d already exists. I1211 01:01:20.282937 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127449" }, "items": [ { "metadata": { "name": "train-dv-master-in2d-0-c9k62", "generateName": "train-dv-master-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-master-in2d-0-c9k62", "uid": "b4889545-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127320", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b4873457-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-master-in2d-0", "job_type": "MASTER", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-master-in2d-0\",\"uid\":\"b4873457-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127281\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-master-in2d-0", "uid": "b4873457-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"master\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-v9p6", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:41Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.47", "podIP": "10.24.4.34", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T01:00:40Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://a16a8464b19b88016f36f16016d9870e59eb83ad211f2d697d99701bf8bbb659" } ], "qosClass": "Burstable" } } ] } I1211 01:01:20.288361 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127449" }, "items": [ { "metadata": { "name": "train-dv-worker-in2d-0-764w4", "generateName": "train-dv-worker-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-worker-in2d-0-764w4", "uid": "b48b8d38-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127299", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b48a3b2a-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-worker-in2d-0", "job_type": "WORKER", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-worker-in2d-0\",\"uid\":\"b48a3b2a-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127286\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-worker-in2d-0", "uid": "b48a3b2a-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"worker\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-sgb3", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Pending", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "False", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z", "reason": "ContainersNotReady", "message": "containers with unready status: [tensorflow]" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.46", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "waiting": { "reason": "ContainerCreating" } }, "lastState": {}, "ready": false, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "" } ], "qosClass": "Burstable" } } ] } I1211 01:01:20.293833 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127449" }, "items": [ { "metadata": { "name": "train-dv-ps-in2d-0-ss2mp", "generateName": "train-dv-ps-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-ps-in2d-0-ss2mp", "uid": "b4919634-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127322", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b48fcdb3-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-ps-in2d-0", "job_type": "PS", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-ps-in2d-0\",\"uid\":\"b48fcdb3-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127296\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-ps-in2d-0", "uid": "b48fcdb3-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"ps\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "requests": { "cpu": "100m" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-v9p6", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:41Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.47", "podIP": "10.24.4.35", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T01:00:40Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant@sha256:72d3bd936dfbfbb707e648d7e6f0f8fb4318eb115aad0bfde9b43ff05fef8f19", "containerID": "docker://e2600b49f187783c8f36ed0fb970984416027e873ed77203a2acebef2503bf3c" } ], "qosClass": "Burstable" } } ] } I1211 01:01:20.293920 1 training.go:477] Job train-dv status={ "phase": "Running", "reason": "", "controlPaused": false, "conditions": null, "state": "", "replicaStatuses": [ { "tf_replica_type": "MASTER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "WORKER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "PS", "state": "Running", "ReplicasStates": { "Running": 1 } } ] } I1211 01:01:28.294178 1 replicas.go:176] Creating Service: train-dv-master-in2d-0 I1211 01:01:28.301476 1 replicas.go:182] Service train-dv-master-in2d-0 already exists. I1211 01:01:28.301539 1 replicas.go:257] Creating Job: train-dv-master-in2d-0 I1211 01:01:28.304109 1 replicas.go:263] train-dv-master-in2d-0 already exists. I1211 01:01:28.304133 1 replicas.go:176] Creating Service: train-dv-worker-in2d-0 I1211 01:01:28.308804 1 replicas.go:182] Service train-dv-worker-in2d-0 already exists. I1211 01:01:28.308853 1 replicas.go:257] Creating Job: train-dv-worker-in2d-0 I1211 01:01:28.311446 1 replicas.go:263] train-dv-worker-in2d-0 already exists. I1211 01:01:28.311465 1 replicas.go:176] Creating Service: train-dv-ps-in2d-0 I1211 01:01:28.316428 1 replicas.go:182] Service train-dv-ps-in2d-0 already exists. I1211 01:01:28.316489 1 replicas.go:257] Creating Job: train-dv-ps-in2d-0 I1211 01:01:28.318950 1 replicas.go:263] train-dv-ps-in2d-0 already exists. I1211 01:01:28.318974 1 tensorboard.go:72] Creating Service: train-dv-tensorboard-in2d I1211 01:01:28.323596 1 tensorboard.go:78] Service train-dv-tensorboard-in2d already exists. I1211 01:01:28.323622 1 tensorboard.go:101] Creating Deployment: train-dv-tensorboard-in2d I1211 01:01:28.326360 1 tensorboard.go:106] train-dv-tensorboard-in2d already exists. I1211 01:01:28.332493 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127472" }, "items": [ { "metadata": { "name": "train-dv-master-in2d-0-c9k62", "generateName": "train-dv-master-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-master-in2d-0-c9k62", "uid": "b4889545-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127320", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b4873457-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-master-in2d-0", "job_type": "MASTER", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-master-in2d-0\",\"uid\":\"b4873457-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127281\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-master-in2d-0", "uid": "b4873457-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"master\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-v9p6", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:41Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.47", "podIP": "10.24.4.34", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T01:00:40Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://a16a8464b19b88016f36f16016d9870e59eb83ad211f2d697d99701bf8bbb659" } ], "qosClass": "Burstable" } } ] } I1211 01:01:28.337880 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127472" }, "items": [ { "metadata": { "name": "train-dv-worker-in2d-0-764w4", "generateName": "train-dv-worker-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-worker-in2d-0-764w4", "uid": "b48b8d38-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127299", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b48a3b2a-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-worker-in2d-0", "job_type": "WORKER", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-worker-in2d-0\",\"uid\":\"b48a3b2a-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127286\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-worker-in2d-0", "uid": "b48a3b2a-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"worker\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-sgb3", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Pending", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "False", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z", "reason": "ContainersNotReady", "message": "containers with unready status: [tensorflow]" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.46", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "waiting": { "reason": "ContainerCreating" } }, "lastState": {}, "ready": false, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "" } ], "qosClass": "Burstable" } } ] } I1211 01:01:28.343452 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127472" }, "items": [ { "metadata": { "name": "train-dv-ps-in2d-0-ss2mp", "generateName": "train-dv-ps-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-ps-in2d-0-ss2mp", "uid": "b4919634-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127322", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b48fcdb3-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-ps-in2d-0", "job_type": "PS", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-ps-in2d-0\",\"uid\":\"b48fcdb3-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127296\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-ps-in2d-0", "uid": "b48fcdb3-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"ps\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "requests": { "cpu": "100m" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-v9p6", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:41Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.47", "podIP": "10.24.4.35", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T01:00:40Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant@sha256:72d3bd936dfbfbb707e648d7e6f0f8fb4318eb115aad0bfde9b43ff05fef8f19", "containerID": "docker://e2600b49f187783c8f36ed0fb970984416027e873ed77203a2acebef2503bf3c" } ], "qosClass": "Burstable" } } ] } I1211 01:01:28.343528 1 training.go:477] Job train-dv status={ "phase": "Running", "reason": "", "controlPaused": false, "conditions": null, "state": "", "replicaStatuses": [ { "tf_replica_type": "MASTER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "WORKER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "PS", "state": "Running", "ReplicasStates": { "Running": 1 } } ] } I1211 01:01:36.343781 1 replicas.go:176] Creating Service: train-dv-master-in2d-0 I1211 01:01:36.349692 1 replicas.go:182] Service train-dv-master-in2d-0 already exists. I1211 01:01:36.349760 1 replicas.go:257] Creating Job: train-dv-master-in2d-0 I1211 01:01:36.352464 1 replicas.go:263] train-dv-master-in2d-0 already exists. I1211 01:01:36.352486 1 replicas.go:176] Creating Service: train-dv-worker-in2d-0 I1211 01:01:36.356774 1 replicas.go:182] Service train-dv-worker-in2d-0 already exists. I1211 01:01:36.356842 1 replicas.go:257] Creating Job: train-dv-worker-in2d-0 I1211 01:01:36.359270 1 replicas.go:263] train-dv-worker-in2d-0 already exists. I1211 01:01:36.359293 1 replicas.go:176] Creating Service: train-dv-ps-in2d-0 I1211 01:01:36.363800 1 replicas.go:182] Service train-dv-ps-in2d-0 already exists. I1211 01:01:36.363855 1 replicas.go:257] Creating Job: train-dv-ps-in2d-0 I1211 01:01:36.366191 1 replicas.go:263] train-dv-ps-in2d-0 already exists. I1211 01:01:36.366226 1 tensorboard.go:72] Creating Service: train-dv-tensorboard-in2d I1211 01:01:36.376049 1 tensorboard.go:78] Service train-dv-tensorboard-in2d already exists. I1211 01:01:36.376081 1 tensorboard.go:101] Creating Deployment: train-dv-tensorboard-in2d I1211 01:01:36.378918 1 tensorboard.go:106] train-dv-tensorboard-in2d already exists. I1211 01:01:36.384836 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127494" }, "items": [ { "metadata": { "name": "train-dv-master-in2d-0-c9k62", "generateName": "train-dv-master-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-master-in2d-0-c9k62", "uid": "b4889545-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127320", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b4873457-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-master-in2d-0", "job_type": "MASTER", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-master-in2d-0\",\"uid\":\"b4873457-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127281\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-master-in2d-0", "uid": "b4873457-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"master\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-v9p6", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:41Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.47", "podIP": "10.24.4.34", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T01:00:40Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://a16a8464b19b88016f36f16016d9870e59eb83ad211f2d697d99701bf8bbb659" } ], "qosClass": "Burstable" } } ] } I1211 01:01:36.390647 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127494" }, "items": [ { "metadata": { "name": "train-dv-worker-in2d-0-764w4", "generateName": "train-dv-worker-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-worker-in2d-0-764w4", "uid": "b48b8d38-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127299", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b48a3b2a-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-worker-in2d-0", "job_type": "WORKER", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-worker-in2d-0\",\"uid\":\"b48a3b2a-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127286\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-worker-in2d-0", "uid": "b48a3b2a-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"worker\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-sgb3", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Pending", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "False", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z", "reason": "ContainersNotReady", "message": "containers with unready status: [tensorflow]" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.46", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "waiting": { "reason": "ContainerCreating" } }, "lastState": {}, "ready": false, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "" } ], "qosClass": "Burstable" } } ] } I1211 01:01:36.396937 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127494" }, "items": [ { "metadata": { "name": "train-dv-ps-in2d-0-ss2mp", "generateName": "train-dv-ps-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-ps-in2d-0-ss2mp", "uid": "b4919634-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127322", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b48fcdb3-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-ps-in2d-0", "job_type": "PS", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-ps-in2d-0\",\"uid\":\"b48fcdb3-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127296\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-ps-in2d-0", "uid": "b48fcdb3-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"ps\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "requests": { "cpu": "100m" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-v9p6", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:41Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.47", "podIP": "10.24.4.35", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T01:00:40Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant@sha256:72d3bd936dfbfbb707e648d7e6f0f8fb4318eb115aad0bfde9b43ff05fef8f19", "containerID": "docker://e2600b49f187783c8f36ed0fb970984416027e873ed77203a2acebef2503bf3c" } ], "qosClass": "Burstable" } } ] } I1211 01:01:36.396989 1 training.go:477] Job train-dv status={ "phase": "Running", "reason": "", "controlPaused": false, "conditions": null, "state": "", "replicaStatuses": [ { "tf_replica_type": "MASTER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "WORKER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "PS", "state": "Running", "ReplicasStates": { "Running": 1 } } ] } I1211 01:01:44.397278 1 replicas.go:176] Creating Service: train-dv-master-in2d-0 I1211 01:01:44.405198 1 replicas.go:182] Service train-dv-master-in2d-0 already exists. I1211 01:01:44.405282 1 replicas.go:257] Creating Job: train-dv-master-in2d-0 I1211 01:01:44.408260 1 replicas.go:263] train-dv-master-in2d-0 already exists. I1211 01:01:44.408283 1 replicas.go:176] Creating Service: train-dv-worker-in2d-0 I1211 01:01:44.412912 1 replicas.go:182] Service train-dv-worker-in2d-0 already exists. I1211 01:01:44.412963 1 replicas.go:257] Creating Job: train-dv-worker-in2d-0 I1211 01:01:44.415754 1 replicas.go:263] train-dv-worker-in2d-0 already exists. I1211 01:01:44.415785 1 replicas.go:176] Creating Service: train-dv-ps-in2d-0 I1211 01:01:44.420643 1 replicas.go:182] Service train-dv-ps-in2d-0 already exists. I1211 01:01:44.420705 1 replicas.go:257] Creating Job: train-dv-ps-in2d-0 I1211 01:01:44.423606 1 replicas.go:263] train-dv-ps-in2d-0 already exists. I1211 01:01:44.423635 1 tensorboard.go:72] Creating Service: train-dv-tensorboard-in2d I1211 01:01:44.428615 1 tensorboard.go:78] Service train-dv-tensorboard-in2d already exists. I1211 01:01:44.428641 1 tensorboard.go:101] Creating Deployment: train-dv-tensorboard-in2d I1211 01:01:44.431354 1 tensorboard.go:106] train-dv-tensorboard-in2d already exists. I1211 01:01:44.438732 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127518" }, "items": [ { "metadata": { "name": "train-dv-master-in2d-0-c9k62", "generateName": "train-dv-master-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-master-in2d-0-c9k62", "uid": "b4889545-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127320", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b4873457-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-master-in2d-0", "job_type": "MASTER", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-master-in2d-0\",\"uid\":\"b4873457-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127281\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-master-in2d-0", "uid": "b4873457-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"master\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-v9p6", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:41Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.47", "podIP": "10.24.4.34", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T01:00:40Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://a16a8464b19b88016f36f16016d9870e59eb83ad211f2d697d99701bf8bbb659" } ], "qosClass": "Burstable" } } ] } I1211 01:01:44.445158 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127518" }, "items": [ { "metadata": { "name": "train-dv-worker-in2d-0-764w4", "generateName": "train-dv-worker-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-worker-in2d-0-764w4", "uid": "b48b8d38-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127299", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b48a3b2a-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-worker-in2d-0", "job_type": "WORKER", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-worker-in2d-0\",\"uid\":\"b48a3b2a-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127286\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-worker-in2d-0", "uid": "b48a3b2a-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"worker\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-sgb3", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Pending", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "False", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z", "reason": "ContainersNotReady", "message": "containers with unready status: [tensorflow]" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.46", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "waiting": { "reason": "ContainerCreating" } }, "lastState": {}, "ready": false, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "" } ], "qosClass": "Burstable" } } ] } I1211 01:01:44.451012 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127518" }, "items": [ { "metadata": { "name": "train-dv-ps-in2d-0-ss2mp", "generateName": "train-dv-ps-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-ps-in2d-0-ss2mp", "uid": "b4919634-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127322", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b48fcdb3-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-ps-in2d-0", "job_type": "PS", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-ps-in2d-0\",\"uid\":\"b48fcdb3-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127296\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-ps-in2d-0", "uid": "b48fcdb3-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"ps\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "requests": { "cpu": "100m" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-v9p6", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:41Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.47", "podIP": "10.24.4.35", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T01:00:40Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant@sha256:72d3bd936dfbfbb707e648d7e6f0f8fb4318eb115aad0bfde9b43ff05fef8f19", "containerID": "docker://e2600b49f187783c8f36ed0fb970984416027e873ed77203a2acebef2503bf3c" } ], "qosClass": "Burstable" } } ] } I1211 01:01:44.451062 1 training.go:477] Job train-dv status={ "phase": "Running", "reason": "", "controlPaused": false, "conditions": null, "state": "", "replicaStatuses": [ { "tf_replica_type": "MASTER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "WORKER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "PS", "state": "Running", "ReplicasStates": { "Running": 1 } } ] } I1211 01:01:52.451386 1 replicas.go:176] Creating Service: train-dv-master-in2d-0 I1211 01:01:52.458294 1 replicas.go:182] Service train-dv-master-in2d-0 already exists. I1211 01:01:52.458367 1 replicas.go:257] Creating Job: train-dv-master-in2d-0 I1211 01:01:52.461134 1 replicas.go:263] train-dv-master-in2d-0 already exists. I1211 01:01:52.461158 1 replicas.go:176] Creating Service: train-dv-worker-in2d-0 I1211 01:01:52.465462 1 replicas.go:182] Service train-dv-worker-in2d-0 already exists. I1211 01:01:52.465509 1 replicas.go:257] Creating Job: train-dv-worker-in2d-0 I1211 01:01:52.468184 1 replicas.go:263] train-dv-worker-in2d-0 already exists. I1211 01:01:52.468206 1 replicas.go:176] Creating Service: train-dv-ps-in2d-0 I1211 01:01:52.472820 1 replicas.go:182] Service train-dv-ps-in2d-0 already exists. I1211 01:01:52.472866 1 replicas.go:257] Creating Job: train-dv-ps-in2d-0 I1211 01:01:52.475470 1 replicas.go:263] train-dv-ps-in2d-0 already exists. I1211 01:01:52.475493 1 tensorboard.go:72] Creating Service: train-dv-tensorboard-in2d I1211 01:01:52.479739 1 tensorboard.go:78] Service train-dv-tensorboard-in2d already exists. I1211 01:01:52.479764 1 tensorboard.go:101] Creating Deployment: train-dv-tensorboard-in2d I1211 01:01:52.482528 1 tensorboard.go:106] train-dv-tensorboard-in2d already exists. I1211 01:01:52.488387 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127540" }, "items": [ { "metadata": { "name": "train-dv-master-in2d-0-c9k62", "generateName": "train-dv-master-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-master-in2d-0-c9k62", "uid": "b4889545-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127320", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b4873457-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-master-in2d-0", "job_type": "MASTER", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-master-in2d-0\",\"uid\":\"b4873457-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127281\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-master-in2d-0", "uid": "b4873457-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"master\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-v9p6", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:41Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.47", "podIP": "10.24.4.34", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T01:00:40Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://a16a8464b19b88016f36f16016d9870e59eb83ad211f2d697d99701bf8bbb659" } ], "qosClass": "Burstable" } } ] } I1211 01:01:52.493965 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127540" }, "items": [ { "metadata": { "name": "train-dv-worker-in2d-0-764w4", "generateName": "train-dv-worker-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-worker-in2d-0-764w4", "uid": "b48b8d38-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127299", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b48a3b2a-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-worker-in2d-0", "job_type": "WORKER", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-worker-in2d-0\",\"uid\":\"b48a3b2a-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127286\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-worker-in2d-0", "uid": "b48a3b2a-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"worker\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-sgb3", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Pending", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "False", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z", "reason": "ContainersNotReady", "message": "containers with unready status: [tensorflow]" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.46", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "waiting": { "reason": "ContainerCreating" } }, "lastState": {}, "ready": false, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "" } ], "qosClass": "Burstable" } } ] } I1211 01:01:52.499547 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127540" }, "items": [ { "metadata": { "name": "train-dv-ps-in2d-0-ss2mp", "generateName": "train-dv-ps-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-ps-in2d-0-ss2mp", "uid": "b4919634-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127322", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b48fcdb3-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-ps-in2d-0", "job_type": "PS", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-ps-in2d-0\",\"uid\":\"b48fcdb3-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127296\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-ps-in2d-0", "uid": "b48fcdb3-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"ps\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "requests": { "cpu": "100m" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-v9p6", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:41Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.47", "podIP": "10.24.4.35", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T01:00:40Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant@sha256:72d3bd936dfbfbb707e648d7e6f0f8fb4318eb115aad0bfde9b43ff05fef8f19", "containerID": "docker://e2600b49f187783c8f36ed0fb970984416027e873ed77203a2acebef2503bf3c" } ], "qosClass": "Burstable" } } ] } I1211 01:01:52.499596 1 training.go:477] Job train-dv status={ "phase": "Running", "reason": "", "controlPaused": false, "conditions": null, "state": "", "replicaStatuses": [ { "tf_replica_type": "MASTER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "WORKER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "PS", "state": "Running", "ReplicasStates": { "Running": 1 } } ] } I1211 01:01:52.623624 1 controller.go:319] apiserver closed stream I1211 01:01:52.624798 1 controller.go:312] start watching at 3127316 I1211 01:02:00.499847 1 replicas.go:176] Creating Service: train-dv-master-in2d-0 I1211 01:02:00.505761 1 replicas.go:182] Service train-dv-master-in2d-0 already exists. I1211 01:02:00.505833 1 replicas.go:257] Creating Job: train-dv-master-in2d-0 I1211 01:02:00.508606 1 replicas.go:263] train-dv-master-in2d-0 already exists. I1211 01:02:00.508630 1 replicas.go:176] Creating Service: train-dv-worker-in2d-0 I1211 01:02:00.514381 1 replicas.go:182] Service train-dv-worker-in2d-0 already exists. I1211 01:02:00.514434 1 replicas.go:257] Creating Job: train-dv-worker-in2d-0 I1211 01:02:00.517312 1 replicas.go:263] train-dv-worker-in2d-0 already exists. I1211 01:02:00.517333 1 replicas.go:176] Creating Service: train-dv-ps-in2d-0 I1211 01:02:00.523950 1 replicas.go:182] Service train-dv-ps-in2d-0 already exists. I1211 01:02:00.523997 1 replicas.go:257] Creating Job: train-dv-ps-in2d-0 I1211 01:02:00.526881 1 replicas.go:263] train-dv-ps-in2d-0 already exists. I1211 01:02:00.526902 1 tensorboard.go:72] Creating Service: train-dv-tensorboard-in2d I1211 01:02:00.532439 1 tensorboard.go:78] Service train-dv-tensorboard-in2d already exists. I1211 01:02:00.532464 1 tensorboard.go:101] Creating Deployment: train-dv-tensorboard-in2d I1211 01:02:00.534767 1 tensorboard.go:106] train-dv-tensorboard-in2d already exists. I1211 01:02:00.540357 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127564" }, "items": [ { "metadata": { "name": "train-dv-master-in2d-0-c9k62", "generateName": "train-dv-master-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-master-in2d-0-c9k62", "uid": "b4889545-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127320", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b4873457-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-master-in2d-0", "job_type": "MASTER", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-master-in2d-0\",\"uid\":\"b4873457-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127281\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-master-in2d-0", "uid": "b4873457-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"master\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-v9p6", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:41Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.47", "podIP": "10.24.4.34", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T01:00:40Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://a16a8464b19b88016f36f16016d9870e59eb83ad211f2d697d99701bf8bbb659" } ], "qosClass": "Burstable" } } ] } I1211 01:02:00.545753 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127564" }, "items": [ { "metadata": { "name": "train-dv-worker-in2d-0-764w4", "generateName": "train-dv-worker-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-worker-in2d-0-764w4", "uid": "b48b8d38-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127299", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b48a3b2a-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-worker-in2d-0", "job_type": "WORKER", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-worker-in2d-0\",\"uid\":\"b48a3b2a-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127286\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-worker-in2d-0", "uid": "b48a3b2a-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"worker\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-sgb3", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Pending", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "False", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z", "reason": "ContainersNotReady", "message": "containers with unready status: [tensorflow]" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.46", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "waiting": { "reason": "ContainerCreating" } }, "lastState": {}, "ready": false, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "" } ], "qosClass": "Burstable" } } ] } I1211 01:02:00.550881 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127564" }, "items": [ { "metadata": { "name": "train-dv-ps-in2d-0-ss2mp", "generateName": "train-dv-ps-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-ps-in2d-0-ss2mp", "uid": "b4919634-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127322", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b48fcdb3-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-ps-in2d-0", "job_type": "PS", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-ps-in2d-0\",\"uid\":\"b48fcdb3-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127296\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-ps-in2d-0", "uid": "b48fcdb3-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"ps\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "requests": { "cpu": "100m" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-v9p6", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:41Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.47", "podIP": "10.24.4.35", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T01:00:40Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant@sha256:72d3bd936dfbfbb707e648d7e6f0f8fb4318eb115aad0bfde9b43ff05fef8f19", "containerID": "docker://e2600b49f187783c8f36ed0fb970984416027e873ed77203a2acebef2503bf3c" } ], "qosClass": "Burstable" } } ] } I1211 01:02:00.550933 1 training.go:477] Job train-dv status={ "phase": "Running", "reason": "", "controlPaused": false, "conditions": null, "state": "", "replicaStatuses": [ { "tf_replica_type": "MASTER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "WORKER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "PS", "state": "Running", "ReplicasStates": { "Running": 1 } } ] } I1211 01:02:08.551195 1 replicas.go:176] Creating Service: train-dv-master-in2d-0 I1211 01:02:08.557295 1 replicas.go:182] Service train-dv-master-in2d-0 already exists. I1211 01:02:08.557365 1 replicas.go:257] Creating Job: train-dv-master-in2d-0 I1211 01:02:08.559870 1 replicas.go:263] train-dv-master-in2d-0 already exists. I1211 01:02:08.559892 1 replicas.go:176] Creating Service: train-dv-worker-in2d-0 I1211 01:02:08.564331 1 replicas.go:182] Service train-dv-worker-in2d-0 already exists. I1211 01:02:08.564382 1 replicas.go:257] Creating Job: train-dv-worker-in2d-0 I1211 01:02:08.566817 1 replicas.go:263] train-dv-worker-in2d-0 already exists. I1211 01:02:08.566843 1 replicas.go:176] Creating Service: train-dv-ps-in2d-0 I1211 01:02:08.571364 1 replicas.go:182] Service train-dv-ps-in2d-0 already exists. I1211 01:02:08.571414 1 replicas.go:257] Creating Job: train-dv-ps-in2d-0 I1211 01:02:08.573679 1 replicas.go:263] train-dv-ps-in2d-0 already exists. I1211 01:02:08.573704 1 tensorboard.go:72] Creating Service: train-dv-tensorboard-in2d I1211 01:02:08.579194 1 tensorboard.go:78] Service train-dv-tensorboard-in2d already exists. I1211 01:02:08.579236 1 tensorboard.go:101] Creating Deployment: train-dv-tensorboard-in2d I1211 01:02:08.581736 1 tensorboard.go:106] train-dv-tensorboard-in2d already exists. I1211 01:02:08.587330 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127592" }, "items": [ { "metadata": { "name": "train-dv-master-in2d-0-c9k62", "generateName": "train-dv-master-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-master-in2d-0-c9k62", "uid": "b4889545-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127320", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b4873457-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-master-in2d-0", "job_type": "MASTER", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-master-in2d-0\",\"uid\":\"b4873457-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127281\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-master-in2d-0", "uid": "b4873457-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"master\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-v9p6", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:41Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.47", "podIP": "10.24.4.34", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T01:00:40Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://a16a8464b19b88016f36f16016d9870e59eb83ad211f2d697d99701bf8bbb659" } ], "qosClass": "Burstable" } } ] } I1211 01:02:08.593423 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127592" }, "items": [ { "metadata": { "name": "train-dv-worker-in2d-0-764w4", "generateName": "train-dv-worker-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-worker-in2d-0-764w4", "uid": "b48b8d38-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127566", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b48a3b2a-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-worker-in2d-0", "job_type": "WORKER", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-worker-in2d-0\",\"uid\":\"b48a3b2a-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127286\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-worker-in2d-0", "uid": "b48a3b2a-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"worker\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-sgb3", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:02:00Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.46", "podIP": "10.24.2.20", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T01:02:00Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://7f8c71676ccc724d95eaef9d1c781b7a2bc5a577845b1702840b07a8fd1b9362" } ], "qosClass": "Burstable" } } ] } I1211 01:02:08.599207 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127592" }, "items": [ { "metadata": { "name": "train-dv-ps-in2d-0-ss2mp", "generateName": "train-dv-ps-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-ps-in2d-0-ss2mp", "uid": "b4919634-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127322", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b48fcdb3-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-ps-in2d-0", "job_type": "PS", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-ps-in2d-0\",\"uid\":\"b48fcdb3-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127296\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-ps-in2d-0", "uid": "b48fcdb3-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"ps\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "requests": { "cpu": "100m" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-v9p6", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:41Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.47", "podIP": "10.24.4.35", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T01:00:40Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant@sha256:72d3bd936dfbfbb707e648d7e6f0f8fb4318eb115aad0bfde9b43ff05fef8f19", "containerID": "docker://e2600b49f187783c8f36ed0fb970984416027e873ed77203a2acebef2503bf3c" } ], "qosClass": "Burstable" } } ] } I1211 01:02:08.599282 1 training.go:477] Job train-dv status={ "phase": "Running", "reason": "", "controlPaused": false, "conditions": null, "state": "", "replicaStatuses": [ { "tf_replica_type": "MASTER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "WORKER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "PS", "state": "Running", "ReplicasStates": { "Running": 1 } } ] } I1211 01:02:16.599496 1 replicas.go:176] Creating Service: train-dv-master-in2d-0 I1211 01:02:16.612985 1 replicas.go:182] Service train-dv-master-in2d-0 already exists. I1211 01:02:16.613080 1 replicas.go:257] Creating Job: train-dv-master-in2d-0 I1211 01:02:16.616176 1 replicas.go:263] train-dv-master-in2d-0 already exists. I1211 01:02:16.616206 1 replicas.go:176] Creating Service: train-dv-worker-in2d-0 I1211 01:02:16.621496 1 replicas.go:182] Service train-dv-worker-in2d-0 already exists. I1211 01:02:16.621545 1 replicas.go:257] Creating Job: train-dv-worker-in2d-0 I1211 01:02:16.624493 1 replicas.go:263] train-dv-worker-in2d-0 already exists. I1211 01:02:16.624516 1 replicas.go:176] Creating Service: train-dv-ps-in2d-0 I1211 01:02:16.630308 1 replicas.go:182] Service train-dv-ps-in2d-0 already exists. I1211 01:02:16.630370 1 replicas.go:257] Creating Job: train-dv-ps-in2d-0 I1211 01:02:16.633962 1 replicas.go:263] train-dv-ps-in2d-0 already exists. I1211 01:02:16.633999 1 tensorboard.go:72] Creating Service: train-dv-tensorboard-in2d I1211 01:02:16.641553 1 tensorboard.go:78] Service train-dv-tensorboard-in2d already exists. I1211 01:02:16.641579 1 tensorboard.go:101] Creating Deployment: train-dv-tensorboard-in2d I1211 01:02:16.644436 1 tensorboard.go:106] train-dv-tensorboard-in2d already exists. I1211 01:02:16.651433 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127614" }, "items": [ { "metadata": { "name": "train-dv-master-in2d-0-c9k62", "generateName": "train-dv-master-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-master-in2d-0-c9k62", "uid": "b4889545-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127320", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b4873457-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-master-in2d-0", "job_type": "MASTER", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-master-in2d-0\",\"uid\":\"b4873457-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127281\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-master-in2d-0", "uid": "b4873457-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"master\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-v9p6", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:41Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.47", "podIP": "10.24.4.34", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T01:00:40Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://a16a8464b19b88016f36f16016d9870e59eb83ad211f2d697d99701bf8bbb659" } ], "qosClass": "Burstable" } } ] } I1211 01:02:16.656796 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127614" }, "items": [ { "metadata": { "name": "train-dv-worker-in2d-0-764w4", "generateName": "train-dv-worker-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-worker-in2d-0-764w4", "uid": "b48b8d38-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127566", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b48a3b2a-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-worker-in2d-0", "job_type": "WORKER", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-worker-in2d-0\",\"uid\":\"b48a3b2a-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127286\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-worker-in2d-0", "uid": "b48a3b2a-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"worker\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-sgb3", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:02:00Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.46", "podIP": "10.24.2.20", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T01:02:00Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://7f8c71676ccc724d95eaef9d1c781b7a2bc5a577845b1702840b07a8fd1b9362" } ], "qosClass": "Burstable" } } ] } I1211 01:02:16.662444 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127614" }, "items": [ { "metadata": { "name": "train-dv-ps-in2d-0-ss2mp", "generateName": "train-dv-ps-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-ps-in2d-0-ss2mp", "uid": "b4919634-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127322", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b48fcdb3-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-ps-in2d-0", "job_type": "PS", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-ps-in2d-0\",\"uid\":\"b48fcdb3-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127296\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-ps-in2d-0", "uid": "b48fcdb3-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"ps\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "requests": { "cpu": "100m" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-v9p6", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:41Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.47", "podIP": "10.24.4.35", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T01:00:40Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant@sha256:72d3bd936dfbfbb707e648d7e6f0f8fb4318eb115aad0bfde9b43ff05fef8f19", "containerID": "docker://e2600b49f187783c8f36ed0fb970984416027e873ed77203a2acebef2503bf3c" } ], "qosClass": "Burstable" } } ] } I1211 01:02:16.662492 1 training.go:477] Job train-dv status={ "phase": "Running", "reason": "", "controlPaused": false, "conditions": null, "state": "", "replicaStatuses": [ { "tf_replica_type": "MASTER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "WORKER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "PS", "state": "Running", "ReplicasStates": { "Running": 1 } } ] } I1211 01:02:24.662735 1 replicas.go:176] Creating Service: train-dv-master-in2d-0 I1211 01:02:24.670529 1 replicas.go:182] Service train-dv-master-in2d-0 already exists. I1211 01:02:24.670590 1 replicas.go:257] Creating Job: train-dv-master-in2d-0 I1211 01:02:24.673263 1 replicas.go:263] train-dv-master-in2d-0 already exists. I1211 01:02:24.673286 1 replicas.go:176] Creating Service: train-dv-worker-in2d-0 I1211 01:02:24.677533 1 replicas.go:182] Service train-dv-worker-in2d-0 already exists. I1211 01:02:24.677577 1 replicas.go:257] Creating Job: train-dv-worker-in2d-0 I1211 01:02:24.681067 1 replicas.go:263] train-dv-worker-in2d-0 already exists. I1211 01:02:24.681087 1 replicas.go:176] Creating Service: train-dv-ps-in2d-0 I1211 01:02:24.686380 1 replicas.go:182] Service train-dv-ps-in2d-0 already exists. I1211 01:02:24.686426 1 replicas.go:257] Creating Job: train-dv-ps-in2d-0 I1211 01:02:24.688845 1 replicas.go:263] train-dv-ps-in2d-0 already exists. I1211 01:02:24.688868 1 tensorboard.go:72] Creating Service: train-dv-tensorboard-in2d I1211 01:02:24.694563 1 tensorboard.go:78] Service train-dv-tensorboard-in2d already exists. I1211 01:02:24.694590 1 tensorboard.go:101] Creating Deployment: train-dv-tensorboard-in2d I1211 01:02:24.698663 1 tensorboard.go:106] train-dv-tensorboard-in2d already exists. I1211 01:02:24.707628 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127639" }, "items": [ { "metadata": { "name": "train-dv-master-in2d-0-c9k62", "generateName": "train-dv-master-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-master-in2d-0-c9k62", "uid": "b4889545-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127320", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b4873457-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-master-in2d-0", "job_type": "MASTER", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-master-in2d-0\",\"uid\":\"b4873457-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127281\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-master-in2d-0", "uid": "b4873457-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"master\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-v9p6", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:41Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.47", "podIP": "10.24.4.34", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T01:00:40Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://a16a8464b19b88016f36f16016d9870e59eb83ad211f2d697d99701bf8bbb659" } ], "qosClass": "Burstable" } } ] } I1211 01:02:24.713364 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127639" }, "items": [ { "metadata": { "name": "train-dv-worker-in2d-0-764w4", "generateName": "train-dv-worker-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-worker-in2d-0-764w4", "uid": "b48b8d38-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127566", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b48a3b2a-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-worker-in2d-0", "job_type": "WORKER", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-worker-in2d-0\",\"uid\":\"b48a3b2a-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127286\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-worker-in2d-0", "uid": "b48a3b2a-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"worker\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-sgb3", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:02:00Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.46", "podIP": "10.24.2.20", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T01:02:00Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://7f8c71676ccc724d95eaef9d1c781b7a2bc5a577845b1702840b07a8fd1b9362" } ], "qosClass": "Burstable" } } ] } I1211 01:02:24.718707 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127639" }, "items": [ { "metadata": { "name": "train-dv-ps-in2d-0-ss2mp", "generateName": "train-dv-ps-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-ps-in2d-0-ss2mp", "uid": "b4919634-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127322", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b48fcdb3-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-ps-in2d-0", "job_type": "PS", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-ps-in2d-0\",\"uid\":\"b48fcdb3-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127296\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-ps-in2d-0", "uid": "b48fcdb3-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"ps\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "requests": { "cpu": "100m" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-v9p6", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:41Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.47", "podIP": "10.24.4.35", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T01:00:40Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant@sha256:72d3bd936dfbfbb707e648d7e6f0f8fb4318eb115aad0bfde9b43ff05fef8f19", "containerID": "docker://e2600b49f187783c8f36ed0fb970984416027e873ed77203a2acebef2503bf3c" } ], "qosClass": "Burstable" } } ] } I1211 01:02:24.718759 1 training.go:477] Job train-dv status={ "phase": "Running", "reason": "", "controlPaused": false, "conditions": null, "state": "", "replicaStatuses": [ { "tf_replica_type": "MASTER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "WORKER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "PS", "state": "Running", "ReplicasStates": { "Running": 1 } } ] } I1211 01:02:32.718976 1 replicas.go:176] Creating Service: train-dv-master-in2d-0 I1211 01:02:32.725122 1 replicas.go:182] Service train-dv-master-in2d-0 already exists. I1211 01:02:32.725193 1 replicas.go:257] Creating Job: train-dv-master-in2d-0 I1211 01:02:32.727965 1 replicas.go:263] train-dv-master-in2d-0 already exists. I1211 01:02:32.727989 1 replicas.go:176] Creating Service: train-dv-worker-in2d-0 I1211 01:02:32.735141 1 replicas.go:182] Service train-dv-worker-in2d-0 already exists. I1211 01:02:32.735200 1 replicas.go:257] Creating Job: train-dv-worker-in2d-0 I1211 01:02:32.739741 1 replicas.go:263] train-dv-worker-in2d-0 already exists. I1211 01:02:32.739775 1 replicas.go:176] Creating Service: train-dv-ps-in2d-0 I1211 01:02:32.744023 1 replicas.go:182] Service train-dv-ps-in2d-0 already exists. I1211 01:02:32.744073 1 replicas.go:257] Creating Job: train-dv-ps-in2d-0 I1211 01:02:32.746578 1 replicas.go:263] train-dv-ps-in2d-0 already exists. I1211 01:02:32.746624 1 tensorboard.go:72] Creating Service: train-dv-tensorboard-in2d I1211 01:02:32.752198 1 tensorboard.go:78] Service train-dv-tensorboard-in2d already exists. I1211 01:02:32.752282 1 tensorboard.go:101] Creating Deployment: train-dv-tensorboard-in2d I1211 01:02:32.754886 1 tensorboard.go:106] train-dv-tensorboard-in2d already exists. I1211 01:02:32.760615 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127662" }, "items": [ { "metadata": { "name": "train-dv-master-in2d-0-c9k62", "generateName": "train-dv-master-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-master-in2d-0-c9k62", "uid": "b4889545-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127320", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b4873457-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-master-in2d-0", "job_type": "MASTER", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-master-in2d-0\",\"uid\":\"b4873457-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127281\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-master-in2d-0", "uid": "b4873457-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"master\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-v9p6", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:41Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.47", "podIP": "10.24.4.34", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T01:00:40Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://a16a8464b19b88016f36f16016d9870e59eb83ad211f2d697d99701bf8bbb659" } ], "qosClass": "Burstable" } } ] } I1211 01:02:32.765954 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127662" }, "items": [ { "metadata": { "name": "train-dv-worker-in2d-0-764w4", "generateName": "train-dv-worker-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-worker-in2d-0-764w4", "uid": "b48b8d38-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127566", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b48a3b2a-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-worker-in2d-0", "job_type": "WORKER", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-worker-in2d-0\",\"uid\":\"b48a3b2a-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127286\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-worker-in2d-0", "uid": "b48a3b2a-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"worker\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-sgb3", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:02:00Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.46", "podIP": "10.24.2.20", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T01:02:00Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://7f8c71676ccc724d95eaef9d1c781b7a2bc5a577845b1702840b07a8fd1b9362" } ], "qosClass": "Burstable" } } ] } I1211 01:02:32.771491 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127662" }, "items": [ { "metadata": { "name": "train-dv-ps-in2d-0-ss2mp", "generateName": "train-dv-ps-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-ps-in2d-0-ss2mp", "uid": "b4919634-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127322", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b48fcdb3-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-ps-in2d-0", "job_type": "PS", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-ps-in2d-0\",\"uid\":\"b48fcdb3-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127296\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-ps-in2d-0", "uid": "b48fcdb3-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"ps\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "requests": { "cpu": "100m" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-v9p6", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:41Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.47", "podIP": "10.24.4.35", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T01:00:40Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant@sha256:72d3bd936dfbfbb707e648d7e6f0f8fb4318eb115aad0bfde9b43ff05fef8f19", "containerID": "docker://e2600b49f187783c8f36ed0fb970984416027e873ed77203a2acebef2503bf3c" } ], "qosClass": "Burstable" } } ] } I1211 01:02:32.771537 1 training.go:477] Job train-dv status={ "phase": "Running", "reason": "", "controlPaused": false, "conditions": null, "state": "", "replicaStatuses": [ { "tf_replica_type": "MASTER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "WORKER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "PS", "state": "Running", "ReplicasStates": { "Running": 1 } } ] } I1211 01:02:40.771776 1 replicas.go:176] Creating Service: train-dv-master-in2d-0 I1211 01:02:40.777485 1 replicas.go:182] Service train-dv-master-in2d-0 already exists. I1211 01:02:40.777620 1 replicas.go:257] Creating Job: train-dv-master-in2d-0 I1211 01:02:40.780522 1 replicas.go:263] train-dv-master-in2d-0 already exists. I1211 01:02:40.780547 1 replicas.go:176] Creating Service: train-dv-worker-in2d-0 I1211 01:02:40.784904 1 replicas.go:182] Service train-dv-worker-in2d-0 already exists. I1211 01:02:40.784955 1 replicas.go:257] Creating Job: train-dv-worker-in2d-0 I1211 01:02:40.787532 1 replicas.go:263] train-dv-worker-in2d-0 already exists. I1211 01:02:40.787554 1 replicas.go:176] Creating Service: train-dv-ps-in2d-0 I1211 01:02:40.792039 1 replicas.go:182] Service train-dv-ps-in2d-0 already exists. I1211 01:02:40.792110 1 replicas.go:257] Creating Job: train-dv-ps-in2d-0 I1211 01:02:40.794825 1 replicas.go:263] train-dv-ps-in2d-0 already exists. I1211 01:02:40.794857 1 tensorboard.go:72] Creating Service: train-dv-tensorboard-in2d I1211 01:02:40.799048 1 tensorboard.go:78] Service train-dv-tensorboard-in2d already exists. I1211 01:02:40.799101 1 tensorboard.go:101] Creating Deployment: train-dv-tensorboard-in2d I1211 01:02:40.801844 1 tensorboard.go:106] train-dv-tensorboard-in2d already exists. I1211 01:02:40.809830 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127685" }, "items": [ { "metadata": { "name": "train-dv-master-in2d-0-c9k62", "generateName": "train-dv-master-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-master-in2d-0-c9k62", "uid": "b4889545-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127320", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b4873457-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-master-in2d-0", "job_type": "MASTER", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-master-in2d-0\",\"uid\":\"b4873457-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127281\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-master-in2d-0", "uid": "b4873457-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"master\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-v9p6", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:41Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.47", "podIP": "10.24.4.34", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T01:00:40Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://a16a8464b19b88016f36f16016d9870e59eb83ad211f2d697d99701bf8bbb659" } ], "qosClass": "Burstable" } } ] } I1211 01:02:40.815186 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127685" }, "items": [ { "metadata": { "name": "train-dv-worker-in2d-0-764w4", "generateName": "train-dv-worker-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-worker-in2d-0-764w4", "uid": "b48b8d38-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127566", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b48a3b2a-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-worker-in2d-0", "job_type": "WORKER", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-worker-in2d-0\",\"uid\":\"b48a3b2a-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127286\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-worker-in2d-0", "uid": "b48a3b2a-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"worker\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-sgb3", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:02:00Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.46", "podIP": "10.24.2.20", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T01:02:00Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://7f8c71676ccc724d95eaef9d1c781b7a2bc5a577845b1702840b07a8fd1b9362" } ], "qosClass": "Burstable" } } ] } I1211 01:02:40.822387 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127685" }, "items": [ { "metadata": { "name": "train-dv-ps-in2d-0-ss2mp", "generateName": "train-dv-ps-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-ps-in2d-0-ss2mp", "uid": "b4919634-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127322", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b48fcdb3-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-ps-in2d-0", "job_type": "PS", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-ps-in2d-0\",\"uid\":\"b48fcdb3-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127296\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-ps-in2d-0", "uid": "b48fcdb3-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"ps\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "requests": { "cpu": "100m" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-v9p6", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:41Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.47", "podIP": "10.24.4.35", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T01:00:40Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant@sha256:72d3bd936dfbfbb707e648d7e6f0f8fb4318eb115aad0bfde9b43ff05fef8f19", "containerID": "docker://e2600b49f187783c8f36ed0fb970984416027e873ed77203a2acebef2503bf3c" } ], "qosClass": "Burstable" } } ] } I1211 01:02:40.822435 1 training.go:477] Job train-dv status={ "phase": "Running", "reason": "", "controlPaused": false, "conditions": null, "state": "", "replicaStatuses": [ { "tf_replica_type": "MASTER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "WORKER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "PS", "state": "Running", "ReplicasStates": { "Running": 1 } } ] } I1211 01:02:48.822763 1 replicas.go:176] Creating Service: train-dv-master-in2d-0 I1211 01:02:48.831431 1 replicas.go:182] Service train-dv-master-in2d-0 already exists. I1211 01:02:48.831500 1 replicas.go:257] Creating Job: train-dv-master-in2d-0 I1211 01:02:48.834254 1 replicas.go:263] train-dv-master-in2d-0 already exists. I1211 01:02:48.834276 1 replicas.go:176] Creating Service: train-dv-worker-in2d-0 I1211 01:02:48.844548 1 replicas.go:182] Service train-dv-worker-in2d-0 already exists. I1211 01:02:48.844630 1 replicas.go:257] Creating Job: train-dv-worker-in2d-0 I1211 01:02:48.850398 1 replicas.go:263] train-dv-worker-in2d-0 already exists. I1211 01:02:48.850424 1 replicas.go:176] Creating Service: train-dv-ps-in2d-0 I1211 01:02:48.854763 1 replicas.go:182] Service train-dv-ps-in2d-0 already exists. I1211 01:02:48.854810 1 replicas.go:257] Creating Job: train-dv-ps-in2d-0 I1211 01:02:48.857290 1 replicas.go:263] train-dv-ps-in2d-0 already exists. I1211 01:02:48.857316 1 tensorboard.go:72] Creating Service: train-dv-tensorboard-in2d I1211 01:02:48.861542 1 tensorboard.go:78] Service train-dv-tensorboard-in2d already exists. I1211 01:02:48.861568 1 tensorboard.go:101] Creating Deployment: train-dv-tensorboard-in2d I1211 01:02:48.864371 1 tensorboard.go:106] train-dv-tensorboard-in2d already exists. I1211 01:02:48.870708 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127713" }, "items": [ { "metadata": { "name": "train-dv-master-in2d-0-c9k62", "generateName": "train-dv-master-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-master-in2d-0-c9k62", "uid": "b4889545-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127320", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b4873457-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-master-in2d-0", "job_type": "MASTER", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-master-in2d-0\",\"uid\":\"b4873457-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127281\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-master-in2d-0", "uid": "b4873457-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"master\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-v9p6", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:41Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.47", "podIP": "10.24.4.34", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T01:00:40Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://a16a8464b19b88016f36f16016d9870e59eb83ad211f2d697d99701bf8bbb659" } ], "qosClass": "Burstable" } } ] } I1211 01:02:48.876482 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127713" }, "items": [ { "metadata": { "name": "train-dv-worker-in2d-0-764w4", "generateName": "train-dv-worker-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-worker-in2d-0-764w4", "uid": "b48b8d38-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127566", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b48a3b2a-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-worker-in2d-0", "job_type": "WORKER", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-worker-in2d-0\",\"uid\":\"b48a3b2a-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127286\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-worker-in2d-0", "uid": "b48a3b2a-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"worker\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-sgb3", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:02:00Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.46", "podIP": "10.24.2.20", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T01:02:00Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://7f8c71676ccc724d95eaef9d1c781b7a2bc5a577845b1702840b07a8fd1b9362" } ], "qosClass": "Burstable" } } ] } I1211 01:02:48.881940 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127713" }, "items": [ { "metadata": { "name": "train-dv-ps-in2d-0-ss2mp", "generateName": "train-dv-ps-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-ps-in2d-0-ss2mp", "uid": "b4919634-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127322", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b48fcdb3-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-ps-in2d-0", "job_type": "PS", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-ps-in2d-0\",\"uid\":\"b48fcdb3-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127296\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-ps-in2d-0", "uid": "b48fcdb3-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"ps\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "requests": { "cpu": "100m" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-v9p6", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:41Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.47", "podIP": "10.24.4.35", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T01:00:40Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant@sha256:72d3bd936dfbfbb707e648d7e6f0f8fb4318eb115aad0bfde9b43ff05fef8f19", "containerID": "docker://e2600b49f187783c8f36ed0fb970984416027e873ed77203a2acebef2503bf3c" } ], "qosClass": "Burstable" } } ] } I1211 01:02:48.881988 1 training.go:477] Job train-dv status={ "phase": "Running", "reason": "", "controlPaused": false, "conditions": null, "state": "", "replicaStatuses": [ { "tf_replica_type": "MASTER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "WORKER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "PS", "state": "Running", "ReplicasStates": { "Running": 1 } } ] } I1211 01:02:56.882265 1 replicas.go:176] Creating Service: train-dv-master-in2d-0 I1211 01:02:56.891306 1 replicas.go:182] Service train-dv-master-in2d-0 already exists. I1211 01:02:56.891383 1 replicas.go:257] Creating Job: train-dv-master-in2d-0 I1211 01:02:56.894447 1 replicas.go:263] train-dv-master-in2d-0 already exists. I1211 01:02:56.894470 1 replicas.go:176] Creating Service: train-dv-worker-in2d-0 I1211 01:02:56.904361 1 replicas.go:182] Service train-dv-worker-in2d-0 already exists. I1211 01:02:56.904426 1 replicas.go:257] Creating Job: train-dv-worker-in2d-0 I1211 01:02:56.907852 1 replicas.go:263] train-dv-worker-in2d-0 already exists. I1211 01:02:56.907875 1 replicas.go:176] Creating Service: train-dv-ps-in2d-0 I1211 01:02:56.932126 1 replicas.go:182] Service train-dv-ps-in2d-0 already exists. I1211 01:02:56.932176 1 replicas.go:257] Creating Job: train-dv-ps-in2d-0 I1211 01:02:56.965268 1 replicas.go:263] train-dv-ps-in2d-0 already exists. I1211 01:02:56.965290 1 tensorboard.go:72] Creating Service: train-dv-tensorboard-in2d I1211 01:02:56.989263 1 tensorboard.go:78] Service train-dv-tensorboard-in2d already exists. I1211 01:02:56.989293 1 tensorboard.go:101] Creating Deployment: train-dv-tensorboard-in2d I1211 01:02:56.992884 1 tensorboard.go:106] train-dv-tensorboard-in2d already exists. I1211 01:02:56.999299 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127735" }, "items": [ { "metadata": { "name": "train-dv-master-in2d-0-c9k62", "generateName": "train-dv-master-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-master-in2d-0-c9k62", "uid": "b4889545-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127320", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b4873457-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-master-in2d-0", "job_type": "MASTER", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-master-in2d-0\",\"uid\":\"b4873457-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127281\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-master-in2d-0", "uid": "b4873457-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"master\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-v9p6", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:41Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.47", "podIP": "10.24.4.34", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T01:00:40Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://a16a8464b19b88016f36f16016d9870e59eb83ad211f2d697d99701bf8bbb659" } ], "qosClass": "Burstable" } } ] } I1211 01:02:57.006589 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127735" }, "items": [ { "metadata": { "name": "train-dv-worker-in2d-0-764w4", "generateName": "train-dv-worker-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-worker-in2d-0-764w4", "uid": "b48b8d38-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127566", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b48a3b2a-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-worker-in2d-0", "job_type": "WORKER", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-worker-in2d-0\",\"uid\":\"b48a3b2a-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127286\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-worker-in2d-0", "uid": "b48a3b2a-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"worker\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-sgb3", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:02:00Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.46", "podIP": "10.24.2.20", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T01:02:00Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://7f8c71676ccc724d95eaef9d1c781b7a2bc5a577845b1702840b07a8fd1b9362" } ], "qosClass": "Burstable" } } ] } I1211 01:02:57.011880 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127735" }, "items": [ { "metadata": { "name": "train-dv-ps-in2d-0-ss2mp", "generateName": "train-dv-ps-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-ps-in2d-0-ss2mp", "uid": "b4919634-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127322", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b48fcdb3-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-ps-in2d-0", "job_type": "PS", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-ps-in2d-0\",\"uid\":\"b48fcdb3-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127296\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-ps-in2d-0", "uid": "b48fcdb3-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"ps\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "requests": { "cpu": "100m" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-v9p6", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:41Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.47", "podIP": "10.24.4.35", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T01:00:40Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant@sha256:72d3bd936dfbfbb707e648d7e6f0f8fb4318eb115aad0bfde9b43ff05fef8f19", "containerID": "docker://e2600b49f187783c8f36ed0fb970984416027e873ed77203a2acebef2503bf3c" } ], "qosClass": "Burstable" } } ] } I1211 01:02:57.011939 1 training.go:477] Job train-dv status={ "phase": "Running", "reason": "", "controlPaused": false, "conditions": null, "state": "", "replicaStatuses": [ { "tf_replica_type": "MASTER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "WORKER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "PS", "state": "Running", "ReplicasStates": { "Running": 1 } } ] } I1211 01:03:05.012170 1 replicas.go:176] Creating Service: train-dv-master-in2d-0 I1211 01:03:05.018276 1 replicas.go:182] Service train-dv-master-in2d-0 already exists. I1211 01:03:05.018349 1 replicas.go:257] Creating Job: train-dv-master-in2d-0 I1211 01:03:05.021314 1 replicas.go:263] train-dv-master-in2d-0 already exists. I1211 01:03:05.021346 1 replicas.go:176] Creating Service: train-dv-worker-in2d-0 I1211 01:03:05.034381 1 replicas.go:182] Service train-dv-worker-in2d-0 already exists. I1211 01:03:05.034425 1 replicas.go:257] Creating Job: train-dv-worker-in2d-0 I1211 01:03:05.039980 1 replicas.go:263] train-dv-worker-in2d-0 already exists. I1211 01:03:05.040004 1 replicas.go:176] Creating Service: train-dv-ps-in2d-0 I1211 01:03:05.045490 1 replicas.go:182] Service train-dv-ps-in2d-0 already exists. I1211 01:03:05.045539 1 replicas.go:257] Creating Job: train-dv-ps-in2d-0 I1211 01:03:05.047782 1 replicas.go:263] train-dv-ps-in2d-0 already exists. I1211 01:03:05.047813 1 tensorboard.go:72] Creating Service: train-dv-tensorboard-in2d I1211 01:03:05.052203 1 tensorboard.go:78] Service train-dv-tensorboard-in2d already exists. I1211 01:03:05.052246 1 tensorboard.go:101] Creating Deployment: train-dv-tensorboard-in2d I1211 01:03:05.054956 1 tensorboard.go:106] train-dv-tensorboard-in2d already exists. I1211 01:03:05.060621 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127756" }, "items": [ { "metadata": { "name": "train-dv-master-in2d-0-c9k62", "generateName": "train-dv-master-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-master-in2d-0-c9k62", "uid": "b4889545-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127320", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b4873457-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-master-in2d-0", "job_type": "MASTER", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-master-in2d-0\",\"uid\":\"b4873457-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127281\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-master-in2d-0", "uid": "b4873457-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"master\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-v9p6", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:41Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.47", "podIP": "10.24.4.34", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T01:00:40Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://a16a8464b19b88016f36f16016d9870e59eb83ad211f2d697d99701bf8bbb659" } ], "qosClass": "Burstable" } } ] } I1211 01:03:05.066521 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127756" }, "items": [ { "metadata": { "name": "train-dv-worker-in2d-0-764w4", "generateName": "train-dv-worker-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-worker-in2d-0-764w4", "uid": "b48b8d38-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127566", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b48a3b2a-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-worker-in2d-0", "job_type": "WORKER", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-worker-in2d-0\",\"uid\":\"b48a3b2a-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127286\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-worker-in2d-0", "uid": "b48a3b2a-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"worker\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-sgb3", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:02:00Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.46", "podIP": "10.24.2.20", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T01:02:00Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://7f8c71676ccc724d95eaef9d1c781b7a2bc5a577845b1702840b07a8fd1b9362" } ], "qosClass": "Burstable" } } ] } I1211 01:03:05.072785 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127756" }, "items": [ { "metadata": { "name": "train-dv-ps-in2d-0-ss2mp", "generateName": "train-dv-ps-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-ps-in2d-0-ss2mp", "uid": "b4919634-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127322", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b48fcdb3-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-ps-in2d-0", "job_type": "PS", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-ps-in2d-0\",\"uid\":\"b48fcdb3-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127296\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-ps-in2d-0", "uid": "b48fcdb3-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"ps\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "requests": { "cpu": "100m" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-v9p6", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:41Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.47", "podIP": "10.24.4.35", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T01:00:40Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant@sha256:72d3bd936dfbfbb707e648d7e6f0f8fb4318eb115aad0bfde9b43ff05fef8f19", "containerID": "docker://e2600b49f187783c8f36ed0fb970984416027e873ed77203a2acebef2503bf3c" } ], "qosClass": "Burstable" } } ] } I1211 01:03:05.072857 1 training.go:477] Job train-dv status={ "phase": "Running", "reason": "", "controlPaused": false, "conditions": null, "state": "", "replicaStatuses": [ { "tf_replica_type": "MASTER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "WORKER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "PS", "state": "Running", "ReplicasStates": { "Running": 1 } } ] } I1211 01:03:12.778368 1 controller.go:319] apiserver closed stream I1211 01:03:12.779689 1 controller.go:312] start watching at 3127316 I1211 01:03:13.073099 1 replicas.go:176] Creating Service: train-dv-master-in2d-0 I1211 01:03:13.080536 1 replicas.go:182] Service train-dv-master-in2d-0 already exists. I1211 01:03:13.080607 1 replicas.go:257] Creating Job: train-dv-master-in2d-0 I1211 01:03:13.084724 1 replicas.go:263] train-dv-master-in2d-0 already exists. I1211 01:03:13.084750 1 replicas.go:176] Creating Service: train-dv-worker-in2d-0 I1211 01:03:13.091263 1 replicas.go:182] Service train-dv-worker-in2d-0 already exists. I1211 01:03:13.091313 1 replicas.go:257] Creating Job: train-dv-worker-in2d-0 I1211 01:03:13.093927 1 replicas.go:263] train-dv-worker-in2d-0 already exists. I1211 01:03:13.093953 1 replicas.go:176] Creating Service: train-dv-ps-in2d-0 I1211 01:03:13.101104 1 replicas.go:182] Service train-dv-ps-in2d-0 already exists. I1211 01:03:13.101156 1 replicas.go:257] Creating Job: train-dv-ps-in2d-0 I1211 01:03:13.103937 1 replicas.go:263] train-dv-ps-in2d-0 already exists. I1211 01:03:13.103962 1 tensorboard.go:72] Creating Service: train-dv-tensorboard-in2d I1211 01:03:13.111317 1 tensorboard.go:78] Service train-dv-tensorboard-in2d already exists. I1211 01:03:13.111347 1 tensorboard.go:101] Creating Deployment: train-dv-tensorboard-in2d I1211 01:03:13.114460 1 tensorboard.go:106] train-dv-tensorboard-in2d already exists. I1211 01:03:13.120561 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127782" }, "items": [ { "metadata": { "name": "train-dv-master-in2d-0-c9k62", "generateName": "train-dv-master-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-master-in2d-0-c9k62", "uid": "b4889545-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127320", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b4873457-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-master-in2d-0", "job_type": "MASTER", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-master-in2d-0\",\"uid\":\"b4873457-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127281\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-master-in2d-0", "uid": "b4873457-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"master\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-v9p6", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:41Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.47", "podIP": "10.24.4.34", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T01:00:40Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://a16a8464b19b88016f36f16016d9870e59eb83ad211f2d697d99701bf8bbb659" } ], "qosClass": "Burstable" } } ] } I1211 01:03:13.126770 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127782" }, "items": [ { "metadata": { "name": "train-dv-worker-in2d-0-764w4", "generateName": "train-dv-worker-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-worker-in2d-0-764w4", "uid": "b48b8d38-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127566", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b48a3b2a-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-worker-in2d-0", "job_type": "WORKER", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-worker-in2d-0\",\"uid\":\"b48a3b2a-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127286\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-worker-in2d-0", "uid": "b48a3b2a-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"worker\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-sgb3", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:02:00Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.46", "podIP": "10.24.2.20", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T01:02:00Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://7f8c71676ccc724d95eaef9d1c781b7a2bc5a577845b1702840b07a8fd1b9362" } ], "qosClass": "Burstable" } } ] } I1211 01:03:13.133241 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127782" }, "items": [ { "metadata": { "name": "train-dv-ps-in2d-0-ss2mp", "generateName": "train-dv-ps-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-ps-in2d-0-ss2mp", "uid": "b4919634-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127322", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b48fcdb3-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-ps-in2d-0", "job_type": "PS", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-ps-in2d-0\",\"uid\":\"b48fcdb3-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127296\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-ps-in2d-0", "uid": "b48fcdb3-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"ps\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "requests": { "cpu": "100m" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-v9p6", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:41Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.47", "podIP": "10.24.4.35", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T01:00:40Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant@sha256:72d3bd936dfbfbb707e648d7e6f0f8fb4318eb115aad0bfde9b43ff05fef8f19", "containerID": "docker://e2600b49f187783c8f36ed0fb970984416027e873ed77203a2acebef2503bf3c" } ], "qosClass": "Burstable" } } ] } I1211 01:03:13.133298 1 training.go:477] Job train-dv status={ "phase": "Running", "reason": "", "controlPaused": false, "conditions": null, "state": "", "replicaStatuses": [ { "tf_replica_type": "MASTER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "WORKER", "state": "Running", "ReplicasStates": { "Running": 1 } }, { "tf_replica_type": "PS", "state": "Running", "ReplicasStates": { "Running": 1 } } ] } I1211 01:03:21.133493 1 replicas.go:176] Creating Service: train-dv-master-in2d-0 I1211 01:03:21.140031 1 replicas.go:182] Service train-dv-master-in2d-0 already exists. I1211 01:03:21.140092 1 replicas.go:257] Creating Job: train-dv-master-in2d-0 I1211 01:03:21.142909 1 replicas.go:263] train-dv-master-in2d-0 already exists. I1211 01:03:21.142934 1 replicas.go:176] Creating Service: train-dv-worker-in2d-0 I1211 01:03:21.147459 1 replicas.go:182] Service train-dv-worker-in2d-0 already exists. I1211 01:03:21.147513 1 replicas.go:257] Creating Job: train-dv-worker-in2d-0 I1211 01:03:21.150103 1 replicas.go:263] train-dv-worker-in2d-0 already exists. I1211 01:03:21.150136 1 replicas.go:176] Creating Service: train-dv-ps-in2d-0 I1211 01:03:21.154476 1 replicas.go:182] Service train-dv-ps-in2d-0 already exists. I1211 01:03:21.154532 1 replicas.go:257] Creating Job: train-dv-ps-in2d-0 I1211 01:03:21.156923 1 replicas.go:263] train-dv-ps-in2d-0 already exists. I1211 01:03:21.156944 1 tensorboard.go:72] Creating Service: train-dv-tensorboard-in2d I1211 01:03:21.161697 1 tensorboard.go:78] Service train-dv-tensorboard-in2d already exists. I1211 01:03:21.161737 1 tensorboard.go:101] Creating Deployment: train-dv-tensorboard-in2d I1211 01:03:21.164320 1 tensorboard.go:106] train-dv-tensorboard-in2d already exists. I1211 01:03:21.170971 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127816" }, "items": [ { "metadata": { "name": "train-dv-master-in2d-0-c9k62", "generateName": "train-dv-master-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-master-in2d-0-c9k62", "uid": "b4889545-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127801", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b4873457-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-master-in2d-0", "job_type": "MASTER", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-master-in2d-0\",\"uid\":\"b4873457-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127281\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-master-in2d-0", "uid": "b4873457-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"master\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-v9p6", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:03:17Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.47", "podIP": "10.24.4.34", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T01:03:16Z" } }, "lastState": { "terminated": { "exitCode": 1, "reason": "Error", "startedAt": "2017-12-11T01:00:40Z", "finishedAt": "2017-12-11T01:03:15Z", "containerID": "docker://a16a8464b19b88016f36f16016d9870e59eb83ad211f2d697d99701bf8bbb659" } }, "ready": true, "restartCount": 1, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://e8ba3215d078594e12a0494fc6151102eb39e8c874726e3414d34b1f2aa8f71c" } ], "qosClass": "Burstable" } } ] } I1211 01:03:21.176407 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127816" }, "items": [ { "metadata": { "name": "train-dv-worker-in2d-0-764w4", "generateName": "train-dv-worker-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-worker-in2d-0-764w4", "uid": "b48b8d38-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127792", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b48a3b2a-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-worker-in2d-0", "job_type": "WORKER", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-worker-in2d-0\",\"uid\":\"b48a3b2a-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127286\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-worker-in2d-0", "uid": "b48a3b2a-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"worker\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "limits": { "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", "nvidia.com/gpu": "1" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-sgb3", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:03:16Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.46", "podIP": "10.24.2.20", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T01:03:15Z" } }, "lastState": { "terminated": { "exitCode": 1, "reason": "Error", "startedAt": "2017-12-11T01:02:00Z", "finishedAt": "2017-12-11T01:03:14Z", "containerID": "docker://7f8c71676ccc724d95eaef9d1c781b7a2bc5a577845b1702840b07a8fd1b9362" } }, "ready": true, "restartCount": 1, "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant_gpu@sha256:3c3aea3870a526df02f09cd294735ca59242dc74fba7c28f2f624e4d048c3db6", "containerID": "docker://c40de8b120faf6593735b5e83c00f25cb79d05e3fa0cd748d9618efab2481716" } ], "qosClass": "Burstable" } } ] } I1211 01:03:21.181686 1 replicas.go:360] Get replicaStatus from PodList: { "metadata": { "selfLink": "/api/v1/namespaces/default/pods", "resourceVersion": "3127816" }, "items": [ { "metadata": { "name": "train-dv-ps-in2d-0-ss2mp", "generateName": "train-dv-ps-in2d-0-", "namespace": "default", "selfLink": "/api/v1/namespaces/default/pods/train-dv-ps-in2d-0-ss2mp", "uid": "b4919634-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127322", "creationTimestamp": "2017-12-11T01:00:39Z", "labels": { "controller-uid": "b48fcdb3-de0e-11e7-b6b9-42010af0014d", "job-name": "train-dv-ps-in2d-0", "job_type": "PS", "runtime_id": "in2d", "task_index": "0", "tensorflow.org": "", "tf_job_name": "train-dv" }, "annotations": { "kubernetes.io/created-by": "{\"kind\":\"SerializedReference\",\"apiVersion\":\"v1\",\"reference\":{\"kind\":\"Job\",\"namespace\":\"default\",\"name\":\"train-dv-ps-in2d-0\",\"uid\":\"b48fcdb3-de0e-11e7-b6b9-42010af0014d\",\"apiVersion\":\"batch\",\"resourceVersion\":\"3127296\"}}\n", "kubernetes.io/limit-ranger": "LimitRanger plugin set: cpu request for container tensorflow" }, "ownerReferences": [ { "apiVersion": "batch/v1", "kind": "Job", "name": "train-dv-ps-in2d-0", "uid": "b48fcdb3-de0e-11e7-b6b9-42010af0014d", "controller": true, "blockOwnerDeletion": true } ] }, "spec": { "volumes": [ { "name": "default-token-r22zm", "secret": { "secretName": "default-token-r22zm", "defaultMode": 420 } } ], "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "env": [ { "name": "TF_CONFIG", "value": "{\"cluster\":{\"master\":[\"train-dv-master-in2d-0:2222\"],\"ps\":[\"train-dv-ps-in2d-0:2222\"],\"worker\":[\"train-dv-worker-in2d-0:2222\"]},\"task\":{\"type\":\"ps\",\"index\":0},\"environment\":\"cloud\"}" } ], "resources": { "requests": { "cpu": "100m" } }, "volumeMounts": [ { "name": "default-token-r22zm", "readOnly": true, "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount" } ], "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent" } ], "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "serviceAccountName": "default", "serviceAccount": "default", "nodeName": "gke-gke-tf-example-default-pool-942f51a8-v9p6", "securityContext": {}, "schedulerName": "default-scheduler", "tolerations": [ { "key": "node.alpha.kubernetes.io/notReady", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 }, { "key": "node.alpha.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300 } ] }, "status": { "phase": "Running", "conditions": [ { "type": "Initialized", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" }, { "type": "Ready", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:41Z" }, { "type": "PodScheduled", "status": "True", "lastProbeTime": null, "lastTransitionTime": "2017-12-11T01:00:39Z" } ], "hostIP": "10.240.0.47", "podIP": "10.24.4.35", "startTime": "2017-12-11T01:00:39Z", "containerStatuses": [ { "name": "tensorflow", "state": { "running": { "startedAt": "2017-12-11T01:00:40Z" } }, "lastState": {}, "ready": true, "restartCount": 0, "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "imageID": "docker-pullable://gcr.io/deepvariant-docker/deepvariant@sha256:72d3bd936dfbfbb707e648d7e6f0f8fb4318eb115aad0bfde9b43ff05fef8f19", "containerID": "docker://e2600b49f187783c8f36ed0fb970984416027e873ed77203a2acebef2503bf3c" } ], "qosClass": "Burstable" } } ] } E1211 01:03:21.181710 1 training.go:469] Master failed Job: train-dv. I1211 01:03:21.188824 1 controller.go:349] event: MODIFIED { "RuntimeId": "in2d", "tensorboard": { "logDir": "gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "volumes": null, "volumeMounts": null, "serviceType": "" }, "replicaSpecs": [ { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": { "limits": { "nvidia.com/gpu": "1" } } } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "MASTER", "IsDefaultPS": false }, { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": { "limits": { "nvidia.com/gpu": "1" } } } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "WORKER", "IsDefaultPS": false }, { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": {} } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "PS", "IsDefaultPS": false } ], "tfImage": "tensorflow/tensorflow:1.3.0" } I1211 01:03:21.188941 1 controller.go:350] TfJob event: MODIFIED { "RuntimeId": "in2d", "tensorboard": { "logDir": "gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "volumes": null, "volumeMounts": null, "serviceType": "" }, "replicaSpecs": [ { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": { "limits": { "nvidia.com/gpu": "1" } } } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "MASTER", "IsDefaultPS": false }, { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": { "limits": { "nvidia.com/gpu": "1" } } } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "WORKER", "IsDefaultPS": false }, { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": {} } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "PS", "IsDefaultPS": false } ], "tfImage": "tensorflow/tensorflow:1.3.0" } W1211 01:03:21.189131 1 controller.go:115] fail to handle event: { "Type": "MODIFIED", "Object": { "kind": "TfJob", "apiVersion": "tensorflow.org/v1alpha1", "metadata": { "name": "train-dv", "namespace": "default", "selfLink": "/apis/tensorflow.org/v1alpha1/namespaces/default/tfjobs/train-dv", "uid": "afb9cfd2-de0e-11e7-b6b9-42010af0014d", "resourceVersion": "3127817", "creationTimestamp": "2017-12-11T01:00:31Z" }, "spec": { "RuntimeId": "in2d", "tensorboard": { "logDir": "gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "volumes": null, "volumeMounts": null, "serviceType": "" }, "replicaSpecs": [ { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": { "limits": { "nvidia.com/gpu": "1" } } } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "MASTER", "IsDefaultPS": false }, { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": { "limits": { "nvidia.com/gpu": "1" } } } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "WORKER", "IsDefaultPS": false }, { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": {} } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "PS", "IsDefaultPS": false } ], "tfImage": "tensorflow/tensorflow:1.3.0" }, "status": { "phase": "Done", "reason": "", "controlPaused": false, "conditions": null, "state": "Failed", "replicaStatuses": [ { "tf_replica_type": "MASTER", "state": "Failed", "ReplicasStates": { "Failed": 1 } }, { "tf_replica_type": "WORKER", "state": "Failed", "ReplicasStates": { "Failed": 1 } }, { "tf_replica_type": "PS", "state": "Running", "ReplicasStates": { "Running": 1 } } ] } } }, error ignore failed TfJob (train-dv). Please delete its CRD I1211 01:04:24.505208 1 controller.go:319] apiserver closed stream I1211 01:04:24.506367 1 controller.go:312] start watching at 3127817 I1211 01:05:37.430120 1 controller.go:319] apiserver closed stream I1211 01:05:37.433381 1 controller.go:312] start watching at 3127817 I1211 01:05:37.942347 1 controller.go:173] finding existing jobs... I1211 01:05:37.944861 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 01:05:37.944878 1 controller.go:89] Starting watch at version %v3128096 I1211 01:05:37.944881 1 controller.go:98] starts running from watch version: 3128096 I1211 01:05:37.945796 1 controller.go:312] start watching at 3128096 I1211 01:07:25.286053 1 controller.go:319] apiserver closed stream I1211 01:07:25.288126 1 controller.go:312] start watching at 3128096 I1211 01:08:34.141373 1 controller.go:319] apiserver closed stream I1211 01:08:34.143476 1 controller.go:312] start watching at 3128096 I1211 01:09:51.309763 1 controller.go:319] apiserver closed stream I1211 01:09:51.313408 1 controller.go:312] start watching at 3128096 I1211 01:09:51.821796 1 controller.go:173] finding existing jobs... I1211 01:09:51.824366 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 01:09:51.824383 1 controller.go:89] Starting watch at version %v3128598 I1211 01:09:51.824387 1 controller.go:98] starts running from watch version: 3128598 I1211 01:09:51.825335 1 controller.go:312] start watching at 3128598 I1211 01:11:17.836186 1 controller.go:319] apiserver closed stream I1211 01:11:17.837346 1 controller.go:312] start watching at 3128598 I1211 01:12:18.577328 1 controller.go:319] apiserver closed stream I1211 01:12:18.578242 1 controller.go:312] start watching at 3128598 I1211 01:13:46.960458 1 controller.go:319] apiserver closed stream I1211 01:13:46.962878 1 controller.go:312] start watching at 3128598 I1211 01:15:06.558755 1 controller.go:319] apiserver closed stream I1211 01:15:06.562283 1 controller.go:312] start watching at 3128598 I1211 01:15:07.072032 1 controller.go:173] finding existing jobs... I1211 01:15:07.074206 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 01:15:07.074242 1 controller.go:89] Starting watch at version %v3129210 I1211 01:15:07.074245 1 controller.go:98] starts running from watch version: 3129210 I1211 01:15:07.075305 1 controller.go:312] start watching at 3129210 I1211 01:16:56.362461 1 controller.go:319] apiserver closed stream I1211 01:16:56.364724 1 controller.go:312] start watching at 3129210 I1211 01:18:27.186327 1 controller.go:319] apiserver closed stream I1211 01:18:27.187623 1 controller.go:312] start watching at 3129210 I1211 01:19:30.608482 1 controller.go:319] apiserver closed stream I1211 01:19:30.610899 1 controller.go:312] start watching at 3129210 I1211 01:21:15.670051 1 controller.go:319] apiserver closed stream I1211 01:21:15.674341 1 controller.go:312] start watching at 3129210 I1211 01:21:16.186925 1 controller.go:173] finding existing jobs... I1211 01:21:16.189676 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 01:21:16.189697 1 controller.go:89] Starting watch at version %v3129954 I1211 01:21:16.189700 1 controller.go:98] starts running from watch version: 3129954 I1211 01:21:16.190643 1 controller.go:312] start watching at 3129954 I1211 01:22:16.940479 1 controller.go:319] apiserver closed stream I1211 01:22:16.941562 1 controller.go:312] start watching at 3129954 I1211 01:23:29.597582 1 controller.go:319] apiserver closed stream I1211 01:23:29.598709 1 controller.go:312] start watching at 3129954 I1211 01:25:14.092189 1 controller.go:319] apiserver closed stream I1211 01:25:14.099285 1 controller.go:312] start watching at 3129954 I1211 01:25:14.615262 1 controller.go:173] finding existing jobs... I1211 01:25:14.618035 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 01:25:14.618054 1 controller.go:89] Starting watch at version %v3130423 I1211 01:25:14.618058 1 controller.go:98] starts running from watch version: 3130423 I1211 01:25:14.618990 1 controller.go:312] start watching at 3130423 I1211 01:27:14.337132 1 controller.go:319] apiserver closed stream I1211 01:27:14.338246 1 controller.go:312] start watching at 3130423 I1211 01:28:37.026182 1 controller.go:319] apiserver closed stream I1211 01:28:37.027454 1 controller.go:312] start watching at 3130423 I1211 01:29:57.682967 1 controller.go:319] apiserver closed stream I1211 01:29:57.686945 1 controller.go:312] start watching at 3130423 I1211 01:29:58.195789 1 controller.go:173] finding existing jobs... I1211 01:29:58.198008 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 01:29:58.198040 1 controller.go:89] Starting watch at version %v3130975 I1211 01:29:58.198043 1 controller.go:98] starts running from watch version: 3130975 I1211 01:29:58.198964 1 controller.go:312] start watching at 3130975 I1211 01:31:55.879413 1 controller.go:319] apiserver closed stream I1211 01:31:55.880395 1 controller.go:312] start watching at 3130975 I1211 01:33:18.505402 1 controller.go:319] apiserver closed stream I1211 01:33:18.506813 1 controller.go:312] start watching at 3130975 I1211 01:34:38.710787 1 controller.go:319] apiserver closed stream I1211 01:34:38.712941 1 controller.go:312] start watching at 3130975 I1211 01:35:40.980190 1 controller.go:319] apiserver closed stream I1211 01:35:40.984189 1 controller.go:312] start watching at 3130975 I1211 01:35:41.492649 1 controller.go:173] finding existing jobs... I1211 01:35:41.494673 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 01:35:41.494706 1 controller.go:89] Starting watch at version %v3131645 I1211 01:35:41.494709 1 controller.go:98] starts running from watch version: 3131645 I1211 01:35:41.495594 1 controller.go:312] start watching at 3131645 I1211 01:37:36.620044 1 controller.go:319] apiserver closed stream I1211 01:37:36.622706 1 controller.go:312] start watching at 3131645 I1211 01:38:58.448947 1 controller.go:319] apiserver closed stream I1211 01:38:58.451535 1 controller.go:312] start watching at 3131645 I1211 01:40:46.785891 1 controller.go:319] apiserver closed stream I1211 01:40:46.790481 1 controller.go:312] start watching at 3131645 I1211 01:40:47.319200 1 controller.go:173] finding existing jobs... I1211 01:40:47.321507 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 01:40:47.321542 1 controller.go:89] Starting watch at version %v3132236 I1211 01:40:47.321546 1 controller.go:98] starts running from watch version: 3132236 I1211 01:40:47.322448 1 controller.go:312] start watching at 3132236 I1211 01:42:30.664067 1 controller.go:319] apiserver closed stream I1211 01:42:30.665445 1 controller.go:312] start watching at 3132236 I1211 01:44:08.149262 1 controller.go:319] apiserver closed stream I1211 01:44:08.151681 1 controller.go:312] start watching at 3132236 I1211 01:45:43.140396 1 controller.go:319] apiserver closed stream I1211 01:45:43.145766 1 controller.go:312] start watching at 3132236 I1211 01:45:43.672926 1 controller.go:173] finding existing jobs... I1211 01:45:43.676349 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 01:45:43.676368 1 controller.go:89] Starting watch at version %v3132814 I1211 01:45:43.676372 1 controller.go:98] starts running from watch version: 3132814 I1211 01:45:43.677292 1 controller.go:312] start watching at 3132814 I1211 01:46:54.207933 1 controller.go:319] apiserver closed stream I1211 01:46:54.210769 1 controller.go:312] start watching at 3132814 I1211 01:48:46.486034 1 controller.go:319] apiserver closed stream I1211 01:48:46.487385 1 controller.go:312] start watching at 3132814 I1211 01:49:50.801264 1 controller.go:319] apiserver closed stream I1211 01:49:50.802451 1 controller.go:312] start watching at 3132814 I1211 01:51:42.831513 1 controller.go:319] apiserver closed stream I1211 01:51:42.836348 1 controller.go:312] start watching at 3132814 I1211 01:51:43.344666 1 controller.go:173] finding existing jobs... I1211 01:51:43.347092 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 01:51:43.347111 1 controller.go:89] Starting watch at version %v3133512 I1211 01:51:43.347114 1 controller.go:98] starts running from watch version: 3133512 I1211 01:51:43.347973 1 controller.go:312] start watching at 3133512 I1211 01:53:18.143678 1 controller.go:319] apiserver closed stream I1211 01:53:18.146010 1 controller.go:312] start watching at 3133512 I1211 01:55:03.406847 1 controller.go:319] apiserver closed stream I1211 01:55:03.410547 1 controller.go:312] start watching at 3133512 I1211 01:55:03.919023 1 controller.go:173] finding existing jobs... I1211 01:55:03.921964 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 01:55:03.921992 1 controller.go:89] Starting watch at version %v3133909 I1211 01:55:03.921995 1 controller.go:98] starts running from watch version: 3133909 I1211 01:55:03.923103 1 controller.go:312] start watching at 3133909 I1211 01:57:02.972279 1 controller.go:319] apiserver closed stream I1211 01:57:02.973749 1 controller.go:312] start watching at 3133909 I1211 01:58:52.496767 1 controller.go:319] apiserver closed stream I1211 01:58:52.498020 1 controller.go:312] start watching at 3133909 I1211 02:00:52.104595 1 controller.go:319] apiserver closed stream I1211 02:00:52.108601 1 controller.go:312] start watching at 3133909 I1211 02:00:52.617268 1 controller.go:173] finding existing jobs... I1211 02:00:52.619849 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 02:00:52.619881 1 controller.go:89] Starting watch at version %v3134590 I1211 02:00:52.619885 1 controller.go:98] starts running from watch version: 3134590 I1211 02:00:52.620779 1 controller.go:312] start watching at 3134590 I1211 02:02:47.334797 1 controller.go:319] apiserver closed stream I1211 02:02:47.336082 1 controller.go:312] start watching at 3134590 I1211 02:03:55.532346 1 controller.go:319] apiserver closed stream I1211 02:03:55.534921 1 controller.go:312] start watching at 3134590 I1211 02:04:58.831721 1 controller.go:319] apiserver closed stream I1211 02:04:58.835547 1 controller.go:312] start watching at 3134590 I1211 02:04:59.350604 1 controller.go:173] finding existing jobs... I1211 02:04:59.352893 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 02:04:59.352909 1 controller.go:89] Starting watch at version %v3135074 I1211 02:04:59.352913 1 controller.go:98] starts running from watch version: 3135074 I1211 02:04:59.353763 1 controller.go:312] start watching at 3135074 I1211 02:06:38.012048 1 controller.go:319] apiserver closed stream I1211 02:06:38.013109 1 controller.go:312] start watching at 3135074 I1211 02:08:26.999784 1 controller.go:319] apiserver closed stream I1211 02:08:27.000969 1 controller.go:312] start watching at 3135074 I1211 02:09:37.579628 1 controller.go:319] apiserver closed stream I1211 02:09:37.582728 1 controller.go:312] start watching at 3135074 I1211 02:11:30.157687 1 controller.go:319] apiserver closed stream I1211 02:11:30.161539 1 controller.go:312] start watching at 3135074 I1211 02:11:30.669945 1 controller.go:173] finding existing jobs... I1211 02:11:30.672260 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 02:11:30.672272 1 controller.go:89] Starting watch at version %v3135832 I1211 02:11:30.672275 1 controller.go:98] starts running from watch version: 3135832 I1211 02:11:30.673279 1 controller.go:312] start watching at 3135832 I1211 02:13:13.639957 1 controller.go:319] apiserver closed stream I1211 02:13:13.641883 1 controller.go:312] start watching at 3135832 I1211 02:14:28.197606 1 controller.go:319] apiserver closed stream I1211 02:14:28.198822 1 controller.go:312] start watching at 3135832 I1211 02:16:06.888469 1 controller.go:319] apiserver closed stream I1211 02:16:06.892350 1 controller.go:312] start watching at 3135832 I1211 02:16:07.402155 1 controller.go:173] finding existing jobs... I1211 02:16:07.404444 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 02:16:07.404462 1 controller.go:89] Starting watch at version %v3136371 I1211 02:16:07.404465 1 controller.go:98] starts running from watch version: 3136371 I1211 02:16:07.405464 1 controller.go:312] start watching at 3136371 I1211 02:18:03.967366 1 controller.go:319] apiserver closed stream I1211 02:18:03.968620 1 controller.go:312] start watching at 3136371 I1211 02:19:21.494499 1 controller.go:319] apiserver closed stream I1211 02:19:21.495637 1 controller.go:312] start watching at 3136371 I1211 02:20:37.845805 1 controller.go:319] apiserver closed stream I1211 02:20:37.849700 1 controller.go:312] start watching at 3136371 I1211 02:20:38.358026 1 controller.go:173] finding existing jobs... I1211 02:20:38.361668 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 02:20:38.361685 1 controller.go:89] Starting watch at version %v3136890 I1211 02:20:38.361689 1 controller.go:98] starts running from watch version: 3136890 I1211 02:20:38.362578 1 controller.go:312] start watching at 3136890 I1211 02:22:14.713933 1 controller.go:319] apiserver closed stream I1211 02:22:14.716006 1 controller.go:312] start watching at 3136890 I1211 02:23:28.908543 1 controller.go:319] apiserver closed stream I1211 02:23:28.909744 1 controller.go:312] start watching at 3136890 I1211 02:25:11.552978 1 controller.go:319] apiserver closed stream I1211 02:25:11.561707 1 controller.go:312] start watching at 3136890 I1211 02:25:12.089096 1 controller.go:173] finding existing jobs... I1211 02:25:12.095414 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 02:25:12.095444 1 controller.go:89] Starting watch at version %v3137429 I1211 02:25:12.095447 1 controller.go:98] starts running from watch version: 3137429 I1211 02:25:12.097875 1 controller.go:312] start watching at 3137429 I1211 02:26:33.200648 1 controller.go:319] apiserver closed stream I1211 02:26:33.203366 1 controller.go:312] start watching at 3137429 I1211 02:28:04.933980 1 controller.go:319] apiserver closed stream I1211 02:28:04.935113 1 controller.go:312] start watching at 3137429 I1211 02:29:28.221071 1 controller.go:319] apiserver closed stream I1211 02:29:28.222116 1 controller.go:312] start watching at 3137429 I1211 02:31:13.652609 1 controller.go:319] apiserver closed stream I1211 02:31:13.656581 1 controller.go:312] start watching at 3137429 I1211 02:31:14.165567 1 controller.go:173] finding existing jobs... I1211 02:31:14.167736 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 02:31:14.167755 1 controller.go:89] Starting watch at version %v3138136 I1211 02:31:14.167758 1 controller.go:98] starts running from watch version: 3138136 I1211 02:31:14.168819 1 controller.go:312] start watching at 3138136 I1211 02:33:10.825295 1 controller.go:319] apiserver closed stream I1211 02:33:10.826577 1 controller.go:312] start watching at 3138136 I1211 02:34:27.153083 1 controller.go:319] apiserver closed stream I1211 02:34:27.154515 1 controller.go:312] start watching at 3138136 I1211 02:35:50.917053 1 controller.go:319] apiserver closed stream I1211 02:35:50.920749 1 controller.go:312] start watching at 3138136 I1211 02:35:51.429062 1 controller.go:173] finding existing jobs... I1211 02:35:51.431202 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 02:35:51.431231 1 controller.go:89] Starting watch at version %v3138675 I1211 02:35:51.431236 1 controller.go:98] starts running from watch version: 3138675 I1211 02:35:51.432186 1 controller.go:312] start watching at 3138675 I1211 02:37:51.156190 1 controller.go:319] apiserver closed stream I1211 02:37:51.157633 1 controller.go:312] start watching at 3138675 I1211 02:39:15.426553 1 controller.go:319] apiserver closed stream I1211 02:39:15.444699 1 controller.go:312] start watching at 3138675 I1211 02:40:25.609560 1 controller.go:319] apiserver closed stream I1211 02:40:25.613501 1 controller.go:312] start watching at 3138675 I1211 02:40:26.140358 1 controller.go:173] finding existing jobs... I1211 02:40:26.143504 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 02:40:26.143523 1 controller.go:89] Starting watch at version %v3139217 I1211 02:40:26.143526 1 controller.go:98] starts running from watch version: 3139217 I1211 02:40:26.161746 1 controller.go:312] start watching at 3139217 I1211 02:42:16.331343 1 controller.go:319] apiserver closed stream I1211 02:42:16.332955 1 controller.go:312] start watching at 3139217 I1211 02:43:47.871826 1 controller.go:319] apiserver closed stream I1211 02:43:47.872903 1 controller.go:312] start watching at 3139217 I1211 02:45:43.750298 1 controller.go:319] apiserver closed stream I1211 02:45:43.751591 1 controller.go:312] start watching at 3139217 I1211 02:45:44.261152 1 controller.go:173] finding existing jobs... I1211 02:45:44.263399 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 02:45:44.263418 1 controller.go:89] Starting watch at version %v3139833 I1211 02:45:44.263421 1 controller.go:98] starts running from watch version: 3139833 I1211 02:45:44.264359 1 controller.go:312] start watching at 3139833 I1211 02:47:21.490879 1 controller.go:319] apiserver closed stream I1211 02:47:21.491791 1 controller.go:312] start watching at 3139833 I1211 02:49:19.632828 1 controller.go:319] apiserver closed stream I1211 02:49:19.634121 1 controller.go:312] start watching at 3139833 I1211 02:51:13.852776 1 controller.go:319] apiserver closed stream I1211 02:51:13.860420 1 controller.go:312] start watching at 3139833 I1211 02:51:14.373168 1 controller.go:173] finding existing jobs... I1211 02:51:14.376763 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 02:51:14.376780 1 controller.go:89] Starting watch at version %v3140478 I1211 02:51:14.376783 1 controller.go:98] starts running from watch version: 3140478 I1211 02:51:14.377650 1 controller.go:312] start watching at 3140478 I1211 02:53:04.278653 1 controller.go:319] apiserver closed stream I1211 02:53:04.279845 1 controller.go:312] start watching at 3140478 I1211 02:54:42.283400 1 controller.go:319] apiserver closed stream I1211 02:54:42.284536 1 controller.go:312] start watching at 3140478 I1211 02:56:19.717054 1 controller.go:319] apiserver closed stream I1211 02:56:19.720382 1 controller.go:312] start watching at 3140478 I1211 02:56:20.246590 1 controller.go:173] finding existing jobs... I1211 02:56:20.248900 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 02:56:20.248914 1 controller.go:89] Starting watch at version %v3141072 I1211 02:56:20.248918 1 controller.go:98] starts running from watch version: 3141072 I1211 02:56:20.249908 1 controller.go:312] start watching at 3141072 I1211 02:58:12.481696 1 controller.go:319] apiserver closed stream I1211 02:58:12.482958 1 controller.go:312] start watching at 3141072 I1211 03:00:07.720749 1 controller.go:319] apiserver closed stream I1211 03:00:07.724685 1 controller.go:312] start watching at 3141072 I1211 03:00:08.233451 1 controller.go:173] finding existing jobs... I1211 03:00:08.235795 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 03:00:08.235815 1 controller.go:89] Starting watch at version %v3141513 I1211 03:00:08.235818 1 controller.go:98] starts running from watch version: 3141513 I1211 03:00:08.236834 1 controller.go:312] start watching at 3141513 I1211 03:01:30.941550 1 controller.go:319] apiserver closed stream I1211 03:01:30.943106 1 controller.go:312] start watching at 3141513 I1211 03:02:43.237704 1 controller.go:319] apiserver closed stream I1211 03:02:43.239075 1 controller.go:312] start watching at 3141513 I1211 03:04:23.641171 1 controller.go:319] apiserver closed stream I1211 03:04:23.642413 1 controller.go:312] start watching at 3141513 I1211 03:05:40.353124 1 controller.go:319] apiserver closed stream I1211 03:05:40.356285 1 controller.go:312] start watching at 3141513 I1211 03:05:40.864532 1 controller.go:173] finding existing jobs... I1211 03:05:40.866687 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 03:05:40.866705 1 controller.go:89] Starting watch at version %v3142161 I1211 03:05:40.866708 1 controller.go:98] starts running from watch version: 3142161 I1211 03:05:40.867697 1 controller.go:312] start watching at 3142161 I1211 03:07:33.825231 1 controller.go:319] apiserver closed stream I1211 03:07:33.826358 1 controller.go:312] start watching at 3142161 I1211 03:08:51.713452 1 controller.go:319] apiserver closed stream I1211 03:08:51.716306 1 controller.go:312] start watching at 3142161 I1211 03:10:36.326274 1 controller.go:319] apiserver closed stream I1211 03:10:36.330851 1 controller.go:312] start watching at 3142161 I1211 03:10:36.843434 1 controller.go:173] finding existing jobs... I1211 03:10:36.847781 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 03:10:36.847799 1 controller.go:89] Starting watch at version %v3142736 I1211 03:10:36.847802 1 controller.go:98] starts running from watch version: 3142736 I1211 03:10:36.849062 1 controller.go:312] start watching at 3142736 I1211 03:12:25.124161 1 controller.go:319] apiserver closed stream I1211 03:12:25.125170 1 controller.go:312] start watching at 3142736 I1211 03:13:57.664136 1 controller.go:319] apiserver closed stream I1211 03:13:57.665018 1 controller.go:312] start watching at 3142736 I1211 03:15:42.397348 1 controller.go:319] apiserver closed stream I1211 03:15:42.401854 1 controller.go:312] start watching at 3142736 I1211 03:15:42.910742 1 controller.go:173] finding existing jobs... I1211 03:15:42.913129 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 03:15:42.913147 1 controller.go:89] Starting watch at version %v3143336 I1211 03:15:42.913150 1 controller.go:98] starts running from watch version: 3143336 I1211 03:15:42.914240 1 controller.go:312] start watching at 3143336 I1211 03:17:37.074534 1 controller.go:319] apiserver closed stream I1211 03:17:37.075755 1 controller.go:312] start watching at 3143336 I1211 03:19:04.401509 1 controller.go:319] apiserver closed stream I1211 03:19:04.439423 1 controller.go:312] start watching at 3143336 I1211 03:20:15.228698 1 controller.go:319] apiserver closed stream I1211 03:20:15.234292 1 controller.go:312] start watching at 3143336 I1211 03:20:15.746175 1 controller.go:173] finding existing jobs... I1211 03:20:15.749199 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 03:20:15.749232 1 controller.go:89] Starting watch at version %v3143868 I1211 03:20:15.749236 1 controller.go:98] starts running from watch version: 3143868 I1211 03:20:15.750181 1 controller.go:312] start watching at 3143868 I1211 03:22:08.618745 1 controller.go:319] apiserver closed stream I1211 03:22:08.620054 1 controller.go:312] start watching at 3143868 I1211 03:23:12.246565 1 controller.go:319] apiserver closed stream I1211 03:23:12.247777 1 controller.go:312] start watching at 3143868 I1211 03:25:12.040139 1 controller.go:319] apiserver closed stream I1211 03:25:12.044078 1 controller.go:312] start watching at 3143868 I1211 03:25:12.552598 1 controller.go:173] finding existing jobs... I1211 03:25:12.555437 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 03:25:12.555454 1 controller.go:89] Starting watch at version %v3144448 I1211 03:25:12.555457 1 controller.go:98] starts running from watch version: 3144448 I1211 03:25:12.556752 1 controller.go:312] start watching at 3144448 I1211 03:27:09.036761 1 controller.go:319] apiserver closed stream I1211 03:27:09.037877 1 controller.go:312] start watching at 3144448 I1211 03:28:19.353859 1 controller.go:319] apiserver closed stream I1211 03:28:19.355053 1 controller.go:312] start watching at 3144448 I1211 03:29:24.313705 1 controller.go:319] apiserver closed stream I1211 03:29:24.314921 1 controller.go:312] start watching at 3144448 I1211 03:30:40.547789 1 controller.go:319] apiserver closed stream I1211 03:30:40.551450 1 controller.go:312] start watching at 3144448 I1211 03:30:41.062009 1 controller.go:173] finding existing jobs... I1211 03:30:41.064428 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 03:30:41.064447 1 controller.go:89] Starting watch at version %v3145088 I1211 03:30:41.064451 1 controller.go:98] starts running from watch version: 3145088 I1211 03:30:41.065732 1 controller.go:312] start watching at 3145088 I1211 03:32:33.390731 1 controller.go:319] apiserver closed stream I1211 03:32:33.391758 1 controller.go:312] start watching at 3145088 I1211 03:34:25.614030 1 controller.go:319] apiserver closed stream I1211 03:34:25.615272 1 controller.go:312] start watching at 3145088 I1211 03:36:24.386554 1 controller.go:319] apiserver closed stream I1211 03:36:24.390537 1 controller.go:312] start watching at 3145088 I1211 03:36:24.900014 1 controller.go:173] finding existing jobs... I1211 03:36:24.903178 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 03:36:24.903197 1 controller.go:89] Starting watch at version %v3145758 I1211 03:36:24.903201 1 controller.go:98] starts running from watch version: 3145758 I1211 03:36:24.904537 1 controller.go:312] start watching at 3145758 I1211 03:37:51.210096 1 controller.go:319] apiserver closed stream I1211 03:37:51.211175 1 controller.go:312] start watching at 3145758 I1211 03:39:18.442761 1 controller.go:319] apiserver closed stream I1211 03:39:18.444647 1 controller.go:312] start watching at 3145758 I1211 03:40:53.162354 1 controller.go:319] apiserver closed stream I1211 03:40:53.167284 1 controller.go:312] start watching at 3145758 I1211 03:40:53.675831 1 controller.go:173] finding existing jobs... I1211 03:40:53.678249 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 03:40:53.678265 1 controller.go:89] Starting watch at version %v3146278 I1211 03:40:53.678268 1 controller.go:98] starts running from watch version: 3146278 I1211 03:40:53.679139 1 controller.go:312] start watching at 3146278 I1211 03:42:47.062418 1 controller.go:319] apiserver closed stream I1211 03:42:47.063411 1 controller.go:312] start watching at 3146278 I1211 03:44:12.024457 1 controller.go:319] apiserver closed stream I1211 03:44:12.025948 1 controller.go:312] start watching at 3146278 I1211 03:45:53.337153 1 controller.go:319] apiserver closed stream I1211 03:45:53.341375 1 controller.go:312] start watching at 3146278 I1211 03:45:53.849434 1 controller.go:173] finding existing jobs... I1211 03:45:53.851678 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 03:45:53.851698 1 controller.go:89] Starting watch at version %v3146863 I1211 03:45:53.851701 1 controller.go:98] starts running from watch version: 3146863 I1211 03:45:53.852662 1 controller.go:312] start watching at 3146863 I1211 03:46:56.871208 1 controller.go:319] apiserver closed stream I1211 03:46:56.872484 1 controller.go:312] start watching at 3146863 I1211 03:48:24.611690 1 controller.go:319] apiserver closed stream I1211 03:48:24.612978 1 controller.go:312] start watching at 3146863 I1211 03:49:55.616917 1 controller.go:319] apiserver closed stream I1211 03:49:55.622272 1 controller.go:312] start watching at 3146863 I1211 03:49:56.133412 1 controller.go:173] finding existing jobs... I1211 03:49:56.135779 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 03:49:56.135798 1 controller.go:89] Starting watch at version %v3147334 I1211 03:49:56.135801 1 controller.go:98] starts running from watch version: 3147334 I1211 03:49:56.136768 1 controller.go:312] start watching at 3147334 I1211 03:51:42.140458 1 controller.go:319] apiserver closed stream I1211 03:51:42.142974 1 controller.go:312] start watching at 3147334 I1211 03:52:44.421369 1 controller.go:319] apiserver closed stream I1211 03:52:44.422854 1 controller.go:312] start watching at 3147334 I1211 03:53:56.541126 1 controller.go:319] apiserver closed stream I1211 03:53:56.543349 1 controller.go:312] start watching at 3147334 I1211 03:55:24.746125 1 controller.go:319] apiserver closed stream I1211 03:55:24.749822 1 controller.go:312] start watching at 3147334 I1211 03:55:25.258404 1 controller.go:173] finding existing jobs... I1211 03:55:25.260674 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 03:55:25.260690 1 controller.go:89] Starting watch at version %v3147970 I1211 03:55:25.260693 1 controller.go:98] starts running from watch version: 3147970 I1211 03:55:25.261823 1 controller.go:312] start watching at 3147970 I1211 03:57:08.212877 1 controller.go:319] apiserver closed stream I1211 03:57:08.214141 1 controller.go:312] start watching at 3147970 I1211 03:58:18.389086 1 controller.go:319] apiserver closed stream I1211 03:58:18.391684 1 controller.go:312] start watching at 3147970 I1211 03:59:25.812785 1 controller.go:319] apiserver closed stream I1211 03:59:25.814084 1 controller.go:312] start watching at 3147970 I1211 04:01:06.360404 1 controller.go:319] apiserver closed stream I1211 04:01:06.364360 1 controller.go:312] start watching at 3147970 I1211 04:01:06.873517 1 controller.go:173] finding existing jobs... I1211 04:01:06.875621 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 04:01:06.875647 1 controller.go:89] Starting watch at version %v3148643 I1211 04:01:06.875650 1 controller.go:98] starts running from watch version: 3148643 I1211 04:01:06.876552 1 controller.go:312] start watching at 3148643 I1211 04:02:18.731561 1 controller.go:319] apiserver closed stream I1211 04:02:18.732791 1 controller.go:312] start watching at 3148643 I1211 04:03:40.875861 1 controller.go:319] apiserver closed stream I1211 04:03:40.879104 1 controller.go:312] start watching at 3148643 I1211 04:05:02.930476 1 controller.go:319] apiserver closed stream I1211 04:05:02.935052 1 controller.go:312] start watching at 3148643 I1211 04:05:03.443612 1 controller.go:173] finding existing jobs... I1211 04:05:03.446425 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 04:05:03.446440 1 controller.go:89] Starting watch at version %v3149105 I1211 04:05:03.446444 1 controller.go:98] starts running from watch version: 3149105 I1211 04:05:03.448102 1 controller.go:312] start watching at 3149105 I1211 04:06:04.923629 1 controller.go:319] apiserver closed stream I1211 04:06:04.924916 1 controller.go:312] start watching at 3149105 I1211 04:08:04.271967 1 controller.go:319] apiserver closed stream I1211 04:08:04.273181 1 controller.go:312] start watching at 3149105 I1211 04:09:34.927456 1 controller.go:319] apiserver closed stream I1211 04:09:34.928586 1 controller.go:312] start watching at 3149105 I1211 04:11:14.108852 1 controller.go:319] apiserver closed stream I1211 04:11:14.113346 1 controller.go:312] start watching at 3149105 I1211 04:11:14.622436 1 controller.go:173] finding existing jobs... I1211 04:11:14.624803 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 04:11:14.624828 1 controller.go:89] Starting watch at version %v3149854 I1211 04:11:14.624831 1 controller.go:98] starts running from watch version: 3149854 I1211 04:11:14.626141 1 controller.go:312] start watching at 3149854 I1211 04:13:00.565250 1 controller.go:319] apiserver closed stream I1211 04:13:00.567979 1 controller.go:312] start watching at 3149854 I1211 04:14:58.456294 1 controller.go:319] apiserver closed stream I1211 04:14:58.462126 1 controller.go:312] start watching at 3149854 I1211 04:14:58.969979 1 controller.go:173] finding existing jobs... I1211 04:14:58.972302 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 04:14:58.972317 1 controller.go:89] Starting watch at version %v3150290 I1211 04:14:58.972320 1 controller.go:98] starts running from watch version: 3150290 I1211 04:14:58.973241 1 controller.go:312] start watching at 3150290 I1211 04:16:51.536507 1 controller.go:319] apiserver closed stream I1211 04:16:51.537616 1 controller.go:312] start watching at 3150290 I1211 04:18:45.107673 1 controller.go:319] apiserver closed stream I1211 04:18:45.110296 1 controller.go:312] start watching at 3150290 I1211 04:20:25.790889 1 controller.go:319] apiserver closed stream I1211 04:20:25.795164 1 controller.go:312] start watching at 3150290 I1211 04:20:26.318911 1 controller.go:173] finding existing jobs... I1211 04:20:26.321948 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 04:20:26.321966 1 controller.go:89] Starting watch at version %v3150930 I1211 04:20:26.321969 1 controller.go:98] starts running from watch version: 3150930 I1211 04:20:26.322874 1 controller.go:312] start watching at 3150930 I1211 04:22:23.271555 1 controller.go:319] apiserver closed stream I1211 04:22:23.272742 1 controller.go:312] start watching at 3150930 I1211 04:24:13.183135 1 controller.go:319] apiserver closed stream I1211 04:24:13.184358 1 controller.go:312] start watching at 3150930 I1211 04:25:17.641333 1 controller.go:319] apiserver closed stream I1211 04:25:17.645762 1 controller.go:312] start watching at 3150930 I1211 04:25:18.178473 1 controller.go:173] finding existing jobs... I1211 04:25:18.184579 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 04:25:18.184597 1 controller.go:89] Starting watch at version %v3151532 I1211 04:25:18.184600 1 controller.go:98] starts running from watch version: 3151532 I1211 04:25:18.185615 1 controller.go:312] start watching at 3151532 I1211 04:26:39.083155 1 controller.go:319] apiserver closed stream I1211 04:26:39.084041 1 controller.go:312] start watching at 3151532 I1211 04:28:14.898807 1 controller.go:319] apiserver closed stream I1211 04:28:14.900169 1 controller.go:312] start watching at 3151532 I1211 04:30:05.107979 1 controller.go:319] apiserver closed stream I1211 04:30:05.111441 1 controller.go:312] start watching at 3151532 I1211 04:30:05.621510 1 controller.go:173] finding existing jobs... I1211 04:30:05.628378 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 04:30:05.628397 1 controller.go:89] Starting watch at version %v3152095 I1211 04:30:05.628401 1 controller.go:98] starts running from watch version: 3152095 I1211 04:30:05.629822 1 controller.go:312] start watching at 3152095 I1211 04:31:28.105579 1 controller.go:319] apiserver closed stream I1211 04:31:28.106684 1 controller.go:312] start watching at 3152095 I1211 04:32:40.391060 1 controller.go:319] apiserver closed stream I1211 04:32:40.392175 1 controller.go:312] start watching at 3152095 I1211 04:34:40.177588 1 controller.go:319] apiserver closed stream I1211 04:34:40.178811 1 controller.go:312] start watching at 3152095 I1211 04:35:45.488632 1 controller.go:319] apiserver closed stream I1211 04:35:45.492746 1 controller.go:312] start watching at 3152095 I1211 04:35:46.004505 1 controller.go:173] finding existing jobs... I1211 04:35:46.009127 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 04:35:46.009157 1 controller.go:89] Starting watch at version %v3152755 I1211 04:35:46.009161 1 controller.go:98] starts running from watch version: 3152755 I1211 04:35:46.010665 1 controller.go:312] start watching at 3152755 I1211 04:37:35.743479 1 controller.go:319] apiserver closed stream I1211 04:37:35.744642 1 controller.go:312] start watching at 3152755 I1211 04:38:36.333984 1 controller.go:319] apiserver closed stream I1211 04:38:36.335020 1 controller.go:312] start watching at 3152755 I1211 04:40:30.143108 1 controller.go:319] apiserver closed stream I1211 04:40:30.146476 1 controller.go:312] start watching at 3152755 I1211 04:40:30.654142 1 controller.go:173] finding existing jobs... I1211 04:40:30.656436 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 04:40:30.656456 1 controller.go:89] Starting watch at version %v3153311 I1211 04:40:30.656459 1 controller.go:98] starts running from watch version: 3153311 I1211 04:40:30.657289 1 controller.go:312] start watching at 3153311 I1211 04:41:38.019848 1 controller.go:319] apiserver closed stream I1211 04:41:38.022339 1 controller.go:312] start watching at 3153311 I1211 04:43:33.854161 1 controller.go:319] apiserver closed stream I1211 04:43:33.855206 1 controller.go:312] start watching at 3153311 I1211 04:44:38.171035 1 controller.go:319] apiserver closed stream I1211 04:44:38.172099 1 controller.go:312] start watching at 3153311 I1211 04:46:36.860345 1 controller.go:319] apiserver closed stream I1211 04:46:36.865044 1 controller.go:312] start watching at 3153311 I1211 04:46:37.373331 1 controller.go:173] finding existing jobs... I1211 04:46:37.375708 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 04:46:37.375725 1 controller.go:89] Starting watch at version %v3154027 I1211 04:46:37.375728 1 controller.go:98] starts running from watch version: 3154027 I1211 04:46:37.376677 1 controller.go:312] start watching at 3154027 I1211 04:47:45.026305 1 controller.go:319] apiserver closed stream I1211 04:47:45.027984 1 controller.go:312] start watching at 3154027 I1211 04:49:13.527526 1 controller.go:319] apiserver closed stream I1211 04:49:13.528751 1 controller.go:312] start watching at 3154027 I1211 04:50:36.799017 1 controller.go:319] apiserver closed stream I1211 04:50:36.803308 1 controller.go:312] start watching at 3154027 I1211 04:50:37.369451 1 controller.go:173] finding existing jobs... I1211 04:50:37.372979 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 04:50:37.372998 1 controller.go:89] Starting watch at version %v3154497 I1211 04:50:37.373001 1 controller.go:98] starts running from watch version: 3154497 I1211 04:50:37.374313 1 controller.go:312] start watching at 3154497 I1211 04:52:02.028459 1 controller.go:319] apiserver closed stream I1211 04:52:02.030133 1 controller.go:312] start watching at 3154497 I1211 04:53:51.888256 1 controller.go:319] apiserver closed stream I1211 04:53:51.889232 1 controller.go:312] start watching at 3154497 I1211 04:55:48.637875 1 controller.go:319] apiserver closed stream I1211 04:55:48.642783 1 controller.go:312] start watching at 3154497 I1211 04:55:49.152452 1 controller.go:173] finding existing jobs... I1211 04:55:49.154576 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 04:55:49.154594 1 controller.go:89] Starting watch at version %v3155101 I1211 04:55:49.154597 1 controller.go:98] starts running from watch version: 3155101 I1211 04:55:49.155519 1 controller.go:312] start watching at 3155101 I1211 04:56:59.494671 1 controller.go:319] apiserver closed stream I1211 04:56:59.496022 1 controller.go:312] start watching at 3155101 I1211 04:58:41.794635 1 controller.go:319] apiserver closed stream I1211 04:58:41.795756 1 controller.go:312] start watching at 3155101 I1211 05:00:09.102787 1 controller.go:319] apiserver closed stream I1211 05:00:09.106966 1 controller.go:312] start watching at 3155101 I1211 05:00:09.615142 1 controller.go:173] finding existing jobs... I1211 05:00:09.617317 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 05:00:09.617331 1 controller.go:89] Starting watch at version %v3155604 I1211 05:00:09.617334 1 controller.go:98] starts running from watch version: 3155604 I1211 05:00:09.618630 1 controller.go:312] start watching at 3155604 I1211 05:01:49.869599 1 controller.go:319] apiserver closed stream I1211 05:01:49.872059 1 controller.go:312] start watching at 3155604 I1211 05:03:27.062318 1 controller.go:319] apiserver closed stream I1211 05:03:27.063439 1 controller.go:312] start watching at 3155604 I1211 05:04:31.420822 1 controller.go:319] apiserver closed stream I1211 05:04:31.422401 1 controller.go:312] start watching at 3155604 I1211 05:05:36.646297 1 controller.go:319] apiserver closed stream I1211 05:05:36.651029 1 controller.go:312] start watching at 3155604 I1211 05:05:37.159041 1 controller.go:173] finding existing jobs... I1211 05:05:37.161271 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 05:05:37.161287 1 controller.go:89] Starting watch at version %v3156246 I1211 05:05:37.161291 1 controller.go:98] starts running from watch version: 3156246 I1211 05:05:37.162281 1 controller.go:312] start watching at 3156246 I1211 05:06:59.230029 1 controller.go:319] apiserver closed stream I1211 05:06:59.231144 1 controller.go:312] start watching at 3156246 I1211 05:08:47.696822 1 controller.go:319] apiserver closed stream I1211 05:08:47.697906 1 controller.go:312] start watching at 3156246 I1211 05:10:13.031512 1 controller.go:319] apiserver closed stream I1211 05:10:13.038477 1 controller.go:312] start watching at 3156246 I1211 05:10:13.547247 1 controller.go:173] finding existing jobs... I1211 05:10:13.549731 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 05:10:13.549748 1 controller.go:89] Starting watch at version %v3156781 I1211 05:10:13.549751 1 controller.go:98] starts running from watch version: 3156781 I1211 05:10:13.550586 1 controller.go:312] start watching at 3156781 I1211 05:11:41.130024 1 controller.go:319] apiserver closed stream I1211 05:11:41.131359 1 controller.go:312] start watching at 3156781 I1211 05:12:58.184667 1 controller.go:319] apiserver closed stream I1211 05:12:58.185936 1 controller.go:312] start watching at 3156781 I1211 05:14:25.965055 1 controller.go:319] apiserver closed stream I1211 05:14:25.966311 1 controller.go:312] start watching at 3156781 I1211 05:16:12.764922 1 controller.go:319] apiserver closed stream I1211 05:16:12.861205 1 controller.go:312] start watching at 3156781 I1211 05:16:13.386813 1 controller.go:173] finding existing jobs... I1211 05:16:13.389199 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 05:16:13.389226 1 controller.go:89] Starting watch at version %v3157487 I1211 05:16:13.389230 1 controller.go:98] starts running from watch version: 3157487 I1211 05:16:13.389988 1 controller.go:312] start watching at 3157487 I1211 05:17:33.274363 1 controller.go:319] apiserver closed stream I1211 05:17:33.275524 1 controller.go:312] start watching at 3157487 I1211 05:18:45.160859 1 controller.go:319] apiserver closed stream I1211 05:18:45.162195 1 controller.go:312] start watching at 3157487 I1211 05:20:13.836370 1 controller.go:319] apiserver closed stream I1211 05:20:13.842164 1 controller.go:312] start watching at 3157487 I1211 05:20:14.353514 1 controller.go:173] finding existing jobs... I1211 05:20:14.356030 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 05:20:14.356062 1 controller.go:89] Starting watch at version %v3157954 I1211 05:20:14.356065 1 controller.go:98] starts running from watch version: 3157954 I1211 05:20:14.356839 1 controller.go:312] start watching at 3157954 I1211 05:21:32.860620 1 controller.go:319] apiserver closed stream I1211 05:21:32.861829 1 controller.go:312] start watching at 3157954 I1211 05:22:50.742782 1 controller.go:319] apiserver closed stream I1211 05:22:50.744130 1 controller.go:312] start watching at 3157954 I1211 05:24:23.040102 1 controller.go:319] apiserver closed stream I1211 05:24:23.041339 1 controller.go:312] start watching at 3157954 I1211 05:25:29.144958 1 controller.go:319] apiserver closed stream I1211 05:25:29.148911 1 controller.go:312] start watching at 3157954 I1211 05:25:29.657582 1 controller.go:173] finding existing jobs... I1211 05:25:29.659786 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 05:25:29.659809 1 controller.go:89] Starting watch at version %v3158566 I1211 05:25:29.659812 1 controller.go:98] starts running from watch version: 3158566 I1211 05:25:29.660778 1 controller.go:312] start watching at 3158566 I1211 05:26:48.532868 1 controller.go:319] apiserver closed stream I1211 05:26:48.533914 1 controller.go:312] start watching at 3158566 I1211 05:27:52.999985 1 controller.go:319] apiserver closed stream I1211 05:27:53.001141 1 controller.go:312] start watching at 3158566 I1211 05:29:31.371342 1 controller.go:319] apiserver closed stream I1211 05:29:31.372620 1 controller.go:312] start watching at 3158566 I1211 05:31:08.566048 1 controller.go:319] apiserver closed stream I1211 05:31:08.570022 1 controller.go:312] start watching at 3158566 I1211 05:31:09.078423 1 controller.go:173] finding existing jobs... I1211 05:31:09.080793 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 05:31:09.080810 1 controller.go:89] Starting watch at version %v3159232 I1211 05:31:09.080813 1 controller.go:98] starts running from watch version: 3159232 I1211 05:31:09.081719 1 controller.go:312] start watching at 3159232 I1211 05:32:42.924254 1 controller.go:319] apiserver closed stream I1211 05:32:42.926938 1 controller.go:312] start watching at 3159232 I1211 05:34:16.166206 1 controller.go:319] apiserver closed stream I1211 05:34:16.169497 1 controller.go:312] start watching at 3159232 I1211 05:35:47.022018 1 controller.go:319] apiserver closed stream I1211 05:35:47.037472 1 controller.go:312] start watching at 3159232 I1211 05:35:47.553880 1 controller.go:173] finding existing jobs... I1211 05:35:47.556326 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 05:35:47.556344 1 controller.go:89] Starting watch at version %v3159773 I1211 05:35:47.556347 1 controller.go:98] starts running from watch version: 3159773 I1211 05:35:47.557269 1 controller.go:312] start watching at 3159773 I1211 05:37:20.425362 1 controller.go:319] apiserver closed stream I1211 05:37:20.426725 1 controller.go:312] start watching at 3159773 I1211 05:38:43.603936 1 controller.go:319] apiserver closed stream I1211 05:38:43.605100 1 controller.go:312] start watching at 3159773 I1211 05:39:44.165364 1 controller.go:319] apiserver closed stream I1211 05:39:44.166828 1 controller.go:312] start watching at 3159773 I1211 05:41:30.674850 1 controller.go:319] apiserver closed stream I1211 05:41:30.677989 1 controller.go:312] start watching at 3159773 I1211 05:41:31.186357 1 controller.go:173] finding existing jobs... I1211 05:41:31.188651 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 05:41:31.188671 1 controller.go:89] Starting watch at version %v3160438 I1211 05:41:31.188674 1 controller.go:98] starts running from watch version: 3160438 I1211 05:41:31.189565 1 controller.go:312] start watching at 3160438 I1211 05:43:04.797919 1 controller.go:319] apiserver closed stream I1211 05:43:04.800097 1 controller.go:312] start watching at 3160438 I1211 05:44:05.675977 1 controller.go:319] apiserver closed stream I1211 05:44:05.677111 1 controller.go:312] start watching at 3160438 I1211 05:45:59.621950 1 controller.go:319] apiserver closed stream I1211 05:45:59.626055 1 controller.go:312] start watching at 3160438 I1211 05:46:00.171094 1 controller.go:173] finding existing jobs... I1211 05:46:00.175820 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 05:46:00.175839 1 controller.go:89] Starting watch at version %v3160966 I1211 05:46:00.175843 1 controller.go:98] starts running from watch version: 3160966 I1211 05:46:00.176638 1 controller.go:312] start watching at 3160966 I1211 05:47:19.826325 1 controller.go:319] apiserver closed stream I1211 05:47:19.827525 1 controller.go:312] start watching at 3160966 I1211 05:49:08.332181 1 controller.go:319] apiserver closed stream I1211 05:49:08.333232 1 controller.go:312] start watching at 3160966 I1211 05:51:02.254948 1 controller.go:319] apiserver closed stream I1211 05:51:02.258723 1 controller.go:312] start watching at 3160966 I1211 05:51:02.766048 1 controller.go:173] finding existing jobs... I1211 05:51:02.768132 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 05:51:02.768156 1 controller.go:89] Starting watch at version %v3161558 I1211 05:51:02.768160 1 controller.go:98] starts running from watch version: 3161558 I1211 05:51:02.769056 1 controller.go:312] start watching at 3161558 I1211 05:52:05.082395 1 controller.go:319] apiserver closed stream I1211 05:52:05.084936 1 controller.go:312] start watching at 3161558 I1211 05:53:18.685072 1 controller.go:319] apiserver closed stream I1211 05:53:18.686242 1 controller.go:312] start watching at 3161558 I1211 05:54:49.551014 1 controller.go:319] apiserver closed stream I1211 05:54:49.552122 1 controller.go:312] start watching at 3161558 I1211 05:56:41.027343 1 controller.go:319] apiserver closed stream I1211 05:56:41.038707 1 controller.go:312] start watching at 3161558 I1211 05:56:41.552123 1 controller.go:173] finding existing jobs... I1211 05:56:41.554537 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 05:56:41.554552 1 controller.go:89] Starting watch at version %v3162216 I1211 05:56:41.554556 1 controller.go:98] starts running from watch version: 3162216 I1211 05:56:41.555532 1 controller.go:312] start watching at 3162216 I1211 05:58:15.886942 1 controller.go:319] apiserver closed stream I1211 05:58:15.889823 1 controller.go:312] start watching at 3162216 I1211 05:59:20.067599 1 controller.go:319] apiserver closed stream I1211 05:59:20.068858 1 controller.go:312] start watching at 3162216 I1211 06:00:28.304679 1 controller.go:319] apiserver closed stream I1211 06:00:28.309195 1 controller.go:312] start watching at 3162216 I1211 06:00:28.818084 1 controller.go:173] finding existing jobs... I1211 06:00:28.820624 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 06:00:28.820654 1 controller.go:89] Starting watch at version %v3162664 I1211 06:00:28.820657 1 controller.go:98] starts running from watch version: 3162664 I1211 06:00:28.821702 1 controller.go:312] start watching at 3162664 I1211 06:01:45.115697 1 controller.go:319] apiserver closed stream I1211 06:01:45.116835 1 controller.go:312] start watching at 3162664 I1211 06:02:53.033162 1 controller.go:319] apiserver closed stream I1211 06:02:53.035910 1 controller.go:312] start watching at 3162664 I1211 06:04:44.069167 1 controller.go:319] apiserver closed stream I1211 06:04:44.070459 1 controller.go:312] start watching at 3162664 I1211 06:06:22.059911 1 controller.go:319] apiserver closed stream I1211 06:06:22.061346 1 controller.go:312] start watching at 3162664 I1211 06:06:22.574170 1 controller.go:173] finding existing jobs... I1211 06:06:22.581581 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 06:06:22.581598 1 controller.go:89] Starting watch at version %v3163350 I1211 06:06:22.581602 1 controller.go:98] starts running from watch version: 3163350 I1211 06:06:22.584131 1 controller.go:312] start watching at 3163350 I1211 06:07:50.502579 1 controller.go:319] apiserver closed stream I1211 06:07:50.503667 1 controller.go:312] start watching at 3163350 I1211 06:09:04.588965 1 controller.go:319] apiserver closed stream I1211 06:09:04.591431 1 controller.go:312] start watching at 3163350 I1211 06:10:47.746520 1 controller.go:319] apiserver closed stream I1211 06:10:47.750334 1 controller.go:312] start watching at 3163350 I1211 06:10:48.259455 1 controller.go:173] finding existing jobs... I1211 06:10:48.262177 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 06:10:48.262198 1 controller.go:89] Starting watch at version %v3163865 I1211 06:10:48.262202 1 controller.go:98] starts running from watch version: 3163865 I1211 06:10:48.263167 1 controller.go:312] start watching at 3163865 I1211 06:12:05.708313 1 controller.go:319] apiserver closed stream I1211 06:12:05.709299 1 controller.go:312] start watching at 3163865 I1211 06:14:01.419797 1 controller.go:319] apiserver closed stream I1211 06:14:01.420949 1 controller.go:312] start watching at 3163865 I1211 06:15:15.145039 1 controller.go:319] apiserver closed stream I1211 06:15:15.150097 1 controller.go:312] start watching at 3163865 I1211 06:15:15.658406 1 controller.go:173] finding existing jobs... I1211 06:15:15.660751 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 06:15:15.660769 1 controller.go:89] Starting watch at version %v3164386 I1211 06:15:15.660785 1 controller.go:98] starts running from watch version: 3164386 I1211 06:15:15.661666 1 controller.go:312] start watching at 3164386 I1211 06:16:18.515280 1 controller.go:319] apiserver closed stream I1211 06:16:18.516375 1 controller.go:312] start watching at 3164386 I1211 06:17:59.919186 1 controller.go:319] apiserver closed stream I1211 06:17:59.920630 1 controller.go:312] start watching at 3164386 I1211 06:19:12.527157 1 controller.go:319] apiserver closed stream I1211 06:19:12.528488 1 controller.go:312] start watching at 3164386 I1211 06:20:37.387336 1 controller.go:319] apiserver closed stream I1211 06:20:37.391754 1 controller.go:312] start watching at 3164386 I1211 06:20:37.900969 1 controller.go:173] finding existing jobs... I1211 06:20:37.903196 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 06:20:37.903235 1 controller.go:89] Starting watch at version %v3165013 I1211 06:20:37.903244 1 controller.go:98] starts running from watch version: 3165013 I1211 06:20:37.904082 1 controller.go:312] start watching at 3165013 I1211 06:21:41.040523 1 controller.go:319] apiserver closed stream I1211 06:21:41.046799 1 controller.go:312] start watching at 3165013 I1211 06:23:16.666868 1 controller.go:319] apiserver closed stream I1211 06:23:16.668145 1 controller.go:312] start watching at 3165013 I1211 06:24:57.563757 1 controller.go:319] apiserver closed stream I1211 06:24:57.567847 1 controller.go:312] start watching at 3165013 I1211 06:24:58.075197 1 controller.go:173] finding existing jobs... I1211 06:24:58.078097 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 06:24:58.078113 1 controller.go:89] Starting watch at version %v3165523 I1211 06:24:58.078117 1 controller.go:98] starts running from watch version: 3165523 I1211 06:24:58.078940 1 controller.go:312] start watching at 3165523 I1211 06:26:00.137421 1 controller.go:319] apiserver closed stream I1211 06:26:00.139585 1 controller.go:312] start watching at 3165523 I1211 06:27:59.385821 1 controller.go:319] apiserver closed stream I1211 06:27:59.388186 1 controller.go:312] start watching at 3165523 I1211 06:29:15.152875 1 controller.go:319] apiserver closed stream I1211 06:29:15.154079 1 controller.go:312] start watching at 3165523 I1211 06:31:04.765571 1 controller.go:319] apiserver closed stream I1211 06:31:04.766842 1 controller.go:312] start watching at 3165523 I1211 06:31:05.275326 1 controller.go:173] finding existing jobs... I1211 06:31:05.277568 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 06:31:05.277586 1 controller.go:89] Starting watch at version %v3166240 I1211 06:31:05.277590 1 controller.go:98] starts running from watch version: 3166240 I1211 06:31:05.278484 1 controller.go:312] start watching at 3166240 I1211 06:32:07.173835 1 controller.go:319] apiserver closed stream I1211 06:32:07.174954 1 controller.go:312] start watching at 3166240 I1211 06:34:03.749027 1 controller.go:319] apiserver closed stream I1211 06:34:03.750226 1 controller.go:312] start watching at 3166240 I1211 06:36:02.649310 1 controller.go:319] apiserver closed stream I1211 06:36:02.653071 1 controller.go:312] start watching at 3166240 I1211 06:36:03.161518 1 controller.go:173] finding existing jobs... I1211 06:36:03.163640 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 06:36:03.163660 1 controller.go:89] Starting watch at version %v3166819 I1211 06:36:03.163663 1 controller.go:98] starts running from watch version: 3166819 I1211 06:36:03.164546 1 controller.go:312] start watching at 3166819 I1211 06:37:33.570246 1 controller.go:319] apiserver closed stream I1211 06:37:33.571158 1 controller.go:312] start watching at 3166819 I1211 06:38:51.244191 1 controller.go:319] apiserver closed stream I1211 06:38:51.245348 1 controller.go:312] start watching at 3166819 I1211 06:39:56.580931 1 controller.go:319] apiserver closed stream I1211 06:39:56.585294 1 controller.go:312] start watching at 3166819 I1211 06:39:57.093911 1 controller.go:173] finding existing jobs... I1211 06:39:57.096303 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 06:39:57.096320 1 controller.go:89] Starting watch at version %v3167280 I1211 06:39:57.096339 1 controller.go:98] starts running from watch version: 3167280 I1211 06:39:57.097279 1 controller.go:312] start watching at 3167280 I1211 06:41:49.496347 1 controller.go:319] apiserver closed stream I1211 06:41:49.497491 1 controller.go:312] start watching at 3167280 I1211 06:43:26.556081 1 controller.go:319] apiserver closed stream I1211 06:43:26.557290 1 controller.go:312] start watching at 3167280 I1211 06:44:54.644333 1 controller.go:319] apiserver closed stream I1211 06:44:54.648078 1 controller.go:312] start watching at 3167280 I1211 06:44:55.158310 1 controller.go:173] finding existing jobs... I1211 06:44:55.160543 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 06:44:55.160559 1 controller.go:89] Starting watch at version %v3167861 I1211 06:44:55.160562 1 controller.go:98] starts running from watch version: 3167861 I1211 06:44:55.161466 1 controller.go:312] start watching at 3167861 I1211 06:46:08.061686 1 controller.go:319] apiserver closed stream I1211 06:46:08.062919 1 controller.go:312] start watching at 3167861 I1211 06:48:07.579243 1 controller.go:319] apiserver closed stream I1211 06:48:07.581261 1 controller.go:312] start watching at 3167861 I1211 06:49:28.424169 1 controller.go:319] apiserver closed stream I1211 06:49:28.425344 1 controller.go:312] start watching at 3167861 I1211 06:50:44.424460 1 controller.go:319] apiserver closed stream I1211 06:50:44.428388 1 controller.go:312] start watching at 3167861 I1211 06:50:44.937418 1 controller.go:173] finding existing jobs... I1211 06:50:44.939508 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 06:50:44.939526 1 controller.go:89] Starting watch at version %v3168542 I1211 06:50:44.939529 1 controller.go:98] starts running from watch version: 3168542 I1211 06:50:44.940395 1 controller.go:312] start watching at 3168542 I1211 06:51:50.050609 1 controller.go:319] apiserver closed stream I1211 06:51:50.053184 1 controller.go:312] start watching at 3168542 I1211 06:53:02.829065 1 controller.go:319] apiserver closed stream I1211 06:53:02.830606 1 controller.go:312] start watching at 3168542 I1211 06:54:20.576433 1 controller.go:319] apiserver closed stream I1211 06:54:20.578066 1 controller.go:312] start watching at 3168542 I1211 06:55:27.916493 1 controller.go:319] apiserver closed stream I1211 06:55:27.919889 1 controller.go:312] start watching at 3168542 I1211 06:55:28.428386 1 controller.go:173] finding existing jobs... I1211 06:55:28.430630 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 06:55:28.430645 1 controller.go:89] Starting watch at version %v3169091 I1211 06:55:28.430649 1 controller.go:98] starts running from watch version: 3169091 I1211 06:55:28.431579 1 controller.go:312] start watching at 3169091 I1211 06:57:24.922072 1 controller.go:319] apiserver closed stream I1211 06:57:24.924544 1 controller.go:312] start watching at 3169091 I1211 06:59:09.140385 1 controller.go:319] apiserver closed stream I1211 06:59:09.141466 1 controller.go:312] start watching at 3169091 I1211 07:00:49.934315 1 controller.go:319] apiserver closed stream I1211 07:00:49.936910 1 controller.go:312] start watching at 3169091 I1211 07:00:50.444789 1 controller.go:173] finding existing jobs... I1211 07:00:50.447062 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 07:00:50.447109 1 controller.go:89] Starting watch at version %v3169717 I1211 07:00:50.447113 1 controller.go:98] starts running from watch version: 3169717 I1211 07:00:50.448568 1 controller.go:312] start watching at 3169717 I1211 07:01:58.658599 1 controller.go:319] apiserver closed stream I1211 07:01:58.659686 1 controller.go:312] start watching at 3169717 I1211 07:03:21.805502 1 controller.go:319] apiserver closed stream I1211 07:03:21.807146 1 controller.go:312] start watching at 3169717 I1211 07:04:32.437509 1 controller.go:319] apiserver closed stream I1211 07:04:32.438590 1 controller.go:312] start watching at 3169717 I1211 07:06:22.551782 1 controller.go:319] apiserver closed stream I1211 07:06:22.555344 1 controller.go:312] start watching at 3169717 I1211 07:06:23.063950 1 controller.go:173] finding existing jobs... I1211 07:06:23.066365 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 07:06:23.066383 1 controller.go:89] Starting watch at version %v3170363 I1211 07:06:23.066386 1 controller.go:98] starts running from watch version: 3170363 I1211 07:06:23.067370 1 controller.go:312] start watching at 3170363 I1211 07:07:27.048782 1 controller.go:319] apiserver closed stream I1211 07:07:27.051156 1 controller.go:312] start watching at 3170363 I1211 07:09:00.336941 1 controller.go:319] apiserver closed stream I1211 07:09:00.339187 1 controller.go:312] start watching at 3170363 I1211 07:10:32.697760 1 controller.go:319] apiserver closed stream I1211 07:10:32.701642 1 controller.go:312] start watching at 3170363 I1211 07:10:33.211371 1 controller.go:173] finding existing jobs... I1211 07:10:33.213556 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 07:10:33.213587 1 controller.go:89] Starting watch at version %v3170857 I1211 07:10:33.213590 1 controller.go:98] starts running from watch version: 3170857 I1211 07:10:33.214567 1 controller.go:312] start watching at 3170857 I1211 07:12:03.907940 1 controller.go:319] apiserver closed stream I1211 07:12:03.926407 1 controller.go:312] start watching at 3170857 I1211 07:13:38.319589 1 controller.go:319] apiserver closed stream I1211 07:13:38.320968 1 controller.go:312] start watching at 3170857 I1211 07:14:46.005603 1 controller.go:319] apiserver closed stream I1211 07:14:46.006768 1 controller.go:312] start watching at 3170857 I1211 07:16:14.593370 1 controller.go:319] apiserver closed stream I1211 07:16:14.597329 1 controller.go:312] start watching at 3170857 I1211 07:16:15.125804 1 controller.go:173] finding existing jobs... I1211 07:16:15.128634 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 07:16:15.128651 1 controller.go:89] Starting watch at version %v3171522 I1211 07:16:15.128654 1 controller.go:98] starts running from watch version: 3171522 I1211 07:16:15.129690 1 controller.go:312] start watching at 3171522 I1211 07:17:20.838291 1 controller.go:319] apiserver closed stream I1211 07:17:20.840493 1 controller.go:312] start watching at 3171522 I1211 07:18:57.571765 1 controller.go:319] apiserver closed stream I1211 07:18:57.572660 1 controller.go:312] start watching at 3171522 I1211 07:20:43.432198 1 controller.go:319] apiserver closed stream I1211 07:20:43.436328 1 controller.go:312] start watching at 3171522 I1211 07:20:43.945313 1 controller.go:173] finding existing jobs... I1211 07:20:43.947532 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 07:20:43.947581 1 controller.go:89] Starting watch at version %v3172050 I1211 07:20:43.947585 1 controller.go:98] starts running from watch version: 3172050 I1211 07:20:43.948549 1 controller.go:312] start watching at 3172050 I1211 07:22:29.982237 1 controller.go:319] apiserver closed stream I1211 07:22:29.984119 1 controller.go:312] start watching at 3172050 I1211 07:24:16.216896 1 controller.go:319] apiserver closed stream I1211 07:24:16.219245 1 controller.go:312] start watching at 3172050 I1211 07:26:12.355055 1 controller.go:319] apiserver closed stream I1211 07:26:12.359066 1 controller.go:312] start watching at 3172050 I1211 07:26:12.875253 1 controller.go:173] finding existing jobs... I1211 07:26:12.878556 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 07:26:12.878586 1 controller.go:89] Starting watch at version %v3172689 I1211 07:26:12.878589 1 controller.go:98] starts running from watch version: 3172689 I1211 07:26:12.879504 1 controller.go:312] start watching at 3172689 I1211 07:27:27.927786 1 controller.go:319] apiserver closed stream I1211 07:27:27.930487 1 controller.go:312] start watching at 3172689 I1211 07:29:25.590381 1 controller.go:319] apiserver closed stream I1211 07:29:25.592631 1 controller.go:312] start watching at 3172689 I1211 07:31:10.586303 1 controller.go:319] apiserver closed stream I1211 07:31:10.591015 1 controller.go:312] start watching at 3172689 I1211 07:31:11.099611 1 controller.go:173] finding existing jobs... I1211 07:31:11.101849 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 07:31:11.101868 1 controller.go:89] Starting watch at version %v3173272 I1211 07:31:11.101871 1 controller.go:98] starts running from watch version: 3173272 I1211 07:31:11.102803 1 controller.go:312] start watching at 3173272 I1211 07:32:35.803533 1 controller.go:319] apiserver closed stream I1211 07:32:35.804977 1 controller.go:312] start watching at 3173272 I1211 07:34:05.723436 1 controller.go:319] apiserver closed stream I1211 07:34:05.724577 1 controller.go:312] start watching at 3173272 I1211 07:35:22.082530 1 controller.go:319] apiserver closed stream I1211 07:35:22.086381 1 controller.go:312] start watching at 3173272 I1211 07:35:22.595556 1 controller.go:173] finding existing jobs... I1211 07:35:22.598335 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 07:35:22.598353 1 controller.go:89] Starting watch at version %v3173758 I1211 07:35:22.598357 1 controller.go:98] starts running from watch version: 3173758 I1211 07:35:22.599350 1 controller.go:312] start watching at 3173758 I1211 07:36:27.103749 1 controller.go:319] apiserver closed stream I1211 07:36:27.104852 1 controller.go:312] start watching at 3173758 I1211 07:38:06.335705 1 controller.go:319] apiserver closed stream I1211 07:38:06.336917 1 controller.go:312] start watching at 3173758 I1211 07:39:51.633356 1 controller.go:319] apiserver closed stream I1211 07:39:51.639361 1 controller.go:312] start watching at 3173758 I1211 07:39:52.168300 1 controller.go:173] finding existing jobs... I1211 07:39:52.171727 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 07:39:52.171743 1 controller.go:89] Starting watch at version %v3174279 I1211 07:39:52.171746 1 controller.go:98] starts running from watch version: 3174279 I1211 07:39:52.172915 1 controller.go:312] start watching at 3174279 I1211 07:41:29.099891 1 controller.go:319] apiserver closed stream I1211 07:41:29.101010 1 controller.go:312] start watching at 3174279 I1211 07:42:29.280871 1 controller.go:319] apiserver closed stream I1211 07:42:29.282452 1 controller.go:312] start watching at 3174279 I1211 07:44:24.551692 1 controller.go:319] apiserver closed stream I1211 07:44:24.552706 1 controller.go:312] start watching at 3174279 I1211 07:46:23.702940 1 controller.go:319] apiserver closed stream I1211 07:46:23.707586 1 controller.go:312] start watching at 3174279 I1211 07:46:24.216766 1 controller.go:173] finding existing jobs... I1211 07:46:24.219293 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 07:46:24.219312 1 controller.go:89] Starting watch at version %v3175043 I1211 07:46:24.219315 1 controller.go:98] starts running from watch version: 3175043 I1211 07:46:24.220193 1 controller.go:312] start watching at 3175043 I1211 07:47:55.324439 1 controller.go:319] apiserver closed stream I1211 07:47:55.325803 1 controller.go:312] start watching at 3175043 I1211 07:49:03.165434 1 controller.go:319] apiserver closed stream I1211 07:49:03.166396 1 controller.go:312] start watching at 3175043 I1211 07:50:32.287577 1 controller.go:319] apiserver closed stream I1211 07:50:32.291334 1 controller.go:312] start watching at 3175043 I1211 07:50:32.807475 1 controller.go:173] finding existing jobs... I1211 07:50:32.809727 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 07:50:32.809747 1 controller.go:89] Starting watch at version %v3175531 I1211 07:50:32.809751 1 controller.go:98] starts running from watch version: 3175531 I1211 07:50:32.810641 1 controller.go:312] start watching at 3175531 I1211 07:51:47.891693 1 controller.go:319] apiserver closed stream I1211 07:51:47.894117 1 controller.go:312] start watching at 3175531 I1211 07:52:52.553590 1 controller.go:319] apiserver closed stream I1211 07:52:52.554945 1 controller.go:312] start watching at 3175531 I1211 07:54:01.201719 1 controller.go:319] apiserver closed stream I1211 07:54:01.202868 1 controller.go:312] start watching at 3175531 I1211 07:55:23.708428 1 controller.go:319] apiserver closed stream I1211 07:55:23.712373 1 controller.go:312] start watching at 3175531 I1211 07:55:24.220803 1 controller.go:173] finding existing jobs... I1211 07:55:24.223369 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 07:55:24.223386 1 controller.go:89] Starting watch at version %v3176102 I1211 07:55:24.223390 1 controller.go:98] starts running from watch version: 3176102 I1211 07:55:24.224199 1 controller.go:312] start watching at 3176102 I1211 07:56:47.122851 1 controller.go:319] apiserver closed stream I1211 07:56:47.125418 1 controller.go:312] start watching at 3176102 I1211 07:58:11.317618 1 controller.go:319] apiserver closed stream I1211 07:58:11.321090 1 controller.go:312] start watching at 3176102 I1211 07:59:35.060102 1 controller.go:319] apiserver closed stream I1211 07:59:35.062696 1 controller.go:312] start watching at 3176102 I1211 08:01:25.950848 1 controller.go:319] apiserver closed stream I1211 08:01:25.954862 1 controller.go:312] start watching at 3176102 I1211 08:01:26.475588 1 controller.go:173] finding existing jobs... I1211 08:01:26.495810 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 08:01:26.495843 1 controller.go:89] Starting watch at version %v3176807 I1211 08:01:26.495847 1 controller.go:98] starts running from watch version: 3176807 I1211 08:01:26.496982 1 controller.go:312] start watching at 3176807 I1211 08:03:22.010650 1 controller.go:319] apiserver closed stream I1211 08:03:22.014099 1 controller.go:312] start watching at 3176807 I1211 08:05:05.891663 1 controller.go:319] apiserver closed stream I1211 08:05:05.896179 1 controller.go:312] start watching at 3176807 I1211 08:05:06.404856 1 controller.go:173] finding existing jobs... I1211 08:05:06.407033 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 08:05:06.407053 1 controller.go:89] Starting watch at version %v3177238 I1211 08:05:06.407057 1 controller.go:98] starts running from watch version: 3177238 I1211 08:05:06.407986 1 controller.go:312] start watching at 3177238 I1211 08:06:23.786594 1 controller.go:319] apiserver closed stream I1211 08:06:23.787645 1 controller.go:312] start watching at 3177238 I1211 08:08:23.468423 1 controller.go:319] apiserver closed stream I1211 08:08:23.469311 1 controller.go:312] start watching at 3177238 I1211 08:09:24.290331 1 controller.go:319] apiserver closed stream I1211 08:09:24.291955 1 controller.go:312] start watching at 3177238 I1211 08:11:03.856965 1 controller.go:319] apiserver closed stream I1211 08:11:03.861265 1 controller.go:312] start watching at 3177238 I1211 08:11:04.371715 1 controller.go:173] finding existing jobs... I1211 08:11:04.374682 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 08:11:04.374713 1 controller.go:89] Starting watch at version %v3177936 I1211 08:11:04.374717 1 controller.go:98] starts running from watch version: 3177936 I1211 08:11:04.375559 1 controller.go:312] start watching at 3177936 I1211 08:12:22.433530 1 controller.go:319] apiserver closed stream I1211 08:12:22.436027 1 controller.go:312] start watching at 3177936 I1211 08:13:56.670684 1 controller.go:319] apiserver closed stream I1211 08:13:56.673202 1 controller.go:312] start watching at 3177936 I1211 08:15:21.862248 1 controller.go:319] apiserver closed stream I1211 08:15:21.866141 1 controller.go:312] start watching at 3177936 I1211 08:15:22.376513 1 controller.go:173] finding existing jobs... I1211 08:15:22.379814 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 08:15:22.379883 1 controller.go:89] Starting watch at version %v3178439 I1211 08:15:22.379887 1 controller.go:98] starts running from watch version: 3178439 I1211 08:15:22.381006 1 controller.go:312] start watching at 3178439 I1211 08:16:26.531068 1 controller.go:319] apiserver closed stream I1211 08:16:26.532387 1 controller.go:312] start watching at 3178439 I1211 08:18:26.001825 1 controller.go:319] apiserver closed stream I1211 08:18:26.003079 1 controller.go:312] start watching at 3178439 I1211 08:19:29.961412 1 controller.go:319] apiserver closed stream I1211 08:19:29.962540 1 controller.go:312] start watching at 3178439 I1211 08:21:27.087544 1 controller.go:319] apiserver closed stream I1211 08:21:27.092196 1 controller.go:312] start watching at 3178439 I1211 08:21:27.601387 1 controller.go:173] finding existing jobs... I1211 08:21:27.604352 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 08:21:27.604370 1 controller.go:89] Starting watch at version %v3179147 I1211 08:21:27.604373 1 controller.go:98] starts running from watch version: 3179147 I1211 08:21:27.605922 1 controller.go:312] start watching at 3179147 I1211 08:22:42.282438 1 controller.go:319] apiserver closed stream I1211 08:22:42.283550 1 controller.go:312] start watching at 3179147 I1211 08:24:15.992059 1 controller.go:319] apiserver closed stream I1211 08:24:15.993465 1 controller.go:312] start watching at 3179147 I1211 08:25:16.505920 1 controller.go:319] apiserver closed stream I1211 08:25:16.509887 1 controller.go:312] start watching at 3179147 I1211 08:25:17.018183 1 controller.go:173] finding existing jobs... I1211 08:25:17.022079 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 08:25:17.022113 1 controller.go:89] Starting watch at version %v3179590 I1211 08:25:17.022116 1 controller.go:98] starts running from watch version: 3179590 I1211 08:25:17.027102 1 controller.go:312] start watching at 3179590 I1211 08:26:43.080925 1 controller.go:319] apiserver closed stream I1211 08:26:43.082486 1 controller.go:312] start watching at 3179590 I1211 08:28:15.417344 1 controller.go:319] apiserver closed stream I1211 08:28:15.419550 1 controller.go:312] start watching at 3179590 I1211 08:29:30.826733 1 controller.go:319] apiserver closed stream I1211 08:29:30.827914 1 controller.go:312] start watching at 3179590 I1211 08:31:23.121408 1 controller.go:319] apiserver closed stream I1211 08:31:23.125897 1 controller.go:312] start watching at 3179590 I1211 08:31:23.638150 1 controller.go:173] finding existing jobs... I1211 08:31:23.641976 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 08:31:23.641995 1 controller.go:89] Starting watch at version %v3180308 I1211 08:31:23.641999 1 controller.go:98] starts running from watch version: 3180308 I1211 08:31:23.644273 1 controller.go:312] start watching at 3180308 I1211 08:33:06.074447 1 controller.go:319] apiserver closed stream I1211 08:33:06.075570 1 controller.go:312] start watching at 3180308 I1211 08:34:57.678029 1 controller.go:319] apiserver closed stream I1211 08:34:57.683199 1 controller.go:312] start watching at 3180308 I1211 08:34:58.191433 1 controller.go:173] finding existing jobs... I1211 08:34:58.193551 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 08:34:58.193576 1 controller.go:89] Starting watch at version %v3180727 I1211 08:34:58.193579 1 controller.go:98] starts running from watch version: 3180727 I1211 08:34:58.194380 1 controller.go:312] start watching at 3180727 I1211 08:36:47.678458 1 controller.go:319] apiserver closed stream I1211 08:36:47.679551 1 controller.go:312] start watching at 3180727 I1211 08:38:22.624830 1 controller.go:319] apiserver closed stream I1211 08:38:22.626131 1 controller.go:312] start watching at 3180727 I1211 08:39:47.754307 1 controller.go:319] apiserver closed stream I1211 08:39:47.755642 1 controller.go:312] start watching at 3180727 I1211 08:40:52.384780 1 controller.go:319] apiserver closed stream I1211 08:40:52.388684 1 controller.go:312] start watching at 3180727 I1211 08:40:52.897172 1 controller.go:173] finding existing jobs... I1211 08:40:52.899669 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 08:40:52.899729 1 controller.go:89] Starting watch at version %v3181420 I1211 08:40:52.899739 1 controller.go:98] starts running from watch version: 3181420 I1211 08:40:52.900653 1 controller.go:312] start watching at 3181420 I1211 08:42:26.965779 1 controller.go:319] apiserver closed stream I1211 08:42:26.971482 1 controller.go:312] start watching at 3181420 I1211 08:44:24.433013 1 controller.go:319] apiserver closed stream I1211 08:44:24.434129 1 controller.go:312] start watching at 3181420 I1211 08:45:50.984464 1 controller.go:319] apiserver closed stream I1211 08:45:50.988195 1 controller.go:312] start watching at 3181420 I1211 08:45:51.496552 1 controller.go:173] finding existing jobs... I1211 08:45:51.499143 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 08:45:51.499161 1 controller.go:89] Starting watch at version %v3182004 I1211 08:45:51.499164 1 controller.go:98] starts running from watch version: 3182004 I1211 08:45:51.499947 1 controller.go:312] start watching at 3182004 I1211 08:47:33.212777 1 controller.go:319] apiserver closed stream I1211 08:47:33.218127 1 controller.go:312] start watching at 3182004 I1211 08:49:05.311936 1 controller.go:319] apiserver closed stream I1211 08:49:05.313125 1 controller.go:312] start watching at 3182004 I1211 08:50:17.504997 1 controller.go:319] apiserver closed stream I1211 08:50:17.508808 1 controller.go:312] start watching at 3182004 I1211 08:50:18.018546 1 controller.go:173] finding existing jobs... I1211 08:50:18.021297 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 08:50:18.021321 1 controller.go:89] Starting watch at version %v3182518 I1211 08:50:18.021325 1 controller.go:98] starts running from watch version: 3182518 I1211 08:50:18.023287 1 controller.go:312] start watching at 3182518 I1211 08:51:47.660476 1 controller.go:319] apiserver closed stream I1211 08:51:47.661751 1 controller.go:312] start watching at 3182518 I1211 08:53:05.919417 1 controller.go:319] apiserver closed stream I1211 08:53:05.920593 1 controller.go:312] start watching at 3182518 I1211 08:54:25.309488 1 controller.go:319] apiserver closed stream I1211 08:54:25.311511 1 controller.go:312] start watching at 3182518 I1211 08:55:53.462936 1 controller.go:319] apiserver closed stream I1211 08:55:53.467236 1 controller.go:312] start watching at 3182518 I1211 08:55:53.980448 1 controller.go:173] finding existing jobs... I1211 08:55:53.982659 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 08:55:53.982679 1 controller.go:89] Starting watch at version %v3183172 I1211 08:55:53.982683 1 controller.go:98] starts running from watch version: 3183172 I1211 08:55:53.983898 1 controller.go:312] start watching at 3183172 I1211 08:57:46.929425 1 controller.go:319] apiserver closed stream I1211 08:57:46.930677 1 controller.go:312] start watching at 3183172 I1211 08:59:15.264327 1 controller.go:319] apiserver closed stream I1211 08:59:15.265596 1 controller.go:312] start watching at 3183172 I1211 09:00:50.095891 1 controller.go:319] apiserver closed stream I1211 09:00:50.099749 1 controller.go:312] start watching at 3183172 I1211 09:00:50.608619 1 controller.go:173] finding existing jobs... I1211 09:00:50.611183 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 09:00:50.611238 1 controller.go:89] Starting watch at version %v3183751 I1211 09:00:50.611242 1 controller.go:98] starts running from watch version: 3183751 I1211 09:00:50.612171 1 controller.go:312] start watching at 3183751 I1211 09:02:01.390786 1 controller.go:319] apiserver closed stream I1211 09:02:01.392094 1 controller.go:312] start watching at 3183751 I1211 09:03:18.782902 1 controller.go:319] apiserver closed stream I1211 09:03:18.784195 1 controller.go:312] start watching at 3183751 I1211 09:04:31.089666 1 controller.go:319] apiserver closed stream I1211 09:04:31.090938 1 controller.go:312] start watching at 3183751 I1211 09:05:32.885538 1 controller.go:319] apiserver closed stream I1211 09:05:32.888681 1 controller.go:312] start watching at 3183751 I1211 09:05:33.397543 1 controller.go:173] finding existing jobs... I1211 09:05:33.399822 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 09:05:33.399838 1 controller.go:89] Starting watch at version %v3184300 I1211 09:05:33.399841 1 controller.go:98] starts running from watch version: 3184300 I1211 09:05:33.400764 1 controller.go:312] start watching at 3184300 I1211 09:07:27.716017 1 controller.go:319] apiserver closed stream I1211 09:07:27.717024 1 controller.go:312] start watching at 3184300 I1211 09:09:07.913869 1 controller.go:319] apiserver closed stream I1211 09:09:07.915131 1 controller.go:312] start watching at 3184300 I1211 09:10:35.594738 1 controller.go:319] apiserver closed stream I1211 09:10:35.599471 1 controller.go:312] start watching at 3184300 I1211 09:10:36.108674 1 controller.go:173] finding existing jobs... I1211 09:10:36.110887 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 09:10:36.110903 1 controller.go:89] Starting watch at version %v3184888 I1211 09:10:36.110907 1 controller.go:98] starts running from watch version: 3184888 I1211 09:10:36.111896 1 controller.go:312] start watching at 3184888 I1211 09:11:51.993012 1 controller.go:319] apiserver closed stream I1211 09:11:51.994431 1 controller.go:312] start watching at 3184888 I1211 09:12:57.195616 1 controller.go:319] apiserver closed stream I1211 09:12:57.198185 1 controller.go:312] start watching at 3184888 I1211 09:14:46.513198 1 controller.go:319] apiserver closed stream I1211 09:14:46.514342 1 controller.go:312] start watching at 3184888 I1211 09:16:36.887960 1 controller.go:319] apiserver closed stream I1211 09:16:36.889237 1 controller.go:312] start watching at 3184888 I1211 09:16:37.397736 1 controller.go:173] finding existing jobs... I1211 09:16:37.400339 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 09:16:37.400357 1 controller.go:89] Starting watch at version %v3185591 I1211 09:16:37.400361 1 controller.go:98] starts running from watch version: 3185591 I1211 09:16:37.401177 1 controller.go:312] start watching at 3185591 I1211 09:18:29.412098 1 controller.go:319] apiserver closed stream I1211 09:18:29.413156 1 controller.go:312] start watching at 3185591 I1211 09:19:56.676131 1 controller.go:319] apiserver closed stream I1211 09:19:56.716002 1 controller.go:312] start watching at 3185591 I1211 09:19:57.224800 1 controller.go:173] finding existing jobs... I1211 09:19:57.227421 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 09:19:57.227459 1 controller.go:89] Starting watch at version %v3185990 I1211 09:19:57.227462 1 controller.go:98] starts running from watch version: 3185990 I1211 09:19:57.228358 1 controller.go:312] start watching at 3185990 I1211 09:21:47.565339 1 controller.go:319] apiserver closed stream I1211 09:21:47.566627 1 controller.go:312] start watching at 3185990 I1211 09:22:55.717745 1 controller.go:319] apiserver closed stream I1211 09:22:55.721289 1 controller.go:312] start watching at 3185990 I1211 09:24:43.111266 1 controller.go:319] apiserver closed stream I1211 09:24:43.112280 1 controller.go:312] start watching at 3185990 I1211 09:26:37.689007 1 controller.go:319] apiserver closed stream I1211 09:26:37.690042 1 controller.go:312] start watching at 3185990 I1211 09:26:38.198890 1 controller.go:173] finding existing jobs... I1211 09:26:38.201346 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 09:26:38.201380 1 controller.go:89] Starting watch at version %v3186766 I1211 09:26:38.201412 1 controller.go:98] starts running from watch version: 3186766 I1211 09:26:38.202412 1 controller.go:312] start watching at 3186766 I1211 09:28:06.628087 1 controller.go:319] apiserver closed stream I1211 09:28:06.632416 1 controller.go:312] start watching at 3186766 I1211 09:29:49.867964 1 controller.go:319] apiserver closed stream I1211 09:29:49.869270 1 controller.go:312] start watching at 3186766 I1211 09:31:28.612341 1 controller.go:319] apiserver closed stream I1211 09:31:28.616786 1 controller.go:312] start watching at 3186766 I1211 09:31:29.126088 1 controller.go:173] finding existing jobs... I1211 09:31:29.128673 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 09:31:29.128691 1 controller.go:89] Starting watch at version %v3187334 I1211 09:31:29.128694 1 controller.go:98] starts running from watch version: 3187334 I1211 09:31:29.129600 1 controller.go:312] start watching at 3187334 I1211 09:33:23.983706 1 controller.go:319] apiserver closed stream I1211 09:33:23.984791 1 controller.go:312] start watching at 3187334 I1211 09:34:42.951263 1 controller.go:319] apiserver closed stream I1211 09:34:42.952363 1 controller.go:312] start watching at 3187334 I1211 09:36:25.428200 1 controller.go:319] apiserver closed stream I1211 09:36:25.431786 1 controller.go:312] start watching at 3187334 I1211 09:36:25.955120 1 controller.go:173] finding existing jobs... I1211 09:36:25.957806 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 09:36:25.957822 1 controller.go:89] Starting watch at version %v3187916 I1211 09:36:25.957826 1 controller.go:98] starts running from watch version: 3187916 I1211 09:36:25.978132 1 controller.go:312] start watching at 3187916 I1211 09:37:46.061671 1 controller.go:319] apiserver closed stream I1211 09:37:46.063353 1 controller.go:312] start watching at 3187916 I1211 09:39:15.942498 1 controller.go:319] apiserver closed stream I1211 09:39:15.945994 1 controller.go:312] start watching at 3187916 I1211 09:41:10.250653 1 controller.go:319] apiserver closed stream I1211 09:41:10.267436 1 controller.go:312] start watching at 3187916 I1211 09:41:10.821125 1 controller.go:173] finding existing jobs... I1211 09:41:10.841111 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 09:41:10.841131 1 controller.go:89] Starting watch at version %v3188468 I1211 09:41:10.841135 1 controller.go:98] starts running from watch version: 3188468 I1211 09:41:10.842057 1 controller.go:312] start watching at 3188468 I1211 09:43:09.118282 1 controller.go:319] apiserver closed stream I1211 09:43:09.119393 1 controller.go:312] start watching at 3188468 I1211 09:44:56.587248 1 controller.go:319] apiserver closed stream I1211 09:44:56.590969 1 controller.go:312] start watching at 3188468 I1211 09:44:57.099704 1 controller.go:173] finding existing jobs... I1211 09:44:57.101971 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 09:44:57.102004 1 controller.go:89] Starting watch at version %v3188910 I1211 09:44:57.102007 1 controller.go:98] starts running from watch version: 3188910 I1211 09:44:57.102887 1 controller.go:312] start watching at 3188910 I1211 09:46:17.744660 1 controller.go:319] apiserver closed stream I1211 09:46:17.747073 1 controller.go:312] start watching at 3188910 I1211 09:47:28.690353 1 controller.go:319] apiserver closed stream I1211 09:47:28.691408 1 controller.go:312] start watching at 3188910 I1211 09:49:21.218662 1 controller.go:319] apiserver closed stream I1211 09:49:21.219993 1 controller.go:312] start watching at 3188910 I1211 09:51:19.404088 1 controller.go:319] apiserver closed stream I1211 09:51:19.408411 1 controller.go:312] start watching at 3188910 I1211 09:51:19.917655 1 controller.go:173] finding existing jobs... I1211 09:51:19.920323 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 09:51:19.920339 1 controller.go:89] Starting watch at version %v3189657 I1211 09:51:19.920343 1 controller.go:98] starts running from watch version: 3189657 I1211 09:51:19.921974 1 controller.go:312] start watching at 3189657 I1211 09:52:24.437949 1 controller.go:319] apiserver closed stream I1211 09:52:24.439109 1 controller.go:312] start watching at 3189657 I1211 09:54:21.295921 1 controller.go:319] apiserver closed stream I1211 09:54:21.297107 1 controller.go:312] start watching at 3189657 I1211 09:55:36.598835 1 controller.go:319] apiserver closed stream I1211 09:55:36.602750 1 controller.go:312] start watching at 3189657 I1211 09:55:37.112032 1 controller.go:173] finding existing jobs... I1211 09:55:37.114638 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 09:55:37.114658 1 controller.go:89] Starting watch at version %v3190158 I1211 09:55:37.114662 1 controller.go:98] starts running from watch version: 3190158 I1211 09:55:37.116973 1 controller.go:312] start watching at 3190158 I1211 09:56:58.970836 1 controller.go:319] apiserver closed stream I1211 09:56:58.972035 1 controller.go:312] start watching at 3190158 I1211 09:58:14.492329 1 controller.go:319] apiserver closed stream I1211 09:58:14.493276 1 controller.go:312] start watching at 3190158 I1211 09:59:39.601487 1 controller.go:319] apiserver closed stream I1211 09:59:39.602801 1 controller.go:312] start watching at 3190158 I1211 10:01:27.853235 1 controller.go:319] apiserver closed stream I1211 10:01:27.857277 1 controller.go:312] start watching at 3190158 I1211 10:01:28.366445 1 controller.go:173] finding existing jobs... I1211 10:01:28.369290 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 10:01:28.369308 1 controller.go:89] Starting watch at version %v3190840 I1211 10:01:28.369311 1 controller.go:98] starts running from watch version: 3190840 I1211 10:01:28.370667 1 controller.go:312] start watching at 3190840 I1211 10:03:17.335060 1 controller.go:319] apiserver closed stream I1211 10:03:17.337346 1 controller.go:312] start watching at 3190840 I1211 10:04:37.988372 1 controller.go:319] apiserver closed stream I1211 10:04:37.989515 1 controller.go:312] start watching at 3190840 I1211 10:06:10.062100 1 controller.go:319] apiserver closed stream I1211 10:06:10.065937 1 controller.go:312] start watching at 3190840 I1211 10:06:10.574365 1 controller.go:173] finding existing jobs... I1211 10:06:10.577441 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 10:06:10.577459 1 controller.go:89] Starting watch at version %v3191391 I1211 10:06:10.577463 1 controller.go:98] starts running from watch version: 3191391 I1211 10:06:10.578358 1 controller.go:312] start watching at 3191391 I1211 10:07:23.734355 1 controller.go:319] apiserver closed stream I1211 10:07:23.735758 1 controller.go:312] start watching at 3191391 I1211 10:09:02.777315 1 controller.go:319] apiserver closed stream I1211 10:09:02.778432 1 controller.go:312] start watching at 3191391 I1211 10:10:44.946908 1 controller.go:319] apiserver closed stream I1211 10:10:44.950544 1 controller.go:312] start watching at 3191391 I1211 10:10:45.459046 1 controller.go:173] finding existing jobs... I1211 10:10:45.461662 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 10:10:45.461691 1 controller.go:89] Starting watch at version %v3191927 I1211 10:10:45.461695 1 controller.go:98] starts running from watch version: 3191927 I1211 10:10:45.462594 1 controller.go:312] start watching at 3191927 I1211 10:12:08.814369 1 controller.go:319] apiserver closed stream I1211 10:12:08.815513 1 controller.go:312] start watching at 3191927 I1211 10:14:01.376957 1 controller.go:319] apiserver closed stream I1211 10:14:01.378142 1 controller.go:312] start watching at 3191927 I1211 10:15:06.944198 1 controller.go:319] apiserver closed stream I1211 10:15:06.948888 1 controller.go:312] start watching at 3191927 I1211 10:15:07.457389 1 controller.go:173] finding existing jobs... I1211 10:15:07.459611 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 10:15:07.459627 1 controller.go:89] Starting watch at version %v3192432 I1211 10:15:07.459631 1 controller.go:98] starts running from watch version: 3192432 I1211 10:15:07.460517 1 controller.go:312] start watching at 3192432 I1211 10:16:51.728009 1 controller.go:319] apiserver closed stream I1211 10:16:51.729203 1 controller.go:312] start watching at 3192432 I1211 10:18:27.731655 1 controller.go:319] apiserver closed stream I1211 10:18:27.732683 1 controller.go:312] start watching at 3192432 I1211 10:20:18.013558 1 controller.go:319] apiserver closed stream I1211 10:20:18.019971 1 controller.go:312] start watching at 3192432 I1211 10:20:18.538180 1 controller.go:173] finding existing jobs... I1211 10:20:18.553577 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 10:20:18.553607 1 controller.go:89] Starting watch at version %v3193042 I1211 10:20:18.553610 1 controller.go:98] starts running from watch version: 3193042 I1211 10:20:18.554720 1 controller.go:312] start watching at 3193042 I1211 10:21:23.898387 1 controller.go:319] apiserver closed stream I1211 10:21:23.899574 1 controller.go:312] start watching at 3193042 I1211 10:23:11.790584 1 controller.go:319] apiserver closed stream I1211 10:23:11.792520 1 controller.go:312] start watching at 3193042 I1211 10:24:51.119358 1 controller.go:319] apiserver closed stream I1211 10:24:51.120431 1 controller.go:312] start watching at 3193042 I1211 10:26:34.733413 1 controller.go:319] apiserver closed stream I1211 10:26:34.736934 1 controller.go:312] start watching at 3193042 I1211 10:26:35.252333 1 controller.go:173] finding existing jobs... I1211 10:26:35.255473 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 10:26:35.255489 1 controller.go:89] Starting watch at version %v3193779 I1211 10:26:35.255492 1 controller.go:98] starts running from watch version: 3193779 I1211 10:26:35.256510 1 controller.go:312] start watching at 3193779 I1211 10:27:51.563113 1 controller.go:319] apiserver closed stream I1211 10:27:51.564795 1 controller.go:312] start watching at 3193779 I1211 10:29:20.439988 1 controller.go:319] apiserver closed stream I1211 10:29:20.442124 1 controller.go:312] start watching at 3193779 I1211 10:31:13.609032 1 controller.go:319] apiserver closed stream I1211 10:31:13.613336 1 controller.go:312] start watching at 3193779 I1211 10:31:14.122744 1 controller.go:173] finding existing jobs... I1211 10:31:14.125575 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 10:31:14.125595 1 controller.go:89] Starting watch at version %v3194320 I1211 10:31:14.125599 1 controller.go:98] starts running from watch version: 3194320 I1211 10:31:14.126423 1 controller.go:312] start watching at 3194320 I1211 10:32:50.991659 1 controller.go:319] apiserver closed stream I1211 10:32:50.992770 1 controller.go:312] start watching at 3194320 I1211 10:34:43.151199 1 controller.go:319] apiserver closed stream I1211 10:34:43.154114 1 controller.go:312] start watching at 3194320 I1211 10:36:19.755202 1 controller.go:319] apiserver closed stream I1211 10:36:19.777647 1 controller.go:312] start watching at 3194320 I1211 10:36:20.289731 1 controller.go:173] finding existing jobs... I1211 10:36:20.292581 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 10:36:20.292599 1 controller.go:89] Starting watch at version %v3194917 I1211 10:36:20.292602 1 controller.go:98] starts running from watch version: 3194917 I1211 10:36:20.293498 1 controller.go:312] start watching at 3194917 I1211 10:38:17.018277 1 controller.go:319] apiserver closed stream I1211 10:38:17.020989 1 controller.go:312] start watching at 3194917 I1211 10:39:17.890880 1 controller.go:319] apiserver closed stream I1211 10:39:17.892139 1 controller.go:312] start watching at 3194917 I1211 10:40:47.925772 1 controller.go:319] apiserver closed stream I1211 10:40:47.929342 1 controller.go:312] start watching at 3194917 I1211 10:40:48.437730 1 controller.go:173] finding existing jobs... I1211 10:40:48.440164 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 10:40:48.440184 1 controller.go:89] Starting watch at version %v3195441 I1211 10:40:48.440187 1 controller.go:98] starts running from watch version: 3195441 I1211 10:40:48.441125 1 controller.go:312] start watching at 3195441 I1211 10:42:12.309586 1 controller.go:319] apiserver closed stream I1211 10:42:12.311065 1 controller.go:312] start watching at 3195441 I1211 10:43:32.574727 1 controller.go:319] apiserver closed stream I1211 10:43:32.575889 1 controller.go:312] start watching at 3195441 I1211 10:44:50.485568 1 controller.go:319] apiserver closed stream I1211 10:44:50.486612 1 controller.go:312] start watching at 3195441 I1211 10:46:19.650521 1 controller.go:319] apiserver closed stream I1211 10:46:19.651533 1 controller.go:312] start watching at 3195441 I1211 10:46:20.161491 1 controller.go:173] finding existing jobs... I1211 10:46:20.177513 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 10:46:20.177535 1 controller.go:89] Starting watch at version %v3196083 I1211 10:46:20.177539 1 controller.go:98] starts running from watch version: 3196083 I1211 10:46:20.178714 1 controller.go:312] start watching at 3196083 I1211 10:47:34.047969 1 controller.go:319] apiserver closed stream I1211 10:47:34.049194 1 controller.go:312] start watching at 3196083 I1211 10:48:53.875267 1 controller.go:319] apiserver closed stream I1211 10:48:53.880529 1 controller.go:312] start watching at 3196083 I1211 10:50:10.449742 1 controller.go:319] apiserver closed stream I1211 10:50:10.454090 1 controller.go:312] start watching at 3196083 I1211 10:50:10.963046 1 controller.go:173] finding existing jobs... I1211 10:50:10.965292 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 10:50:10.965308 1 controller.go:89] Starting watch at version %v3196535 I1211 10:50:10.965311 1 controller.go:98] starts running from watch version: 3196535 I1211 10:50:10.966172 1 controller.go:312] start watching at 3196535 I1211 10:51:21.981183 1 controller.go:319] apiserver closed stream I1211 10:51:21.982704 1 controller.go:312] start watching at 3196535 I1211 10:53:17.767722 1 controller.go:319] apiserver closed stream I1211 10:53:17.770038 1 controller.go:312] start watching at 3196535 I1211 10:54:49.532615 1 controller.go:319] apiserver closed stream I1211 10:54:49.533694 1 controller.go:312] start watching at 3196535 I1211 10:56:32.269243 1 controller.go:319] apiserver closed stream I1211 10:56:32.272965 1 controller.go:312] start watching at 3196535 I1211 10:56:32.781557 1 controller.go:173] finding existing jobs... I1211 10:56:32.784242 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 10:56:32.784259 1 controller.go:89] Starting watch at version %v3197281 I1211 10:56:32.784262 1 controller.go:98] starts running from watch version: 3197281 I1211 10:56:32.785141 1 controller.go:312] start watching at 3197281 I1211 10:57:36.870844 1 controller.go:319] apiserver closed stream I1211 10:57:36.872019 1 controller.go:312] start watching at 3197281 I1211 10:58:58.935458 1 controller.go:319] apiserver closed stream I1211 10:58:58.939275 1 controller.go:312] start watching at 3197281 I1211 11:00:34.641178 1 controller.go:319] apiserver closed stream I1211 11:00:34.642528 1 controller.go:312] start watching at 3197281 I1211 11:00:35.169944 1 controller.go:173] finding existing jobs... I1211 11:00:35.172745 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 11:00:35.172763 1 controller.go:89] Starting watch at version %v3197750 I1211 11:00:35.172767 1 controller.go:98] starts running from watch version: 3197750 I1211 11:00:35.173659 1 controller.go:312] start watching at 3197750 I1211 11:02:23.466410 1 controller.go:319] apiserver closed stream I1211 11:02:23.467533 1 controller.go:312] start watching at 3197750 I1211 11:03:27.523284 1 controller.go:319] apiserver closed stream I1211 11:03:27.524484 1 controller.go:312] start watching at 3197750 I1211 11:05:22.720188 1 controller.go:319] apiserver closed stream I1211 11:05:22.724389 1 controller.go:312] start watching at 3197750 I1211 11:05:23.233493 1 controller.go:173] finding existing jobs... I1211 11:05:23.235897 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 11:05:23.235940 1 controller.go:89] Starting watch at version %v3198314 I1211 11:05:23.235944 1 controller.go:98] starts running from watch version: 3198314 I1211 11:05:23.236943 1 controller.go:312] start watching at 3198314 I1211 11:07:08.302191 1 controller.go:319] apiserver closed stream I1211 11:07:08.303481 1 controller.go:312] start watching at 3198314 I1211 11:08:42.685521 1 controller.go:319] apiserver closed stream I1211 11:08:42.688117 1 controller.go:312] start watching at 3198314 I1211 11:10:20.212789 1 controller.go:319] apiserver closed stream I1211 11:10:20.216197 1 controller.go:312] start watching at 3198314 I1211 11:10:20.725192 1 controller.go:173] finding existing jobs... I1211 11:10:20.728648 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 11:10:20.728665 1 controller.go:89] Starting watch at version %v3198897 I1211 11:10:20.728668 1 controller.go:98] starts running from watch version: 3198897 I1211 11:10:20.729684 1 controller.go:312] start watching at 3198897 I1211 11:11:21.913574 1 controller.go:319] apiserver closed stream I1211 11:11:21.914580 1 controller.go:312] start watching at 3198897 I1211 11:12:31.769730 1 controller.go:319] apiserver closed stream I1211 11:12:31.772360 1 controller.go:312] start watching at 3198897 I1211 11:13:50.943132 1 controller.go:319] apiserver closed stream I1211 11:13:50.944186 1 controller.go:312] start watching at 3198897 I1211 11:15:17.671380 1 controller.go:319] apiserver closed stream I1211 11:15:17.675444 1 controller.go:312] start watching at 3198897 I1211 11:15:18.185052 1 controller.go:173] finding existing jobs... I1211 11:15:18.187729 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 11:15:18.187779 1 controller.go:89] Starting watch at version %v3199479 I1211 11:15:18.187783 1 controller.go:98] starts running from watch version: 3199479 I1211 11:15:18.188724 1 controller.go:312] start watching at 3199479 I1211 11:16:55.658134 1 controller.go:319] apiserver closed stream I1211 11:16:55.660508 1 controller.go:312] start watching at 3199479 I1211 11:18:28.122745 1 controller.go:319] apiserver closed stream I1211 11:18:28.125178 1 controller.go:312] start watching at 3199479 I1211 11:19:38.315072 1 controller.go:319] apiserver closed stream I1211 11:19:38.317380 1 controller.go:312] start watching at 3199479 I1211 11:20:52.402670 1 controller.go:319] apiserver closed stream I1211 11:20:52.403875 1 controller.go:312] start watching at 3199479 I1211 11:20:52.912516 1 controller.go:173] finding existing jobs... I1211 11:20:52.915076 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 11:20:52.915089 1 controller.go:89] Starting watch at version %v3200129 I1211 11:20:52.915092 1 controller.go:98] starts running from watch version: 3200129 I1211 11:20:52.916060 1 controller.go:312] start watching at 3200129 I1211 11:22:06.068811 1 controller.go:319] apiserver closed stream I1211 11:22:06.070279 1 controller.go:312] start watching at 3200129 I1211 11:23:56.876862 1 controller.go:319] apiserver closed stream I1211 11:23:56.878166 1 controller.go:312] start watching at 3200129 I1211 11:25:21.026937 1 controller.go:319] apiserver closed stream I1211 11:25:21.034324 1 controller.go:312] start watching at 3200129 I1211 11:25:21.548237 1 controller.go:173] finding existing jobs... I1211 11:25:21.551862 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 11:25:21.551878 1 controller.go:89] Starting watch at version %v3200653 I1211 11:25:21.551882 1 controller.go:98] starts running from watch version: 3200653 I1211 11:25:21.552863 1 controller.go:312] start watching at 3200653 I1211 11:26:32.769813 1 controller.go:319] apiserver closed stream I1211 11:26:32.770954 1 controller.go:312] start watching at 3200653 I1211 11:28:25.518775 1 controller.go:319] apiserver closed stream I1211 11:28:25.520861 1 controller.go:312] start watching at 3200653 I1211 11:30:09.789247 1 controller.go:319] apiserver closed stream I1211 11:30:09.793238 1 controller.go:312] start watching at 3200653 I1211 11:30:10.365357 1 controller.go:173] finding existing jobs... I1211 11:30:10.373208 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 11:30:10.373243 1 controller.go:89] Starting watch at version %v3201212 I1211 11:30:10.373247 1 controller.go:98] starts running from watch version: 3201212 I1211 11:30:10.381114 1 controller.go:312] start watching at 3201212 I1211 11:31:35.810776 1 controller.go:319] apiserver closed stream I1211 11:31:35.812012 1 controller.go:312] start watching at 3201212 I1211 11:33:18.869322 1 controller.go:319] apiserver closed stream I1211 11:33:18.870921 1 controller.go:312] start watching at 3201212 I1211 11:34:18.873410 1 controller.go:319] apiserver closed stream I1211 11:34:18.875189 1 controller.go:312] start watching at 3201212 I1211 11:36:10.785765 1 controller.go:319] apiserver closed stream I1211 11:36:10.786878 1 controller.go:312] start watching at 3201212 I1211 11:36:11.295554 1 controller.go:173] finding existing jobs... I1211 11:36:11.298430 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 11:36:11.298446 1 controller.go:89] Starting watch at version %v3201917 I1211 11:36:11.298449 1 controller.go:98] starts running from watch version: 3201917 I1211 11:36:11.299302 1 controller.go:312] start watching at 3201917 I1211 11:37:16.280156 1 controller.go:319] apiserver closed stream I1211 11:37:16.282236 1 controller.go:312] start watching at 3201917 I1211 11:38:36.336932 1 controller.go:319] apiserver closed stream I1211 11:38:36.338145 1 controller.go:312] start watching at 3201917 I1211 11:40:13.429980 1 controller.go:319] apiserver closed stream I1211 11:40:13.433591 1 controller.go:312] start watching at 3201917 I1211 11:40:13.971566 1 controller.go:173] finding existing jobs... I1211 11:40:13.992517 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 11:40:13.992531 1 controller.go:89] Starting watch at version %v3202387 I1211 11:40:13.992534 1 controller.go:98] starts running from watch version: 3202387 I1211 11:40:13.993571 1 controller.go:312] start watching at 3202387 I1211 11:41:58.778469 1 controller.go:319] apiserver closed stream I1211 11:41:58.779663 1 controller.go:312] start watching at 3202387 I1211 11:42:59.272470 1 controller.go:319] apiserver closed stream I1211 11:42:59.274916 1 controller.go:312] start watching at 3202387 I1211 11:44:34.125071 1 controller.go:319] apiserver closed stream I1211 11:44:34.126305 1 controller.go:312] start watching at 3202387 I1211 11:46:22.257161 1 controller.go:319] apiserver closed stream I1211 11:46:22.261341 1 controller.go:312] start watching at 3202387 I1211 11:46:22.775317 1 controller.go:173] finding existing jobs... I1211 11:46:22.777487 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 11:46:22.777503 1 controller.go:89] Starting watch at version %v3203108 I1211 11:46:22.777507 1 controller.go:98] starts running from watch version: 3203108 I1211 11:46:22.778303 1 controller.go:312] start watching at 3203108 I1211 11:47:59.640324 1 controller.go:319] apiserver closed stream I1211 11:47:59.641555 1 controller.go:312] start watching at 3203108 I1211 11:49:06.249104 1 controller.go:319] apiserver closed stream I1211 11:49:06.251893 1 controller.go:312] start watching at 3203108 I1211 11:50:58.645673 1 controller.go:319] apiserver closed stream I1211 11:50:58.646978 1 controller.go:312] start watching at 3203108 I1211 11:50:59.155456 1 controller.go:173] finding existing jobs... I1211 11:50:59.157610 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 11:50:59.157627 1 controller.go:89] Starting watch at version %v3203645 I1211 11:50:59.157646 1 controller.go:98] starts running from watch version: 3203645 I1211 11:50:59.158577 1 controller.go:312] start watching at 3203645 I1211 11:52:14.061924 1 controller.go:319] apiserver closed stream I1211 11:52:14.063115 1 controller.go:312] start watching at 3203645 I1211 11:53:35.389973 1 controller.go:319] apiserver closed stream I1211 11:53:35.391027 1 controller.go:312] start watching at 3203645 I1211 11:55:08.684407 1 controller.go:319] apiserver closed stream I1211 11:55:08.687974 1 controller.go:312] start watching at 3203645 I1211 11:55:09.196926 1 controller.go:173] finding existing jobs... I1211 11:55:09.199390 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 11:55:09.199410 1 controller.go:89] Starting watch at version %v3204135 I1211 11:55:09.199415 1 controller.go:98] starts running from watch version: 3204135 I1211 11:55:09.217711 1 controller.go:312] start watching at 3204135 I1211 11:56:46.029332 1 controller.go:319] apiserver closed stream I1211 11:56:46.030633 1 controller.go:312] start watching at 3204135 I1211 11:57:49.998555 1 controller.go:319] apiserver closed stream I1211 11:57:49.999702 1 controller.go:312] start watching at 3204135 I1211 11:59:14.328515 1 controller.go:319] apiserver closed stream I1211 11:59:14.329694 1 controller.go:312] start watching at 3204135 I1211 12:00:55.238153 1 controller.go:319] apiserver closed stream I1211 12:00:55.241870 1 controller.go:312] start watching at 3204135 I1211 12:00:55.752083 1 controller.go:173] finding existing jobs... I1211 12:00:55.754414 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 12:00:55.754432 1 controller.go:89] Starting watch at version %v3204810 I1211 12:00:55.754435 1 controller.go:98] starts running from watch version: 3204810 I1211 12:00:55.755318 1 controller.go:312] start watching at 3204810 I1211 12:02:12.854284 1 controller.go:319] apiserver closed stream I1211 12:02:12.855847 1 controller.go:312] start watching at 3204810 I1211 12:03:42.857709 1 controller.go:319] apiserver closed stream I1211 12:03:42.860094 1 controller.go:312] start watching at 3204810 I1211 12:04:55.501357 1 controller.go:319] apiserver closed stream I1211 12:04:55.506363 1 controller.go:312] start watching at 3204810 I1211 12:04:56.042110 1 controller.go:173] finding existing jobs... I1211 12:04:56.044538 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 12:04:56.044558 1 controller.go:89] Starting watch at version %v3205283 I1211 12:04:56.044561 1 controller.go:98] starts running from watch version: 3205283 I1211 12:04:56.045687 1 controller.go:312] start watching at 3205283 I1211 12:06:19.827424 1 controller.go:319] apiserver closed stream I1211 12:06:19.828628 1 controller.go:312] start watching at 3205283 I1211 12:07:33.287361 1 controller.go:319] apiserver closed stream I1211 12:07:33.288605 1 controller.go:312] start watching at 3205283 I1211 12:08:55.943950 1 controller.go:319] apiserver closed stream I1211 12:08:55.946334 1 controller.go:312] start watching at 3205283 I1211 12:10:11.625180 1 controller.go:319] apiserver closed stream I1211 12:10:11.629314 1 controller.go:312] start watching at 3205283 I1211 12:10:12.139199 1 controller.go:173] finding existing jobs... I1211 12:10:12.141409 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 12:10:12.141424 1 controller.go:89] Starting watch at version %v3205894 I1211 12:10:12.141427 1 controller.go:98] starts running from watch version: 3205894 I1211 12:10:12.142295 1 controller.go:312] start watching at 3205894 I1211 12:11:23.001964 1 controller.go:319] apiserver closed stream I1211 12:11:23.003235 1 controller.go:312] start watching at 3205894 I1211 12:12:40.367039 1 controller.go:319] apiserver closed stream I1211 12:12:40.368295 1 controller.go:312] start watching at 3205894 I1211 12:14:07.547117 1 controller.go:319] apiserver closed stream I1211 12:14:07.548197 1 controller.go:312] start watching at 3205894 I1211 12:15:38.436770 1 controller.go:319] apiserver closed stream I1211 12:15:38.440473 1 controller.go:312] start watching at 3205894 I1211 12:15:38.950789 1 controller.go:173] finding existing jobs... I1211 12:15:38.954460 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 12:15:38.954474 1 controller.go:89] Starting watch at version %v3206528 I1211 12:15:38.954477 1 controller.go:98] starts running from watch version: 3206528 I1211 12:15:38.955380 1 controller.go:312] start watching at 3206528 I1211 12:16:56.337170 1 controller.go:319] apiserver closed stream I1211 12:16:56.339251 1 controller.go:312] start watching at 3206528 I1211 12:18:09.375946 1 controller.go:319] apiserver closed stream I1211 12:18:09.377270 1 controller.go:312] start watching at 3206528 I1211 12:19:56.049895 1 controller.go:319] apiserver closed stream I1211 12:19:56.053032 1 controller.go:312] start watching at 3206528 I1211 12:19:56.561587 1 controller.go:173] finding existing jobs... I1211 12:19:56.563747 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 12:19:56.563780 1 controller.go:89] Starting watch at version %v3207027 I1211 12:19:56.563844 1 controller.go:98] starts running from watch version: 3207027 I1211 12:19:56.564888 1 controller.go:312] start watching at 3207027 I1211 12:21:43.374889 1 controller.go:319] apiserver closed stream I1211 12:21:43.377515 1 controller.go:312] start watching at 3207027 I1211 12:23:31.137252 1 controller.go:319] apiserver closed stream I1211 12:23:31.140348 1 controller.go:312] start watching at 3207027 I1211 12:25:11.639331 1 controller.go:319] apiserver closed stream I1211 12:25:11.643052 1 controller.go:312] start watching at 3207027 I1211 12:25:12.169407 1 controller.go:173] finding existing jobs... I1211 12:25:12.189800 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 12:25:12.189817 1 controller.go:89] Starting watch at version %v3207646 I1211 12:25:12.189821 1 controller.go:98] starts running from watch version: 3207646 I1211 12:25:12.190726 1 controller.go:312] start watching at 3207646 I1211 12:26:14.753648 1 controller.go:319] apiserver closed stream I1211 12:26:14.754894 1 controller.go:312] start watching at 3207646 I1211 12:27:41.620294 1 controller.go:319] apiserver closed stream I1211 12:27:41.623046 1 controller.go:312] start watching at 3207646 I1211 12:28:52.551135 1 controller.go:319] apiserver closed stream I1211 12:28:52.553974 1 controller.go:312] start watching at 3207646 I1211 12:30:08.682686 1 controller.go:319] apiserver closed stream I1211 12:30:08.686874 1 controller.go:312] start watching at 3207646 I1211 12:30:09.197625 1 controller.go:173] finding existing jobs... I1211 12:30:09.202729 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 12:30:09.202765 1 controller.go:89] Starting watch at version %v3208226 I1211 12:30:09.202768 1 controller.go:98] starts running from watch version: 3208226 I1211 12:30:09.206270 1 controller.go:312] start watching at 3208226 I1211 12:31:38.374757 1 controller.go:319] apiserver closed stream I1211 12:31:38.395739 1 controller.go:312] start watching at 3208226 I1211 12:32:53.274044 1 controller.go:319] apiserver closed stream I1211 12:32:53.275193 1 controller.go:312] start watching at 3208226 I1211 12:34:25.606341 1 controller.go:319] apiserver closed stream I1211 12:34:25.608445 1 controller.go:312] start watching at 3208226 I1211 12:35:38.819947 1 controller.go:319] apiserver closed stream I1211 12:35:38.823892 1 controller.go:312] start watching at 3208226 I1211 12:35:39.336388 1 controller.go:173] finding existing jobs... I1211 12:35:39.339352 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 12:35:39.339368 1 controller.go:89] Starting watch at version %v3208873 I1211 12:35:39.339372 1 controller.go:98] starts running from watch version: 3208873 I1211 12:35:39.340387 1 controller.go:312] start watching at 3208873 I1211 12:36:59.843612 1 controller.go:319] apiserver closed stream I1211 12:36:59.844839 1 controller.go:312] start watching at 3208873 I1211 12:38:06.617649 1 controller.go:319] apiserver closed stream I1211 12:38:06.619010 1 controller.go:312] start watching at 3208873 I1211 12:39:45.299564 1 controller.go:319] apiserver closed stream I1211 12:39:45.300614 1 controller.go:312] start watching at 3208873 I1211 12:40:57.956386 1 controller.go:319] apiserver closed stream I1211 12:40:57.960496 1 controller.go:312] start watching at 3208873 I1211 12:40:58.469243 1 controller.go:173] finding existing jobs... I1211 12:40:58.471492 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 12:40:58.471510 1 controller.go:89] Starting watch at version %v3209495 I1211 12:40:58.471514 1 controller.go:98] starts running from watch version: 3209495 I1211 12:40:58.472606 1 controller.go:312] start watching at 3209495 I1211 12:42:24.971185 1 controller.go:319] apiserver closed stream I1211 12:42:24.972175 1 controller.go:312] start watching at 3209495 I1211 12:44:20.589930 1 controller.go:319] apiserver closed stream I1211 12:44:20.591122 1 controller.go:312] start watching at 3209495 I1211 12:45:28.493946 1 controller.go:319] apiserver closed stream I1211 12:45:28.497563 1 controller.go:312] start watching at 3209495 I1211 12:45:29.006586 1 controller.go:173] finding existing jobs... I1211 12:45:29.008902 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 12:45:29.008932 1 controller.go:89] Starting watch at version %v3210021 I1211 12:45:29.008936 1 controller.go:98] starts running from watch version: 3210021 I1211 12:45:29.009925 1 controller.go:312] start watching at 3210021 I1211 12:47:15.526451 1 controller.go:319] apiserver closed stream I1211 12:47:15.528998 1 controller.go:312] start watching at 3210021 I1211 12:48:43.729295 1 controller.go:319] apiserver closed stream I1211 12:48:43.730845 1 controller.go:312] start watching at 3210021 I1211 12:50:26.951763 1 controller.go:319] apiserver closed stream I1211 12:50:26.955416 1 controller.go:312] start watching at 3210021 I1211 12:50:27.467071 1 controller.go:173] finding existing jobs... I1211 12:50:27.486874 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 12:50:27.486906 1 controller.go:89] Starting watch at version %v3210604 I1211 12:50:27.486909 1 controller.go:98] starts running from watch version: 3210604 I1211 12:50:27.487985 1 controller.go:312] start watching at 3210604 I1211 12:51:37.889202 1 controller.go:319] apiserver closed stream I1211 12:51:37.891899 1 controller.go:312] start watching at 3210604 I1211 12:53:34.772261 1 controller.go:319] apiserver closed stream I1211 12:53:34.773837 1 controller.go:312] start watching at 3210604 I1211 12:54:48.097971 1 controller.go:319] apiserver closed stream I1211 12:54:48.099615 1 controller.go:312] start watching at 3210604 I1211 12:56:14.191340 1 controller.go:319] apiserver closed stream I1211 12:56:14.196287 1 controller.go:312] start watching at 3210604 I1211 12:56:14.705578 1 controller.go:173] finding existing jobs... I1211 12:56:14.707802 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 12:56:14.707818 1 controller.go:89] Starting watch at version %v3211281 I1211 12:56:14.707822 1 controller.go:98] starts running from watch version: 3211281 I1211 12:56:14.708794 1 controller.go:312] start watching at 3211281 I1211 12:57:48.830019 1 controller.go:319] apiserver closed stream I1211 12:57:48.832612 1 controller.go:312] start watching at 3211281 I1211 12:59:33.895313 1 controller.go:319] apiserver closed stream I1211 12:59:33.897076 1 controller.go:312] start watching at 3211281 I1211 13:01:32.899737 1 controller.go:319] apiserver closed stream I1211 13:01:32.923952 1 controller.go:312] start watching at 3211281 I1211 13:01:33.610291 1 controller.go:173] finding existing jobs... I1211 13:01:33.612471 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 13:01:33.612571 1 controller.go:89] Starting watch at version %v3211895 I1211 13:01:33.612575 1 controller.go:98] starts running from watch version: 3211895 I1211 13:01:33.613402 1 controller.go:312] start watching at 3211895 I1211 13:02:42.115405 1 controller.go:319] apiserver closed stream I1211 13:02:42.116834 1 controller.go:312] start watching at 3211895 I1211 13:04:33.422539 1 controller.go:319] apiserver closed stream I1211 13:04:33.423644 1 controller.go:312] start watching at 3211895 I1211 13:06:10.548568 1 controller.go:319] apiserver closed stream I1211 13:06:10.552906 1 controller.go:312] start watching at 3211895 I1211 13:06:11.077199 1 controller.go:173] finding existing jobs... I1211 13:06:11.079483 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 13:06:11.079499 1 controller.go:89] Starting watch at version %v3212437 I1211 13:06:11.079502 1 controller.go:98] starts running from watch version: 3212437 I1211 13:06:11.080488 1 controller.go:312] start watching at 3212437 I1211 13:07:35.207364 1 controller.go:319] apiserver closed stream I1211 13:07:35.208496 1 controller.go:312] start watching at 3212437 I1211 13:09:04.778083 1 controller.go:319] apiserver closed stream I1211 13:09:04.779336 1 controller.go:312] start watching at 3212437 I1211 13:10:08.161377 1 controller.go:319] apiserver closed stream I1211 13:10:08.165319 1 controller.go:312] start watching at 3212437 I1211 13:10:08.673523 1 controller.go:173] finding existing jobs... I1211 13:10:08.675752 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 13:10:08.675770 1 controller.go:89] Starting watch at version %v3212905 I1211 13:10:08.675773 1 controller.go:98] starts running from watch version: 3212905 I1211 13:10:08.676755 1 controller.go:312] start watching at 3212905 I1211 13:11:48.874617 1 controller.go:319] apiserver closed stream I1211 13:11:48.875675 1 controller.go:312] start watching at 3212905 I1211 13:13:22.171190 1 controller.go:319] apiserver closed stream I1211 13:13:22.173071 1 controller.go:312] start watching at 3212905 I1211 13:14:54.160855 1 controller.go:319] apiserver closed stream I1211 13:14:54.164466 1 controller.go:312] start watching at 3212905 I1211 13:14:54.672788 1 controller.go:173] finding existing jobs... I1211 13:14:54.675265 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 13:14:54.675282 1 controller.go:89] Starting watch at version %v3213462 I1211 13:14:54.675286 1 controller.go:98] starts running from watch version: 3213462 I1211 13:14:54.676175 1 controller.go:312] start watching at 3213462 I1211 13:16:33.020277 1 controller.go:319] apiserver closed stream I1211 13:16:33.022227 1 controller.go:312] start watching at 3213462 I1211 13:17:34.221927 1 controller.go:319] apiserver closed stream I1211 13:17:34.223110 1 controller.go:312] start watching at 3213462 I1211 13:19:29.151671 1 controller.go:319] apiserver closed stream I1211 13:19:29.154047 1 controller.go:312] start watching at 3213462 I1211 13:20:45.502802 1 controller.go:319] apiserver closed stream I1211 13:20:45.506487 1 controller.go:312] start watching at 3213462 I1211 13:20:46.014583 1 controller.go:173] finding existing jobs... I1211 13:20:46.017413 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 13:20:46.017432 1 controller.go:89] Starting watch at version %v3214146 I1211 13:20:46.017435 1 controller.go:98] starts running from watch version: 3214146 I1211 13:20:46.018333 1 controller.go:312] start watching at 3214146 I1211 13:22:19.824647 1 controller.go:319] apiserver closed stream I1211 13:22:19.826179 1 controller.go:312] start watching at 3214146 I1211 13:23:25.177046 1 controller.go:319] apiserver closed stream I1211 13:23:25.178071 1 controller.go:312] start watching at 3214146 I1211 13:25:11.281614 1 controller.go:319] apiserver closed stream I1211 13:25:11.285265 1 controller.go:312] start watching at 3214146 I1211 13:25:11.794672 1 controller.go:173] finding existing jobs... I1211 13:25:11.797331 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 13:25:11.797364 1 controller.go:89] Starting watch at version %v3214669 I1211 13:25:11.797387 1 controller.go:98] starts running from watch version: 3214669 I1211 13:25:11.798373 1 controller.go:312] start watching at 3214669 I1211 13:26:36.065662 1 controller.go:319] apiserver closed stream I1211 13:26:36.066764 1 controller.go:312] start watching at 3214669 I1211 13:28:04.004330 1 controller.go:319] apiserver closed stream I1211 13:28:04.005467 1 controller.go:312] start watching at 3214669 I1211 13:29:09.804248 1 controller.go:319] apiserver closed stream I1211 13:29:09.806899 1 controller.go:312] start watching at 3214669 I1211 13:30:19.737904 1 controller.go:319] apiserver closed stream I1211 13:30:19.742386 1 controller.go:312] start watching at 3214669 I1211 13:30:20.251370 1 controller.go:173] finding existing jobs... I1211 13:30:20.254261 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 13:30:20.254275 1 controller.go:89] Starting watch at version %v3215269 I1211 13:30:20.254279 1 controller.go:98] starts running from watch version: 3215269 I1211 13:30:20.272420 1 controller.go:312] start watching at 3215269 I1211 13:31:36.604966 1 controller.go:319] apiserver closed stream I1211 13:31:36.606085 1 controller.go:312] start watching at 3215269 I1211 13:33:24.968713 1 controller.go:319] apiserver closed stream I1211 13:33:24.969917 1 controller.go:312] start watching at 3215269 I1211 13:34:58.155183 1 controller.go:319] apiserver closed stream I1211 13:34:58.158928 1 controller.go:312] start watching at 3215269 I1211 13:34:58.668423 1 controller.go:173] finding existing jobs... I1211 13:34:58.670705 1 controller.go:182] ignore failed TfJob (train-dv). Please delete its CRD I1211 13:34:58.670720 1 controller.go:89] Starting watch at version %v3215834 I1211 13:34:58.670723 1 controller.go:98] starts running from watch version: 3215834 I1211 13:34:58.671759 1 controller.go:312] start watching at 3215834 I1211 13:36:12.444318 1 controller.go:319] apiserver closed stream I1211 13:36:12.445375 1 controller.go:312] start watching at 3215834 I1211 13:37:13.406769 1 controller.go:319] apiserver closed stream I1211 13:37:13.461766 1 controller.go:312] start watching at 3215834 I1211 13:37:28.502746 1 controller.go:349] event: DELETED { "RuntimeId": "in2d", "tensorboard": { "logDir": "gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "volumes": null, "volumeMounts": null, "serviceType": "" }, "replicaSpecs": [ { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": { "limits": { "nvidia.com/gpu": "1" } } } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "MASTER", "IsDefaultPS": false }, { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": { "limits": { "nvidia.com/gpu": "1" } } } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "WORKER", "IsDefaultPS": false }, { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": {} } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "PS", "IsDefaultPS": false } ], "tfImage": "tensorflow/tensorflow:1.3.0" } I1211 13:37:28.502876 1 controller.go:350] TfJob event: DELETED { "RuntimeId": "in2d", "tensorboard": { "logDir": "gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "volumes": null, "volumeMounts": null, "serviceType": "" }, "replicaSpecs": [ { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": { "limits": { "nvidia.com/gpu": "1" } } } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "MASTER", "IsDefaultPS": false }, { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": { "limits": { "nvidia.com/gpu": "1" } } } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "WORKER", "IsDefaultPS": false }, { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/examples/dataset_config.pbtxt", "--start_from_checkpoint=", "--master=", "--train_dir=gs://cloud-ml-dev_jlewi_deep_variant/experiments/2017_1210/model", "--num_retries=100", "--alsologtostderr" ], "resources": {} } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "PS", "IsDefaultPS": false } ], "tfImage": "tensorflow/tensorflow:1.3.0" } I1211 13:39:13.027160 1 controller.go:319] apiserver closed stream I1211 13:39:13.028438 1 controller.go:312] start watching at 3127817 I1211 13:39:13.033306 1 controller.go:312] start watching at 3216381 I1211 13:40:28.159113 1 controller.go:319] apiserver closed stream I1211 13:40:28.162632 1 controller.go:312] start watching at 3216381 I1211 13:40:28.165848 1 controller.go:312] start watching at 3216523 I1211 13:42:08.751690 1 controller.go:319] apiserver closed stream I1211 13:42:08.753081 1 controller.go:312] start watching at 3216523 I1211 13:43:43.689143 1 controller.go:319] apiserver closed stream I1211 13:43:43.690387 1 controller.go:312] start watching at 3216523 I1211 13:45:37.043562 1 controller.go:319] apiserver closed stream I1211 13:45:37.047007 1 controller.go:312] start watching at 3216523 I1211 13:45:37.049930 1 controller.go:312] start watching at 3217141 I1211 13:47:28.575285 1 controller.go:319] apiserver closed stream I1211 13:47:28.576570 1 controller.go:312] start watching at 3217141 I1211 13:48:38.822676 1 controller.go:319] apiserver closed stream I1211 13:48:38.823816 1 controller.go:312] start watching at 3217141 I1211 13:49:44.000101 1 controller.go:319] apiserver closed stream I1211 13:49:44.001380 1 controller.go:312] start watching at 3217141 I1211 13:51:34.839322 1 controller.go:319] apiserver closed stream I1211 13:51:34.840540 1 controller.go:312] start watching at 3217141 I1211 13:51:34.843414 1 controller.go:312] start watching at 3217860 I1211 13:53:11.947842 1 controller.go:319] apiserver closed stream I1211 13:53:11.950360 1 controller.go:312] start watching at 3217860 I1211 13:54:59.223512 1 controller.go:319] apiserver closed stream I1211 13:54:59.226659 1 controller.go:312] start watching at 3217860 I1211 13:54:59.230230 1 controller.go:312] start watching at 3218248 I1211 13:56:51.394293 1 controller.go:319] apiserver closed stream I1211 13:56:51.396067 1 controller.go:312] start watching at 3218248 I1211 13:58:47.847156 1 controller.go:319] apiserver closed stream I1211 13:58:47.849924 1 controller.go:312] start watching at 3218248 I1211 13:59:54.429542 1 controller.go:319] apiserver closed stream I1211 13:59:54.432986 1 controller.go:312] start watching at 3218248 I1211 13:59:54.436083 1 controller.go:312] start watching at 3218839 I1211 14:01:25.204402 1 controller.go:319] apiserver closed stream I1211 14:01:25.205409 1 controller.go:312] start watching at 3218839 I1211 14:02:40.037542 1 controller.go:319] apiserver closed stream I1211 14:02:40.038814 1 controller.go:312] start watching at 3218839 I1211 14:04:22.979754 1 controller.go:319] apiserver closed stream I1211 14:04:22.980871 1 controller.go:312] start watching at 3218839 I1211 14:05:46.107112 1 controller.go:319] apiserver closed stream I1211 14:05:46.110125 1 controller.go:312] start watching at 3218839 I1211 14:05:46.112964 1 controller.go:312] start watching at 3219539 I1211 14:06:51.556341 1 controller.go:319] apiserver closed stream I1211 14:06:51.557567 1 controller.go:312] start watching at 3219539 I1211 14:08:36.643271 1 controller.go:319] apiserver closed stream I1211 14:08:36.645453 1 controller.go:312] start watching at 3219539 I1211 14:10:22.978947 1 controller.go:319] apiserver closed stream I1211 14:10:22.983129 1 controller.go:312] start watching at 3219539 I1211 14:10:22.986943 1 controller.go:312] start watching at 3220066 I1211 14:11:50.482028 1 controller.go:319] apiserver closed stream I1211 14:11:50.483041 1 controller.go:312] start watching at 3220066 I1211 14:13:01.968681 1 controller.go:319] apiserver closed stream I1211 14:13:01.970188 1 controller.go:312] start watching at 3220066 I1211 14:14:50.511714 1 controller.go:319] apiserver closed stream I1211 14:14:50.512774 1 controller.go:312] start watching at 3220066 I1211 14:16:16.028381 1 controller.go:319] apiserver closed stream I1211 14:16:16.032076 1 controller.go:312] start watching at 3220066 I1211 14:16:16.034874 1 controller.go:312] start watching at 3220763 I1211 14:17:23.598083 1 controller.go:319] apiserver closed stream I1211 14:17:23.603418 1 controller.go:312] start watching at 3220763 I1211 14:18:54.838584 1 controller.go:319] apiserver closed stream I1211 14:18:54.839730 1 controller.go:312] start watching at 3220763 I1211 14:20:00.924867 1 controller.go:319] apiserver closed stream I1211 14:20:00.928404 1 controller.go:312] start watching at 3220763 I1211 14:20:00.930996 1 controller.go:312] start watching at 3221198 I1211 14:21:11.596985 1 controller.go:319] apiserver closed stream I1211 14:21:11.598042 1 controller.go:312] start watching at 3221198 I1211 14:22:42.498070 1 controller.go:319] apiserver closed stream I1211 14:22:42.499377 1 controller.go:312] start watching at 3221198 I1211 14:24:15.292893 1 controller.go:319] apiserver closed stream I1211 14:24:15.294082 1 controller.go:312] start watching at 3221198 I1211 14:26:01.502204 1 controller.go:319] apiserver closed stream I1211 14:26:01.506188 1 controller.go:312] start watching at 3221198 I1211 14:26:01.509232 1 controller.go:312] start watching at 3221907 I1211 14:27:03.867255 1 controller.go:319] apiserver closed stream I1211 14:27:03.868225 1 controller.go:312] start watching at 3221907 I1211 14:28:10.348950 1 controller.go:319] apiserver closed stream I1211 14:28:10.350142 1 controller.go:312] start watching at 3221907 I1211 14:29:38.415912 1 controller.go:319] apiserver closed stream I1211 14:29:38.417009 1 controller.go:312] start watching at 3221907 I1211 14:30:52.259675 1 controller.go:319] apiserver closed stream I1211 14:30:52.262823 1 controller.go:312] start watching at 3221907 I1211 14:30:52.266391 1 controller.go:312] start watching at 3222483 I1211 14:32:06.324339 1 controller.go:319] apiserver closed stream I1211 14:32:06.325473 1 controller.go:312] start watching at 3222483 I1211 14:33:40.954535 1 controller.go:319] apiserver closed stream I1211 14:33:40.955703 1 controller.go:312] start watching at 3222483 I1211 14:34:49.244266 1 controller.go:319] apiserver closed stream I1211 14:34:49.246701 1 controller.go:312] start watching at 3222483 I1211 14:35:50.612688 1 controller.go:319] apiserver closed stream I1211 14:35:50.615722 1 controller.go:312] start watching at 3222483 I1211 14:35:50.618638 1 controller.go:312] start watching at 3223055 I1211 14:37:39.625035 1 controller.go:319] apiserver closed stream I1211 14:37:39.626236 1 controller.go:312] start watching at 3223055 I1211 14:39:34.585232 1 controller.go:319] apiserver closed stream I1211 14:39:34.588111 1 controller.go:312] start watching at 3223055 I1211 14:41:28.708037 1 controller.go:319] apiserver closed stream I1211 14:41:28.709129 1 controller.go:312] start watching at 3223055 I1211 14:41:28.711854 1 controller.go:312] start watching at 3223730 I1211 14:43:19.352290 1 controller.go:319] apiserver closed stream I1211 14:43:19.353120 1 controller.go:312] start watching at 3223730 I1211 14:44:59.728404 1 controller.go:319] apiserver closed stream I1211 14:44:59.732202 1 controller.go:312] start watching at 3223730 I1211 14:44:59.735205 1 controller.go:312] start watching at 3224168 I1211 14:46:53.029125 1 controller.go:319] apiserver closed stream I1211 14:46:53.030496 1 controller.go:312] start watching at 3224168 I1211 14:48:15.657432 1 controller.go:319] apiserver closed stream I1211 14:48:15.658541 1 controller.go:312] start watching at 3224168 I1211 14:50:03.110622 1 controller.go:319] apiserver closed stream I1211 14:50:03.113925 1 controller.go:312] start watching at 3224168 I1211 14:50:03.116782 1 controller.go:312] start watching at 3224747 I1211 14:51:56.213396 1 controller.go:319] apiserver closed stream I1211 14:51:56.214565 1 controller.go:312] start watching at 3224747 I1211 14:53:05.533084 1 controller.go:319] apiserver closed stream I1211 14:53:05.534271 1 controller.go:312] start watching at 3224747 I1211 14:54:28.656275 1 controller.go:319] apiserver closed stream I1211 14:54:28.657478 1 controller.go:312] start watching at 3224747 I1211 14:55:53.594191 1 controller.go:319] apiserver closed stream I1211 14:55:53.597689 1 controller.go:312] start watching at 3224747 I1211 14:55:53.600672 1 controller.go:312] start watching at 3225443 I1211 14:57:25.189947 1 controller.go:319] apiserver closed stream I1211 14:57:25.193371 1 controller.go:312] start watching at 3225443 I1211 14:58:41.241633 1 controller.go:319] apiserver closed stream I1211 14:58:41.243154 1 controller.go:312] start watching at 3225443 I1211 14:59:43.342811 1 controller.go:319] apiserver closed stream I1211 14:59:43.344155 1 controller.go:312] start watching at 3225443 I1211 15:01:21.017777 1 controller.go:319] apiserver closed stream I1211 15:01:21.020843 1 controller.go:312] start watching at 3225443 I1211 15:01:21.030351 1 controller.go:312] start watching at 3226096 I1211 15:02:57.728053 1 controller.go:319] apiserver closed stream I1211 15:02:57.729464 1 controller.go:312] start watching at 3226096 I1211 15:04:08.059768 1 controller.go:319] apiserver closed stream I1211 15:04:08.060883 1 controller.go:312] start watching at 3226096 I1211 15:05:38.248361 1 controller.go:319] apiserver closed stream I1211 15:05:38.252592 1 controller.go:312] start watching at 3226096 I1211 15:05:38.255804 1 controller.go:312] start watching at 3226589 I1211 15:07:19.467688 1 controller.go:319] apiserver closed stream I1211 15:07:19.468828 1 controller.go:312] start watching at 3226589 I1211 15:08:50.487997 1 controller.go:319] apiserver closed stream I1211 15:08:50.489023 1 controller.go:312] start watching at 3226589 I1211 15:10:02.919021 1 controller.go:319] apiserver closed stream I1211 15:10:02.922947 1 controller.go:312] start watching at 3226589 I1211 15:10:02.930701 1 controller.go:312] start watching at 3227119 I1211 15:11:42.021305 1 controller.go:319] apiserver closed stream I1211 15:11:42.022423 1 controller.go:312] start watching at 3227119 I1211 15:13:40.938532 1 controller.go:319] apiserver closed stream I1211 15:13:40.939844 1 controller.go:312] start watching at 3227119 I1211 15:14:55.623154 1 controller.go:319] apiserver closed stream I1211 15:14:55.628035 1 controller.go:312] start watching at 3227119 I1211 15:14:55.631743 1 controller.go:312] start watching at 3227698 I1211 15:16:08.078119 1 controller.go:319] apiserver closed stream I1211 15:16:08.079251 1 controller.go:312] start watching at 3227698 I1211 15:17:24.386183 1 controller.go:319] apiserver closed stream I1211 15:17:24.387274 1 controller.go:312] start watching at 3227698 I1211 15:18:49.756410 1 controller.go:319] apiserver closed stream I1211 15:18:49.757474 1 controller.go:312] start watching at 3227698 I1211 15:20:47.204296 1 controller.go:319] apiserver closed stream I1211 15:20:47.205934 1 controller.go:312] start watching at 3227698 I1211 15:20:47.211776 1 controller.go:312] start watching at 3228395 I1211 15:22:09.762262 1 controller.go:319] apiserver closed stream I1211 15:22:09.763438 1 controller.go:312] start watching at 3228395 I1211 15:23:46.291699 1 controller.go:319] apiserver closed stream I1211 15:23:46.292901 1 controller.go:312] start watching at 3228395 I1211 15:25:10.649709 1 controller.go:319] apiserver closed stream I1211 15:25:10.703308 1 controller.go:312] start watching at 3228395 I1211 15:25:10.713744 1 controller.go:312] start watching at 3228902 I1211 15:26:59.687309 1 controller.go:319] apiserver closed stream I1211 15:26:59.689951 1 controller.go:312] start watching at 3228902 I1211 15:28:10.151932 1 controller.go:319] apiserver closed stream I1211 15:28:10.153112 1 controller.go:312] start watching at 3228902 I1211 15:30:08.186120 1 controller.go:319] apiserver closed stream I1211 15:30:08.189169 1 controller.go:312] start watching at 3228902 I1211 15:30:08.192421 1 controller.go:312] start watching at 3229495 I1211 15:31:39.455805 1 controller.go:319] apiserver closed stream I1211 15:31:39.456940 1 controller.go:312] start watching at 3229495 I1211 15:33:03.655245 1 controller.go:319] apiserver closed stream I1211 15:33:03.656363 1 controller.go:312] start watching at 3229495 I1211 15:34:21.180083 1 controller.go:319] apiserver closed stream I1211 15:34:21.198648 1 controller.go:312] start watching at 3229495 I1211 15:35:23.498747 1 controller.go:319] apiserver closed stream I1211 15:35:23.500102 1 controller.go:312] start watching at 3229495 I1211 15:35:23.503330 1 controller.go:312] start watching at 3230135 I1211 15:36:37.797254 1 controller.go:319] apiserver closed stream I1211 15:36:37.798378 1 controller.go:312] start watching at 3230135 I1211 15:37:40.149063 1 controller.go:319] apiserver closed stream I1211 15:37:40.150057 1 controller.go:312] start watching at 3230135 I1211 15:39:23.278905 1 controller.go:319] apiserver closed stream I1211 15:39:23.280490 1 controller.go:312] start watching at 3230135 I1211 15:40:45.791398 1 controller.go:319] apiserver closed stream I1211 15:40:45.795109 1 controller.go:312] start watching at 3230135 I1211 15:40:45.798139 1 controller.go:312] start watching at 3230750 I1211 15:42:40.289951 1 controller.go:319] apiserver closed stream I1211 15:42:40.291151 1 controller.go:312] start watching at 3230750 I1211 15:44:25.152157 1 controller.go:319] apiserver closed stream I1211 15:44:25.154141 1 controller.go:312] start watching at 3230750 I1211 15:46:17.020770 1 controller.go:319] apiserver closed stream I1211 15:46:17.021954 1 controller.go:312] start watching at 3230750 I1211 15:46:17.027591 1 controller.go:312] start watching at 3231412 I1211 15:47:59.872002 1 controller.go:319] apiserver closed stream I1211 15:47:59.873033 1 controller.go:312] start watching at 3231412 I1211 15:49:53.999896 1 controller.go:319] apiserver closed stream I1211 15:49:54.003615 1 controller.go:312] start watching at 3231412 I1211 15:49:54.006788 1 controller.go:312] start watching at 3231858 I1211 15:51:01.970302 1 controller.go:319] apiserver closed stream I1211 15:51:01.971542 1 controller.go:312] start watching at 3231858 I1211 15:52:06.751776 1 controller.go:319] apiserver closed stream I1211 15:52:06.753002 1 controller.go:312] start watching at 3231858 I1211 15:53:55.724076 1 controller.go:319] apiserver closed stream I1211 15:53:55.737004 1 controller.go:312] start watching at 3231858 I1211 15:55:46.875568 1 controller.go:319] apiserver closed stream I1211 15:55:46.879265 1 controller.go:312] start watching at 3231858 I1211 15:55:46.882039 1 controller.go:312] start watching at 3232534 I1211 15:57:18.763389 1 controller.go:319] apiserver closed stream I1211 15:57:18.764636 1 controller.go:312] start watching at 3232534 I1211 15:59:04.433653 1 controller.go:319] apiserver closed stream I1211 15:59:04.434964 1 controller.go:312] start watching at 3232534 I1211 16:00:11.053957 1 controller.go:319] apiserver closed stream I1211 16:00:11.058757 1 controller.go:312] start watching at 3232534 I1211 16:00:11.065038 1 controller.go:312] start watching at 3233048 I1211 16:01:31.990773 1 controller.go:319] apiserver closed stream I1211 16:01:31.992182 1 controller.go:312] start watching at 3233048 I1211 16:02:48.101421 1 controller.go:319] apiserver closed stream I1211 16:02:48.102598 1 controller.go:312] start watching at 3233048 I1211 16:04:16.356569 1 controller.go:319] apiserver closed stream I1211 16:04:16.357949 1 controller.go:312] start watching at 3233048 I1211 16:05:29.190285 1 controller.go:319] apiserver closed stream I1211 16:05:29.194110 1 controller.go:312] start watching at 3233048 I1211 16:05:29.196966 1 controller.go:312] start watching at 3233677 I1211 16:07:12.959827 1 controller.go:319] apiserver closed stream I1211 16:07:12.961143 1 controller.go:312] start watching at 3233677 I1211 16:09:10.884576 1 controller.go:319] apiserver closed stream I1211 16:09:10.885490 1 controller.go:312] start watching at 3233677 I1211 16:10:52.948858 1 controller.go:319] apiserver closed stream I1211 16:10:52.953254 1 controller.go:312] start watching at 3233677 I1211 16:10:52.956002 1 controller.go:312] start watching at 3234294 I1211 16:12:44.415280 1 controller.go:319] apiserver closed stream I1211 16:12:44.416422 1 controller.go:312] start watching at 3234294 I1211 16:14:21.270909 1 controller.go:319] apiserver closed stream I1211 16:14:21.272245 1 controller.go:312] start watching at 3234294 I1211 16:15:33.152820 1 controller.go:319] apiserver closed stream I1211 16:15:33.156332 1 controller.go:312] start watching at 3234294 I1211 16:15:33.158995 1 controller.go:312] start watching at 3234850 I1211 16:17:14.864866 1 controller.go:319] apiserver closed stream I1211 16:17:14.866013 1 controller.go:312] start watching at 3234850 I1211 16:18:28.697163 1 controller.go:319] apiserver closed stream I1211 16:18:28.698244 1 controller.go:312] start watching at 3234850 I1211 16:20:23.923162 1 controller.go:319] apiserver closed stream I1211 16:20:23.939944 1 controller.go:312] start watching at 3234850 I1211 16:20:23.957064 1 controller.go:312] start watching at 3235405 I1211 16:21:47.282340 1 controller.go:319] apiserver closed stream I1211 16:21:47.283649 1 controller.go:312] start watching at 3235405 I1211 16:23:20.549945 1 controller.go:319] apiserver closed stream I1211 16:23:20.552027 1 controller.go:312] start watching at 3235405 I1211 16:24:28.274640 1 controller.go:319] apiserver closed stream I1211 16:24:28.277523 1 controller.go:312] start watching at 3235405 I1211 16:25:50.250916 1 controller.go:319] apiserver closed stream I1211 16:25:50.254736 1 controller.go:312] start watching at 3235405 I1211 16:25:50.259009 1 controller.go:312] start watching at 3236062 I1211 16:27:14.452802 1 controller.go:319] apiserver closed stream I1211 16:27:14.454125 1 controller.go:312] start watching at 3236062 I1211 16:28:36.049324 1 controller.go:319] apiserver closed stream I1211 16:28:36.050760 1 controller.go:312] start watching at 3236062 I1211 16:30:13.859044 1 controller.go:319] apiserver closed stream I1211 16:30:13.862461 1 controller.go:312] start watching at 3236062 I1211 16:30:13.865691 1 controller.go:312] start watching at 3236600 I1211 16:31:43.558503 1 controller.go:319] apiserver closed stream I1211 16:31:43.559362 1 controller.go:312] start watching at 3236600 I1211 16:33:32.589272 1 controller.go:319] apiserver closed stream I1211 16:33:32.590367 1 controller.go:312] start watching at 3236600 I1211 16:34:46.853947 1 controller.go:319] apiserver closed stream I1211 16:34:46.856084 1 controller.go:312] start watching at 3236600 I1211 16:36:28.652434 1 controller.go:319] apiserver closed stream I1211 16:36:28.653649 1 controller.go:312] start watching at 3236600 I1211 16:36:28.656295 1 controller.go:312] start watching at 3237332 I1211 16:38:27.497550 1 controller.go:319] apiserver closed stream I1211 16:38:27.498587 1 controller.go:312] start watching at 3237332 I1211 16:39:41.345037 1 controller.go:319] apiserver closed stream I1211 16:39:41.346282 1 controller.go:312] start watching at 3237332 I1211 16:41:22.719233 1 controller.go:319] apiserver closed stream I1211 16:41:22.720191 1 controller.go:312] start watching at 3237332 I1211 16:41:22.722805 1 controller.go:312] start watching at 3237895 I1211 16:42:29.345485 1 controller.go:319] apiserver closed stream I1211 16:42:29.346858 1 controller.go:312] start watching at 3237895 I1211 16:43:58.335055 1 controller.go:319] apiserver closed stream I1211 16:43:58.336157 1 controller.go:312] start watching at 3237895 I1211 16:45:32.869820 1 controller.go:319] apiserver closed stream I1211 16:45:32.875615 1 controller.go:312] start watching at 3237895 I1211 16:45:32.878491 1 controller.go:312] start watching at 3238406 I1211 16:46:37.391326 1 controller.go:319] apiserver closed stream I1211 16:46:37.394482 1 controller.go:312] start watching at 3238406 I1211 16:48:09.135771 1 controller.go:319] apiserver closed stream I1211 16:48:09.137285 1 controller.go:312] start watching at 3238406 I1211 16:49:10.306007 1 controller.go:319] apiserver closed stream I1211 16:49:10.308606 1 controller.go:312] start watching at 3238406 I1211 16:51:03.714118 1 controller.go:319] apiserver closed stream I1211 16:51:03.715228 1 controller.go:312] start watching at 3238406 I1211 16:51:03.737978 1 controller.go:312] start watching at 3239072 I1211 16:53:02.491190 1 controller.go:319] apiserver closed stream I1211 16:53:02.492801 1 controller.go:312] start watching at 3239072 I1211 16:54:18.985356 1 controller.go:319] apiserver closed stream I1211 16:54:18.989124 1 controller.go:312] start watching at 3239072 I1211 16:55:20.249673 1 controller.go:319] apiserver closed stream I1211 16:55:20.252983 1 controller.go:312] start watching at 3239072 I1211 16:55:20.255631 1 controller.go:312] start watching at 3239562 I1211 16:56:43.215519 1 controller.go:319] apiserver closed stream I1211 16:56:43.216641 1 controller.go:312] start watching at 3239562 I1211 16:58:28.667517 1 controller.go:319] apiserver closed stream I1211 16:58:28.668852 1 controller.go:312] start watching at 3239562 I1211 16:59:51.057242 1 controller.go:319] apiserver closed stream I1211 16:59:51.058282 1 controller.go:312] start watching at 3239562 I1211 17:00:57.315457 1 controller.go:319] apiserver closed stream I1211 17:00:57.320200 1 controller.go:312] start watching at 3239562 I1211 17:00:57.323089 1 controller.go:312] start watching at 3240228 I1211 17:02:11.802582 1 controller.go:319] apiserver closed stream I1211 17:02:11.805209 1 controller.go:312] start watching at 3240228 I1211 17:03:20.115996 1 controller.go:319] apiserver closed stream I1211 17:03:20.118592 1 controller.go:312] start watching at 3240228 I1211 17:04:57.944832 1 controller.go:319] apiserver closed stream I1211 17:04:57.948407 1 controller.go:312] start watching at 3240228 I1211 17:04:57.951257 1 controller.go:312] start watching at 3240687 I1211 17:06:34.991187 1 controller.go:319] apiserver closed stream I1211 17:06:34.992269 1 controller.go:312] start watching at 3240687 I1211 17:07:53.155700 1 controller.go:319] apiserver closed stream I1211 17:07:53.156917 1 controller.go:312] start watching at 3240687 I1211 17:09:49.241462 1 controller.go:319] apiserver closed stream I1211 17:09:49.242755 1 controller.go:312] start watching at 3240687 I1211 17:11:15.241874 1 controller.go:319] apiserver closed stream I1211 17:11:15.242983 1 controller.go:312] start watching at 3240687 I1211 17:11:15.245749 1 controller.go:312] start watching at 3241443 I1211 17:13:05.588147 1 controller.go:319] apiserver closed stream I1211 17:13:05.589343 1 controller.go:312] start watching at 3241443 I1211 17:14:55.867192 1 controller.go:319] apiserver closed stream I1211 17:14:55.871927 1 controller.go:312] start watching at 3241443 I1211 17:14:55.874542 1 controller.go:312] start watching at 3241889 I1211 17:16:49.829739 1 controller.go:319] apiserver closed stream I1211 17:16:49.830900 1 controller.go:312] start watching at 3241889 I1211 17:18:26.939375 1 controller.go:319] apiserver closed stream I1211 17:18:26.941054 1 controller.go:312] start watching at 3241889 I1211 17:19:27.986579 1 controller.go:319] apiserver closed stream I1211 17:19:27.987693 1 controller.go:312] start watching at 3241889 I1211 17:20:31.410569 1 controller.go:319] apiserver closed stream I1211 17:20:31.415349 1 controller.go:312] start watching at 3241889 I1211 17:20:31.418146 1 controller.go:312] start watching at 3242534 I1211 17:21:55.017909 1 controller.go:319] apiserver closed stream I1211 17:21:55.020603 1 controller.go:312] start watching at 3242534 I1211 17:23:01.537793 1 controller.go:319] apiserver closed stream I1211 17:23:01.540102 1 controller.go:312] start watching at 3242534 I1211 17:24:26.272355 1 controller.go:319] apiserver closed stream I1211 17:24:26.274118 1 controller.go:312] start watching at 3242534 I1211 17:26:15.046430 1 controller.go:319] apiserver closed stream I1211 17:26:15.050477 1 controller.go:312] start watching at 3242534 I1211 17:26:15.053757 1 controller.go:312] start watching at 3243215 I1211 17:27:29.977186 1 controller.go:319] apiserver closed stream I1211 17:27:29.980476 1 controller.go:312] start watching at 3243215 I1211 17:28:41.019881 1 controller.go:319] apiserver closed stream I1211 17:28:41.027061 1 controller.go:312] start watching at 3243215 I1211 17:29:55.001121 1 controller.go:319] apiserver closed stream I1211 17:29:55.004997 1 controller.go:312] start watching at 3243215 I1211 17:29:55.008178 1 controller.go:312] start watching at 3243634 I1211 17:31:29.321899 1 controller.go:319] apiserver closed stream I1211 17:31:29.323320 1 controller.go:312] start watching at 3243634 I1211 17:32:57.986388 1 controller.go:319] apiserver closed stream I1211 17:32:57.987768 1 controller.go:312] start watching at 3243634 I1211 17:34:21.942650 1 controller.go:319] apiserver closed stream I1211 17:34:21.943851 1 controller.go:312] start watching at 3243634 I1211 17:35:31.418145 1 controller.go:319] apiserver closed stream I1211 17:35:31.421520 1 controller.go:312] start watching at 3243634 I1211 17:35:31.424862 1 controller.go:312] start watching at 3244305 I1211 17:37:07.333938 1 controller.go:319] apiserver closed stream I1211 17:37:07.335147 1 controller.go:312] start watching at 3244305 I1211 17:38:50.136811 1 controller.go:319] apiserver closed stream I1211 17:38:50.137937 1 controller.go:312] start watching at 3244305 I1211 17:40:29.511150 1 controller.go:319] apiserver closed stream I1211 17:40:29.515657 1 controller.go:312] start watching at 3244305 I1211 17:40:29.518494 1 controller.go:312] start watching at 3244876 I1211 17:42:20.899293 1 controller.go:319] apiserver closed stream I1211 17:42:20.900586 1 controller.go:312] start watching at 3244876 I1211 17:43:45.323476 1 controller.go:319] apiserver closed stream I1211 17:43:45.324815 1 controller.go:312] start watching at 3244876 I1211 17:44:59.466990 1 controller.go:319] apiserver closed stream I1211 17:44:59.471132 1 controller.go:312] start watching at 3244876 I1211 17:44:59.474455 1 controller.go:312] start watching at 3245404 I1211 17:46:22.704850 1 controller.go:319] apiserver closed stream I1211 17:46:22.705990 1 controller.go:312] start watching at 3245404 I1211 17:48:08.895902 1 controller.go:319] apiserver closed stream I1211 17:48:08.897013 1 controller.go:312] start watching at 3245404 I1211 17:49:12.823177 1 controller.go:319] apiserver closed stream I1211 17:49:12.824306 1 controller.go:312] start watching at 3245404 I1211 17:50:34.048666 1 controller.go:319] apiserver closed stream I1211 17:50:34.052762 1 controller.go:312] start watching at 3245404 I1211 17:50:34.055785 1 controller.go:312] start watching at 3246077 I1211 17:51:45.425729 1 controller.go:319] apiserver closed stream I1211 17:51:45.427105 1 controller.go:312] start watching at 3246077 I1211 17:53:17.636126 1 controller.go:319] apiserver closed stream I1211 17:53:17.637384 1 controller.go:312] start watching at 3246077 I1211 17:54:55.572256 1 controller.go:319] apiserver closed stream I1211 17:54:55.576955 1 controller.go:312] start watching at 3246077 I1211 17:54:55.580324 1 controller.go:312] start watching at 3246609 I1211 17:56:18.123494 1 controller.go:319] apiserver closed stream I1211 17:56:18.124798 1 controller.go:312] start watching at 3246609 I1211 17:57:22.760011 1 controller.go:319] apiserver closed stream I1211 17:57:22.761237 1 controller.go:312] start watching at 3246609 I1211 17:58:52.581937 1 controller.go:319] apiserver closed stream I1211 17:58:52.583435 1 controller.go:312] start watching at 3246609 I1211 18:00:32.654740 1 controller.go:319] apiserver closed stream I1211 18:00:32.657933 1 controller.go:312] start watching at 3246609 I1211 18:00:32.661148 1 controller.go:312] start watching at 3247288 I1211 18:01:35.007868 1 controller.go:319] apiserver closed stream I1211 18:01:35.009020 1 controller.go:312] start watching at 3247288 I1211 18:02:47.647797 1 controller.go:319] apiserver closed stream I1211 18:02:47.649158 1 controller.go:312] start watching at 3247288 I1211 18:04:12.744083 1 controller.go:319] apiserver closed stream I1211 18:04:12.745927 1 controller.go:312] start watching at 3247288 I1211 18:05:53.748151 1 controller.go:319] apiserver closed stream I1211 18:05:53.751836 1 controller.go:312] start watching at 3247288 I1211 18:05:53.754632 1 controller.go:312] start watching at 3247901 I1211 18:07:01.756114 1 controller.go:319] apiserver closed stream I1211 18:07:01.757273 1 controller.go:312] start watching at 3247901 I1211 18:08:53.561077 1 controller.go:319] apiserver closed stream I1211 18:08:53.562298 1 controller.go:312] start watching at 3247901 I1211 18:10:40.881853 1 controller.go:319] apiserver closed stream I1211 18:10:40.885197 1 controller.go:312] start watching at 3247901 I1211 18:10:40.888167 1 controller.go:312] start watching at 3248485 I1211 18:12:39.580649 1 controller.go:319] apiserver closed stream I1211 18:12:39.581714 1 controller.go:312] start watching at 3248485 I1211 18:13:54.746892 1 controller.go:319] apiserver closed stream I1211 18:13:54.748126 1 controller.go:312] start watching at 3248485 I1211 18:15:33.599976 1 controller.go:319] apiserver closed stream I1211 18:15:33.601613 1 controller.go:312] start watching at 3248485 I1211 18:15:33.604494 1 controller.go:312] start watching at 3249063 I1211 18:17:27.446037 1 controller.go:319] apiserver closed stream I1211 18:17:27.448199 1 controller.go:312] start watching at 3249063 I1211 18:19:04.045348 1 controller.go:319] apiserver closed stream I1211 18:19:04.046756 1 controller.go:312] start watching at 3249063 I1211 18:20:14.369367 1 controller.go:319] apiserver closed stream I1211 18:20:14.375034 1 controller.go:312] start watching at 3249063 I1211 18:20:14.377935 1 controller.go:312] start watching at 3249610 I1211 18:21:17.284161 1 controller.go:319] apiserver closed stream I1211 18:21:17.285186 1 controller.go:312] start watching at 3249610 I1211 18:23:01.193491 1 controller.go:319] apiserver closed stream I1211 18:23:01.194514 1 controller.go:312] start watching at 3249610 I1211 18:24:43.091788 1 controller.go:319] apiserver closed stream I1211 18:24:43.093234 1 controller.go:312] start watching at 3249610 I1211 18:25:52.043366 1 controller.go:319] apiserver closed stream I1211 18:25:52.048752 1 controller.go:312] start watching at 3249610 I1211 18:25:52.054264 1 controller.go:312] start watching at 3250274 I1211 18:27:35.802288 1 controller.go:319] apiserver closed stream I1211 18:27:35.803659 1 controller.go:312] start watching at 3250274 I1211 18:28:47.874436 1 controller.go:319] apiserver closed stream I1211 18:28:47.875498 1 controller.go:312] start watching at 3250274 I1211 18:30:08.295812 1 controller.go:319] apiserver closed stream I1211 18:30:08.299073 1 controller.go:312] start watching at 3250274 I1211 18:30:08.302024 1 controller.go:312] start watching at 3250792 I1211 18:31:14.976366 1 controller.go:319] apiserver closed stream I1211 18:31:14.977411 1 controller.go:312] start watching at 3250792 I1211 18:32:38.431160 1 controller.go:319] apiserver closed stream I1211 18:32:38.432344 1 controller.go:312] start watching at 3250792 I1211 18:34:20.271069 1 controller.go:319] apiserver closed stream I1211 18:34:20.272258 1 controller.go:312] start watching at 3250792 I1211 18:36:08.055124 1 controller.go:319] apiserver closed stream I1211 18:36:08.077778 1 controller.go:312] start watching at 3250792 I1211 18:36:08.082621 1 controller.go:312] start watching at 3251501 I1211 18:37:44.077408 1 controller.go:319] apiserver closed stream I1211 18:37:44.078527 1 controller.go:312] start watching at 3251501 I1211 18:39:34.884161 1 controller.go:319] apiserver closed stream I1211 18:39:34.885282 1 controller.go:312] start watching at 3251501 I1211 18:41:13.513620 1 controller.go:319] apiserver closed stream I1211 18:41:13.517733 1 controller.go:312] start watching at 3251501 I1211 18:41:13.521131 1 controller.go:312] start watching at 3252084 I1211 18:42:38.734047 1 controller.go:319] apiserver closed stream I1211 18:42:38.735137 1 controller.go:312] start watching at 3252084 I1211 18:43:54.932784 1 controller.go:319] apiserver closed stream I1211 18:43:54.933819 1 controller.go:312] start watching at 3252084 I1211 18:45:53.623550 1 controller.go:319] apiserver closed stream I1211 18:45:53.624662 1 controller.go:312] start watching at 3252084 I1211 18:45:53.627501 1 controller.go:312] start watching at 3252650 I1211 18:47:27.075651 1 controller.go:319] apiserver closed stream I1211 18:47:27.076866 1 controller.go:312] start watching at 3252650 I1211 18:48:37.509047 1 controller.go:319] apiserver closed stream I1211 18:48:37.510041 1 controller.go:312] start watching at 3252650 I1211 18:49:59.651182 1 controller.go:319] apiserver closed stream I1211 18:49:59.657749 1 controller.go:312] start watching at 3252650 I1211 18:49:59.661827 1 controller.go:312] start watching at 3253124 I1211 18:51:31.571385 1 controller.go:319] apiserver closed stream I1211 18:51:31.572644 1 controller.go:312] start watching at 3253124 I1211 18:53:07.621661 1 controller.go:319] apiserver closed stream I1211 18:53:07.622739 1 controller.go:312] start watching at 3253124 I1211 18:54:43.701912 1 controller.go:319] apiserver closed stream I1211 18:54:43.703057 1 controller.go:312] start watching at 3253124 I1211 18:56:06.593333 1 controller.go:319] apiserver closed stream I1211 18:56:06.597172 1 controller.go:312] start watching at 3253124 I1211 18:56:06.600312 1 controller.go:312] start watching at 3253849 I1211 18:57:31.735796 1 controller.go:319] apiserver closed stream I1211 18:57:31.736882 1 controller.go:312] start watching at 3253849 I1211 18:59:21.015873 1 controller.go:319] apiserver closed stream I1211 18:59:21.017121 1 controller.go:312] start watching at 3253849 I1211 19:00:29.859070 1 controller.go:319] apiserver closed stream I1211 19:00:29.862804 1 controller.go:312] start watching at 3253849 I1211 19:00:29.865658 1 controller.go:312] start watching at 3254355 I1211 19:02:27.628912 1 controller.go:319] apiserver closed stream I1211 19:02:27.631184 1 controller.go:312] start watching at 3254355 I1211 19:04:10.240849 1 controller.go:319] apiserver closed stream I1211 19:04:10.242132 1 controller.go:312] start watching at 3254355 I1211 19:06:03.693493 1 controller.go:319] apiserver closed stream I1211 19:06:03.696458 1 controller.go:312] start watching at 3254355 I1211 19:06:03.699629 1 controller.go:312] start watching at 3255015 I1211 19:07:50.662790 1 controller.go:319] apiserver closed stream I1211 19:07:50.666875 1 controller.go:312] start watching at 3255015 I1211 19:09:42.697600 1 controller.go:319] apiserver closed stream I1211 19:09:42.699938 1 controller.go:312] start watching at 3255015 I1211 19:10:45.999098 1 controller.go:319] apiserver closed stream I1211 19:10:46.002617 1 controller.go:312] start watching at 3255015 I1211 19:10:46.006093 1 controller.go:312] start watching at 3255584 I1211 19:12:22.554666 1 controller.go:319] apiserver closed stream I1211 19:12:22.555994 1 controller.go:312] start watching at 3255584 I1211 19:13:25.223080 1 controller.go:319] apiserver closed stream I1211 19:13:25.225348 1 controller.go:312] start watching at 3255584 I1211 19:15:03.369730 1 controller.go:319] apiserver closed stream I1211 19:15:03.373042 1 controller.go:312] start watching at 3255584 I1211 19:15:03.376467 1 controller.go:312] start watching at 3256075 I1211 19:16:18.697811 1 controller.go:319] apiserver closed stream I1211 19:16:18.698958 1 controller.go:312] start watching at 3256075 I1211 19:17:46.415326 1 controller.go:319] apiserver closed stream I1211 19:17:46.416615 1 controller.go:312] start watching at 3256075 I1211 19:19:08.172634 1 controller.go:319] apiserver closed stream I1211 19:19:08.173987 1 controller.go:312] start watching at 3256075 I1211 19:21:05.880018 1 controller.go:319] apiserver closed stream I1211 19:21:05.881324 1 controller.go:312] start watching at 3256075 I1211 19:21:05.884086 1 controller.go:312] start watching at 3256803 I1211 19:22:38.482401 1 controller.go:319] apiserver closed stream I1211 19:22:38.483430 1 controller.go:312] start watching at 3256803 I1211 19:24:30.307528 1 controller.go:319] apiserver closed stream I1211 19:24:30.308709 1 controller.go:312] start watching at 3256803 I1211 19:25:56.557451 1 controller.go:319] apiserver closed stream I1211 19:25:56.560654 1 controller.go:312] start watching at 3256803 I1211 19:25:56.563714 1 controller.go:312] start watching at 3257360 I1211 19:27:21.298696 1 controller.go:319] apiserver closed stream I1211 19:27:21.299689 1 controller.go:312] start watching at 3257360 I1211 19:29:02.830841 1 controller.go:319] apiserver closed stream I1211 19:29:02.833330 1 controller.go:312] start watching at 3257360 I1211 19:30:26.479659 1 controller.go:319] apiserver closed stream I1211 19:30:26.504199 1 controller.go:312] start watching at 3257360 I1211 19:30:26.507075 1 controller.go:312] start watching at 3257877 I1211 19:31:54.020085 1 controller.go:319] apiserver closed stream I1211 19:31:54.022121 1 controller.go:312] start watching at 3257877 I1211 19:33:14.651380 1 controller.go:319] apiserver closed stream I1211 19:33:14.652713 1 controller.go:312] start watching at 3257877 I1211 19:35:09.921314 1 controller.go:319] apiserver closed stream I1211 19:35:09.924578 1 controller.go:312] start watching at 3257877 I1211 19:35:09.927628 1 controller.go:312] start watching at 3258416 I1211 19:36:21.391704 1 controller.go:319] apiserver closed stream I1211 19:36:21.392789 1 controller.go:312] start watching at 3258416 I1211 19:37:36.505153 1 controller.go:319] apiserver closed stream I1211 19:37:36.508288 1 controller.go:312] start watching at 3258416 I1211 19:39:11.935077 1 controller.go:319] apiserver closed stream I1211 19:39:11.936076 1 controller.go:312] start watching at 3258416 I1211 19:40:42.984236 1 controller.go:319] apiserver closed stream I1211 19:40:42.991626 1 controller.go:312] start watching at 3258416 I1211 19:40:42.995917 1 controller.go:312] start watching at 3259055 I1211 19:42:27.332090 1 controller.go:319] apiserver closed stream I1211 19:42:27.333351 1 controller.go:312] start watching at 3259055 I1211 19:43:28.653924 1 controller.go:319] apiserver closed stream I1211 19:43:28.654960 1 controller.go:312] start watching at 3259055 I1211 19:44:46.781370 1 controller.go:319] apiserver closed stream I1211 19:44:46.782588 1 controller.go:312] start watching at 3259055 I1211 19:46:12.417243 1 controller.go:319] apiserver closed stream I1211 19:46:12.419469 1 controller.go:312] start watching at 3259055 I1211 19:46:12.421890 1 controller.go:312] start watching at 3259684 I1211 19:47:57.555941 1 controller.go:319] apiserver closed stream I1211 19:47:57.559196 1 controller.go:312] start watching at 3259684 I1211 19:49:56.235010 1 controller.go:319] apiserver closed stream I1211 19:49:56.237658 1 controller.go:312] start watching at 3259684 I1211 19:49:56.241170 1 controller.go:312] start watching at 3260112 I1211 19:51:53.506972 1 controller.go:319] apiserver closed stream I1211 19:51:53.508033 1 controller.go:312] start watching at 3260112 I1211 19:53:49.828375 1 controller.go:319] apiserver closed stream I1211 19:53:49.829579 1 controller.go:312] start watching at 3260112 I1211 19:55:29.086904 1 controller.go:319] apiserver closed stream I1211 19:55:29.090433 1 controller.go:312] start watching at 3260112 I1211 19:55:29.093139 1 controller.go:312] start watching at 3260746 I1211 19:57:07.437401 1 controller.go:319] apiserver closed stream I1211 19:57:07.438356 1 controller.go:312] start watching at 3260746 I1211 19:58:17.899511 1 controller.go:319] apiserver closed stream I1211 19:58:17.900833 1 controller.go:312] start watching at 3260746 I1211 19:59:54.555253 1 controller.go:319] apiserver closed stream I1211 19:59:54.558832 1 controller.go:312] start watching at 3260746 I1211 19:59:54.561944 1 controller.go:312] start watching at 3261256 I1211 20:01:38.115517 1 controller.go:319] apiserver closed stream I1211 20:01:38.116614 1 controller.go:312] start watching at 3261256 I1211 20:03:27.021676 1 controller.go:319] apiserver closed stream I1211 20:03:27.027237 1 controller.go:312] start watching at 3261256 I1211 20:04:47.398989 1 controller.go:319] apiserver closed stream I1211 20:04:47.401510 1 controller.go:312] start watching at 3261256 I1211 20:06:20.146831 1 controller.go:319] apiserver closed stream I1211 20:06:20.148177 1 controller.go:312] start watching at 3261256 I1211 20:06:20.150898 1 controller.go:312] start watching at 3261991 I1211 20:08:10.618512 1 controller.go:319] apiserver closed stream I1211 20:08:10.620403 1 controller.go:312] start watching at 3261991 I1211 20:10:10.003982 1 controller.go:319] apiserver closed stream I1211 20:10:10.010161 1 controller.go:312] start watching at 3261991 I1211 20:10:10.015951 1 controller.go:312] start watching at 3262431 I1211 20:11:51.933313 1 controller.go:319] apiserver closed stream I1211 20:11:51.934512 1 controller.go:312] start watching at 3262431 I1211 20:13:48.945959 1 controller.go:319] apiserver closed stream I1211 20:13:48.947299 1 controller.go:312] start watching at 3262431 I1211 20:15:47.878568 1 controller.go:319] apiserver closed stream I1211 20:15:47.883078 1 controller.go:312] start watching at 3262431 I1211 20:15:47.885711 1 controller.go:312] start watching at 3263079 I1211 20:17:13.464997 1 controller.go:319] apiserver closed stream I1211 20:17:13.466039 1 controller.go:312] start watching at 3263079 I1211 20:19:10.381908 1 controller.go:319] apiserver closed stream I1211 20:19:10.382972 1 controller.go:312] start watching at 3263079 I1211 20:20:30.481151 1 controller.go:319] apiserver closed stream I1211 20:20:30.484897 1 controller.go:312] start watching at 3263079 I1211 20:20:30.487695 1 controller.go:312] start watching at 3263616 I1211 20:21:48.228261 1 controller.go:319] apiserver closed stream I1211 20:21:48.230993 1 controller.go:312] start watching at 3263616 I1211 20:23:01.740077 1 controller.go:319] apiserver closed stream I1211 20:23:01.741701 1 controller.go:312] start watching at 3263616 I1211 20:24:15.595928 1 controller.go:319] apiserver closed stream I1211 20:24:15.597333 1 controller.go:312] start watching at 3263616 I1211 20:25:50.118612 1 controller.go:319] apiserver closed stream I1211 20:25:50.122478 1 controller.go:312] start watching at 3263616 I1211 20:25:50.125367 1 controller.go:312] start watching at 3264228 I1211 20:27:23.481695 1 controller.go:319] apiserver closed stream I1211 20:27:23.484054 1 controller.go:312] start watching at 3264228 I1211 20:28:26.814970 1 controller.go:319] apiserver closed stream I1211 20:28:26.816170 1 controller.go:312] start watching at 3264228 I1211 20:30:23.291601 1 controller.go:319] apiserver closed stream I1211 20:30:23.296327 1 controller.go:312] start watching at 3264228 I1211 20:30:23.299923 1 controller.go:312] start watching at 3264751 I1211 20:31:45.187460 1 controller.go:319] apiserver closed stream I1211 20:31:45.205060 1 controller.go:312] start watching at 3264751 I1211 20:33:31.950806 1 controller.go:319] apiserver closed stream I1211 20:33:31.951991 1 controller.go:312] start watching at 3264751 I1211 20:34:40.989161 1 controller.go:319] apiserver closed stream I1211 20:34:40.990196 1 controller.go:312] start watching at 3264751 I1211 20:36:38.194874 1 controller.go:319] apiserver closed stream I1211 20:36:38.198160 1 controller.go:312] start watching at 3264751 I1211 20:36:38.201074 1 controller.go:312] start watching at 3265467 I1211 20:37:41.805460 1 controller.go:319] apiserver closed stream I1211 20:37:41.806596 1 controller.go:312] start watching at 3265467 I1211 20:39:29.337385 1 controller.go:319] apiserver closed stream I1211 20:39:29.338606 1 controller.go:312] start watching at 3265467 I1211 20:41:23.525487 1 controller.go:319] apiserver closed stream I1211 20:41:23.530106 1 controller.go:312] start watching at 3265467 I1211 20:41:23.533485 1 controller.go:312] start watching at 3266013 I1211 20:42:48.507850 1 controller.go:319] apiserver closed stream I1211 20:42:48.509114 1 controller.go:312] start watching at 3266013 I1211 20:44:31.889275 1 controller.go:319] apiserver closed stream I1211 20:44:31.908745 1 controller.go:312] start watching at 3266013 I1211 20:46:28.043934 1 controller.go:319] apiserver closed stream I1211 20:46:28.048173 1 controller.go:312] start watching at 3266013 I1211 20:46:28.052156 1 controller.go:312] start watching at 3266593 I1211 20:48:15.505110 1 controller.go:319] apiserver closed stream I1211 20:48:15.506275 1 controller.go:312] start watching at 3266593 I1211 20:49:35.664074 1 controller.go:319] apiserver closed stream I1211 20:49:35.665176 1 controller.go:312] start watching at 3266593 I1211 20:51:02.888272 1 controller.go:319] apiserver closed stream I1211 20:51:02.892146 1 controller.go:312] start watching at 3266593 I1211 20:51:02.895821 1 controller.go:312] start watching at 3267119 I1211 20:52:27.170700 1 controller.go:319] apiserver closed stream I1211 20:52:27.172319 1 controller.go:312] start watching at 3267119 I1211 20:53:57.893014 1 controller.go:319] apiserver closed stream I1211 20:54:14.973201 1 controller.go:312] start watching at 3267119 I1211 20:55:50.394286 1 controller.go:319] apiserver closed stream I1211 20:55:51.684325 1 controller.go:312] start watching at 3267119 I1211 20:55:51.926557 1 controller.go:312] start watching at 3267631 I1211 20:57:50.162537 1 controller.go:319] apiserver closed stream I1211 20:57:50.188247 1 controller.go:312] start watching at 3267631 I1211 20:59:08.218085 1 controller.go:319] apiserver closed stream I1211 20:59:08.219319 1 controller.go:312] start watching at 3267631 I1211 21:00:24.749177 1 controller.go:319] apiserver closed stream I1211 21:00:24.755169 1 controller.go:312] start watching at 3267631 I1211 21:00:24.914419 1 controller.go:312] start watching at 3268150 I1211 21:01:55.879921 1 controller.go:319] apiserver closed stream I1211 21:01:55.881196 1 controller.go:312] start watching at 3268150 I1211 21:03:53.551644 1 controller.go:319] apiserver closed stream I1211 21:03:53.552878 1 controller.go:312] start watching at 3268150 I1211 21:05:41.927710 1 controller.go:319] apiserver closed stream I1211 21:05:41.931135 1 controller.go:312] start watching at 3268150 I1211 21:05:41.934156 1 controller.go:312] start watching at 3268754 I1211 21:06:56.980032 1 controller.go:319] apiserver closed stream I1211 21:06:56.983542 1 controller.go:312] start watching at 3268754 I1211 21:07:59.697971 1 controller.go:319] apiserver closed stream I1211 21:07:59.699328 1 controller.go:312] start watching at 3268754 I1211 21:09:37.830895 1 controller.go:319] apiserver closed stream I1211 21:09:37.833497 1 controller.go:312] start watching at 3268754 I1211 21:11:26.003280 1 controller.go:319] apiserver closed stream I1211 21:11:26.008035 1 controller.go:312] start watching at 3268754 I1211 21:11:26.011377 1 controller.go:312] start watching at 3269413 I1211 21:13:00.796435 1 controller.go:319] apiserver closed stream I1211 21:13:00.798030 1 controller.go:312] start watching at 3269413 I1211 21:14:57.968947 1 controller.go:319] apiserver closed stream I1211 21:14:57.972638 1 controller.go:312] start watching at 3269413 I1211 21:14:57.975787 1 controller.go:312] start watching at 3269818 I1211 21:15:58.110498 1 controller.go:319] apiserver closed stream I1211 21:15:58.111497 1 controller.go:312] start watching at 3269818 I1211 21:17:47.503760 1 controller.go:319] apiserver closed stream I1211 21:17:47.504907 1 controller.go:312] start watching at 3269818 I1211 21:19:21.561419 1 controller.go:319] apiserver closed stream I1211 21:19:21.562479 1 controller.go:312] start watching at 3269818 I1211 21:20:30.339157 1 controller.go:319] apiserver closed stream I1211 21:20:30.343377 1 controller.go:312] start watching at 3269818 I1211 21:20:30.346585 1 controller.go:312] start watching at 3270454 I1211 21:21:39.422328 1 controller.go:319] apiserver closed stream I1211 21:21:39.423546 1 controller.go:312] start watching at 3270454 I1211 21:22:45.975354 1 controller.go:319] apiserver closed stream I1211 21:22:45.976517 1 controller.go:312] start watching at 3270454 I1211 21:24:45.727626 1 controller.go:319] apiserver closed stream I1211 21:24:45.728880 1 controller.go:312] start watching at 3270454 I1211 21:26:14.313131 1 controller.go:319] apiserver closed stream I1211 21:26:14.318173 1 controller.go:312] start watching at 3270454 I1211 21:26:14.321509 1 controller.go:312] start watching at 3271110 I1211 21:27:25.914833 1 controller.go:319] apiserver closed stream I1211 21:27:25.916140 1 controller.go:312] start watching at 3271110 I1211 21:28:36.455020 1 controller.go:319] apiserver closed stream I1211 21:28:36.484327 1 controller.go:312] start watching at 3271110 I1211 21:30:15.548877 1 controller.go:319] apiserver closed stream I1211 21:30:15.629749 1 controller.go:312] start watching at 3271110 I1211 21:30:15.864997 1 controller.go:312] start watching at 3271571 I1211 21:31:48.277053 1 controller.go:319] apiserver closed stream I1211 21:31:48.278387 1 controller.go:312] start watching at 3271571 I1211 21:32:53.749268 1 controller.go:319] apiserver closed stream I1211 21:32:53.793130 1 controller.go:312] start watching at 3271571 I1211 21:34:35.269635 1 controller.go:319] apiserver closed stream I1211 21:34:35.270968 1 controller.go:312] start watching at 3271571 I1211 21:35:54.775305 1 controller.go:319] apiserver closed stream I1211 21:35:54.779429 1 controller.go:312] start watching at 3271571 I1211 21:35:54.970479 1 controller.go:312] start watching at 3272218 I1211 21:37:21.296912 1 controller.go:319] apiserver closed stream I1211 21:37:21.298252 1 controller.go:312] start watching at 3272218 I1211 21:38:29.959703 1 controller.go:319] apiserver closed stream I1211 21:38:29.961022 1 controller.go:312] start watching at 3272218 I1211 21:39:37.882824 1 controller.go:319] apiserver closed stream I1211 21:39:37.892033 1 controller.go:312] start watching at 3272218 I1211 21:40:38.516560 1 controller.go:349] event: ADDED { "RuntimeId": "", "tensorboard": { "logDir": "", "volumes": null, "volumeMounts": null, "serviceType": "" }, "replicaSpecs": [ { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=", "--start_from_checkpoint=", "--master=", "--train_dir=", "--num_retries=100", "--alsologtostderr" ], "resources": { "limits": { "nvidia.com/gpu": "1" } } } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "MASTER", "IsDefaultPS": false }, { "replicas": 3, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=", "--start_from_checkpoint=", "--master=", "--train_dir=", "--num_retries=100", "--alsologtostderr" ], "resources": { "limits": { "nvidia.com/gpu": "1" } } } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "WORKER", "IsDefaultPS": false }, { "replicas": 2, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=", "--start_from_checkpoint=", "--master=", "--train_dir=", "--num_retries=100", "--alsologtostderr" ], "resources": {} } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "PS", "IsDefaultPS": false } ] } I1211 21:40:38.516684 1 controller.go:350] TfJob event: ADDED { "RuntimeId": "", "tensorboard": { "logDir": "", "volumes": null, "volumeMounts": null, "serviceType": "" }, "replicaSpecs": [ { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=", "--start_from_checkpoint=", "--master=", "--train_dir=", "--num_retries=100", "--alsologtostderr" ], "resources": { "limits": { "nvidia.com/gpu": "1" } } } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "MASTER", "IsDefaultPS": false }, { "replicas": 3, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=", "--start_from_checkpoint=", "--master=", "--train_dir=", "--num_retries=100", "--alsologtostderr" ], "resources": { "limits": { "nvidia.com/gpu": "1" } } } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "WORKER", "IsDefaultPS": false }, { "replicas": 2, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=", "--start_from_checkpoint=", "--master=", "--train_dir=", "--num_retries=100", "--alsologtostderr" ], "resources": {} } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "PS", "IsDefaultPS": false } ] } E1211 21:40:38.530912 1 training.go:112] TfJob failed to setup: tbReplicaSpec.LogDir must be specified I1211 21:40:38.706580 1 controller.go:349] event: MODIFIED { "RuntimeId": "", "tensorboard": { "logDir": "", "volumes": null, "volumeMounts": null, "serviceType": "" }, "replicaSpecs": [ { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=", "--start_from_checkpoint=", "--master=", "--train_dir=", "--num_retries=100", "--alsologtostderr" ], "resources": { "limits": { "nvidia.com/gpu": "1" } } } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "MASTER", "IsDefaultPS": false }, { "replicas": 3, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=", "--start_from_checkpoint=", "--master=", "--train_dir=", "--num_retries=100", "--alsologtostderr" ], "resources": { "limits": { "nvidia.com/gpu": "1" } } } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "WORKER", "IsDefaultPS": false }, { "replicas": 2, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=", "--start_from_checkpoint=", "--master=", "--train_dir=", "--num_retries=100", "--alsologtostderr" ], "resources": {} } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "PS", "IsDefaultPS": false } ], "tfImage": "tensorflow/tensorflow:1.3.0" } I1211 21:40:38.706685 1 controller.go:350] TfJob event: MODIFIED { "RuntimeId": "", "tensorboard": { "logDir": "", "volumes": null, "volumeMounts": null, "serviceType": "" }, "replicaSpecs": [ { "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=", "--start_from_checkpoint=", "--master=", "--train_dir=", "--num_retries=100", "--alsologtostderr" ], "resources": { "limits": { "nvidia.com/gpu": "1" } } } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "MASTER", "IsDefaultPS": false }, { "replicas": 3, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant_gpu:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=", "--start_from_checkpoint=", "--master=", "--train_dir=", "--num_retries=100", "--alsologtostderr" ], "resources": { "limits": { "nvidia.com/gpu": "1" } } } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "WORKER", "IsDefaultPS": false }, { "replicas": 2, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "name": "tensorflow", "image": "gcr.io/deepvariant-docker/deepvariant:0.4.0", "command": [ "python", "/opt/deepvariant/bin/model_train.zip", "--dataset_config_pbtxt=", "--start_from_checkpoint=", "--master=", "--train_dir=", "--num_retries=100", "--alsologtostderr" ], "resources": {} } ], "restartPolicy": "OnFailure" } }, "tfPort": 2222, "tfReplicaType": "PS", "IsDefaultPS": false } ], "tfImage": "tensorflow/tensorflow:1.3.0" } I1211 21:40:57.899834 1 controller.go:319] apiserver closed stream I1211 21:40:57.963273 1 controller.go:312] start watching at 3272765 I1211 21:42:26.926486 1 controller.go:319] apiserver closed stream I1211 21:42:26.927881 1 controller.go:312] start watching at 3272765 I1211 21:44:20.160188 1 controller.go:319] apiserver closed stream I1211 21:44:20.162232 1 controller.go:312] start watching at 3272765 I1211 21:45:41.608424 1 controller.go:319] apiserver closed stream I1211 21:45:41.612300 1 controller.go:312] start watching at 3272765 I1211 21:45:42.183629 1 controller.go:173] finding existing jobs... I1211 21:45:42.186273 1 controller.go:89] Starting watch at version %v3273345 I1211 21:45:42.186303 1 controller.go:98] starts running from watch version: 3273345 E1211 21:45:42.186338 1 training.go:112] TfJob failed to setup: tbReplicaSpec.LogDir must be specified I1211 21:45:42.187253 1 controller.go:312] start watching at 3273345 I1211 21:47:29.879605 1 controller.go:319] apiserver closed stream I1211 21:47:29.880892 1 controller.go:312] start watching at 3273345 I1211 21:48:51.590808 1 controller.go:319] apiserver closed stream I1211 21:48:51.593488 1 controller.go:312] start watching at 3273345 I1211 21:50:48.302539 1 controller.go:319] apiserver closed stream I1211 21:50:48.304288 1 controller.go:312] start watching at 3273345 I1211 21:50:48.308201 1 controller.go:312] start watching at 3273929 I1211 21:52:20.482450 1 controller.go:319] apiserver closed stream I1211 21:52:20.483983 1 controller.go:312] start watching at 3273929