Skip to content

Commit

Permalink
Add terminationPolicy to TfJobSpec (#204)
Browse files Browse the repository at this point in the history
* First PR to fix #192 (Don't require master or chief.)

* add terminationPolicy to TfjobSpec

* terminationPolicy should default to using a chief corresponding to the master.
  • Loading branch information
lluunn authored and jlewi committed Dec 11, 2017
1 parent 76fcb3d commit 08ec97f
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 6 deletions.
41 changes: 35 additions & 6 deletions pkg/spec/tf_job.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,12 +42,12 @@ func (c *TfJob) AsOwner() metav1.OwnerReference {
// TODO: In 1.6 this is gonna be "k8s.io/kubernetes/pkg/apis/meta/v1"
// Both api.OwnerReference and metatypes.OwnerReference are combined into that.
return metav1.OwnerReference{
APIVersion: c.APIVersion,
Kind: c.Kind,
Name: c.Metadata.Name,
UID: c.Metadata.UID,
Controller: &trueVar,
BlockOwnerDeletion: &trueVar,
APIVersion: c.APIVersion,
Kind: c.Kind,
Name: c.Metadata.Name,
UID: c.Metadata.UID,
Controller: &trueVar,
BlockOwnerDeletion: &trueVar,
}
}

Expand All @@ -65,6 +65,9 @@ type TfJobSpec struct {
// TfImage defines the tensorflow docker image that should be used for Tensorboard
// and the default parameter server
TfImage string `json:"tfImage,omitempty"`

// TerminationPolicy specifies the condition that the tfjob should be considered finished.
TerminationPolicy *TerminationPolicySpec `json:"terminationPolicy,omitempty"`
}

// TfReplicaType determines how a set of TF processes are handled.
Expand Down Expand Up @@ -109,6 +112,16 @@ type TensorBoardSpec struct {
ServiceType v1.ServiceType `json:"serviceType"`
}

type TerminationPolicySpec struct {
// Chief policy waits for a particular process (which is the chief) to exit.
Chief *ChiefSpec `json:"chief,omitempty"`
}

type ChiefSpec struct {
ReplicaName string `json:"replicaName"`
ReplicaIndex int `json:"replicaIndex"`
}

// Validate checks that the TfJobSpec is valid.
func (c *TfJobSpec) Validate() error {
// Check that each replica has a TensorFlow container.
Expand Down Expand Up @@ -151,6 +164,14 @@ func (c *TfJobSpec) Validate() error {
return fmt.Errorf("Replica type %v is missing a container named %v", r.TfReplicaType, TENSORFLOW)
}
}
if c.TerminationPolicy != nil {
if c.TerminationPolicy.Chief == nil {
return errors.New("invalid termination policy, Chief cannot be nil")
}
if c.TerminationPolicy.Chief.ReplicaName != "MASTER" || c.TerminationPolicy.Chief.ReplicaIndex != 0 {
return errors.New("invalid termination policy, Chief should have replicaName=MASTER and index=0")
}
}
return nil
}

Expand Down Expand Up @@ -240,6 +261,14 @@ func (c *TfJobSpec) SetDefaults() error {
r.setDefaultPSPodTemplateSpec(c.TfImage)
}
}
if c.TerminationPolicy == nil {
c.TerminationPolicy = &TerminationPolicySpec{
Chief: &ChiefSpec{
ReplicaName: "MASTER",
ReplicaIndex: 0,
},
}
}
return nil
}

Expand Down
12 changes: 12 additions & 0 deletions pkg/spec/tf_job_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,12 @@ func TestSetDefaults(t *testing.T) {
},
},
TfImage: "tensorflow/tensorflow:1.3.0",
TerminationPolicy: &TerminationPolicySpec{
Chief: &ChiefSpec{
ReplicaName: "MASTER",
ReplicaIndex: 0,
},
},
},
},
{
Expand Down Expand Up @@ -312,6 +318,12 @@ func TestSetDefaults(t *testing.T) {
},
},
TfImage: "tensorflow/tensorflow:1.3.0",
TerminationPolicy: &TerminationPolicySpec{
Chief: &ChiefSpec{
ReplicaName: "MASTER",
ReplicaIndex: 0,
},
},
},
},
}
Expand Down

0 comments on commit 08ec97f

Please sign in to comment.