kubeflow · Jeffwan · Aug 5, 2021 · Aug 3, 2021
diff --git a/config/crd/bases/kubeflow.org_mxjobs.yaml b/config/crd/bases/kubeflow.org_mxjobs.yaml
@@ -36,20 +36,6 @@ spec:
           spec:
             description: MXJobSpec defines the desired state of MXJob
             properties:
-              activeDeadlineSeconds:
-                description: Specifies the duration in seconds relative to the startTime
-                  that the job may be active before the system tries to terminate
-                  it; value must be positive integer.
-                format: int64
-                type: integer
-              backoffLimit:
-                description: Optional number of retries before marking this job failed.
-                format: int32
-                type: integer
-              cleanPodPolicy:
-                description: CleanPodPolicy defines the policy to kill pods after
-                  the job completes. Default to Running.
-                type: string
               jobMode:
                 description: JobMode specify the kind of MXjob to do. Different mode
                   may have different MXReplicaSpecs request
@@ -6773,34 +6759,55 @@ spec:
                   common.ReplicaSpec,     "Server": common.ReplicaSpec,     "Worker":
                   common.ReplicaSpec,   }'
                 type: object
-              schedulingPolicy:
-                description: SchedulingPolicy defines the policy related to scheduling,
-                  e.g. gang-scheduling
+              runPolicy:
+                description: RunPolicy encapsulates various runtime policies of the
+                  distributed training job, for example how to clean up resources
+                  and how long the job can stay active.
                 properties:
-                  minAvailable:
+                  activeDeadlineSeconds:
+                    description: Specifies the duration in seconds relative to the
+                      startTime that the job may be active before the system tries
+                      to terminate it; value must be a positive integer.
+                    format: int64
+                    type: integer
+                  backoffLimit:
+                    description: Optional number of retries before marking this job
+                      failed.
                     format: int32
                     type: integer
-                  minResources:
-                    additionalProperties:
-                      anyOf:
-                      - type: integer
-                      - type: string
-                      pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
-                      x-kubernetes-int-or-string: true
-                    description: ResourceList is a set of (resource name, quantity)
-                      pairs.
-                    type: object
-                  priorityClass:
-                    type: string
-                  queue:
+                  cleanPodPolicy:
+                    description: CleanPodPolicy defines the policy to kill pods after
+                      the job completes. Defaults to Running.
                     type: string
+                  schedulingPolicy:
+                    description: SchedulingPolicy defines the policy related to scheduling,
+                      e.g. gang-scheduling
+                    properties:
+                      minAvailable:
+                        format: int32
+                        type: integer
+                      minResources:
+                        additionalProperties:
+                          anyOf:
+                          - type: integer
+                          - type: string
+                          pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
+                          x-kubernetes-int-or-string: true
+                        description: ResourceList is a set of (resource name, quantity)
+                          pairs.
+                        type: object
+                      priorityClass:
+                        type: string
+                      queue:
+                        type: string
+                    type: object
+                  ttlSecondsAfterFinished:
+                    description: TTLSecondsAfterFinished is the TTL to clean up jobs.
+                      It may take extra ReconcilePeriod seconds for the cleanup, since
+                      reconcile gets called periodically. Defaults to infinite.
+                    format: int32
+                    type: integer
                 type: object
-              ttlSecondsAfterFinished:
-                description: TTLSecondsAfterFinished is the TTL to clean up jobs.
-                  It may take extra ReconcilePeriod seconds for the cleanup, since
-                  reconcile gets called periodically. Default to infinite.
-                format: int32
-                type: integer
             required:
             - jobMode
             - mxReplicaSpecs

diff --git a/config/crd/bases/kubeflow.org_pytorchjobs.yaml b/config/crd/bases/kubeflow.org_pytorchjobs.yaml
@@ -31,24 +31,12 @@ spec:
               object represents. Servers may infer this from the endpoint the client
               submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds'
             type: string
+          metadata:
+            description: Standard Kubernetes object's metadata.
+            type: object
           spec:
             description: Specification of the desired state of the PyTorchJob.
             properties:
-              activeDeadlineSeconds:
-                description: Specifies the duration (in seconds) since startTime during
-                  which the job can remain active before it is terminated. Must be
-                  a positive integer. This setting applies only to pods where restartPolicy
-                  is OnFailure or Always.
-                format: int64
-                type: integer
-              backoffLimit:
-                description: Number of retries before marking this job as failed.
-                format: int32
-                type: integer
-              cleanPodPolicy:
-                description: Defines the policy for cleaning up pods after the PyTorchJob
-                  completes. Defaults to None.
-                type: string
               pytorchReplicaSpecs:
                 additionalProperties:
                   description: ReplicaSpec is a description of the replica
@@ -6767,13 +6755,55 @@ spec:
                   Specifies the PyTorch cluster configuration. For example,   {     "Master":
                   PyTorchReplicaSpec,     "Worker": PyTorchReplicaSpec,   }'
                 type: object
-              ttlSecondsAfterFinished:
-                description: Defines the TTL for cleaning up finished PyTorchJobs
-                  (temporary before Kubernetes adds the cleanup controller). It may
-                  take extra ReconcilePeriod seconds for the cleanup, since reconcile
-                  gets called periodically. Defaults to infinite.
-                format: int32
-                type: integer
+              runPolicy:
+                description: RunPolicy encapsulates various runtime policies of the
+                  distributed training job, for example how to clean up resources
+                  and how long the job can stay active.
+                properties:
+                  activeDeadlineSeconds:
+                    description: Specifies the duration in seconds relative to the
+                      startTime that the job may be active before the system tries
+                      to terminate it; value must be a positive integer.
+                    format: int64
+                    type: integer
+                  backoffLimit:
+                    description: Optional number of retries before marking this job
+                      failed.
+                    format: int32
+                    type: integer
+                  cleanPodPolicy:
+                    description: CleanPodPolicy defines the policy to kill pods after
+                      the job completes. Defaults to Running.
+                    type: string
+                  schedulingPolicy:
+                    description: SchedulingPolicy defines the policy related to scheduling,
+                      e.g. gang-scheduling
+                    properties:
+                      minAvailable:
+                        format: int32
+                        type: integer
+                      minResources:
+                        additionalProperties:
+                          anyOf:
+                          - type: integer
+                          - type: string
+                          pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
+                          x-kubernetes-int-or-string: true
+                        description: ResourceList is a set of (resource name, quantity)
+                          pairs.
+                        type: object
+                      priorityClass:
+                        type: string
+                      queue:
+                        type: string
+                    type: object
+                  ttlSecondsAfterFinished:
+                    description: TTLSecondsAfterFinished is the TTL to clean up jobs.
+                      It may take extra ReconcilePeriod seconds for the cleanup, since
+                      reconcile gets called periodically. Defaults to infinite.
+                    format: int32
+                    type: integer
+                type: object
             required:
             - pytorchReplicaSpecs
             type: object

diff --git a/config/crd/bases/kubeflow.org_tfjobs.yaml b/config/crd/bases/kubeflow.org_tfjobs.yaml
@@ -31,47 +31,63 @@ spec:
               object represents. Servers may infer this from the endpoint the client
               submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds'
             type: string
+          metadata:
+            description: Standard Kubernetes object's metadata.
+            type: object
           spec:
             description: Specification of the desired state of the TFJob.
             properties:
-              activeDeadlineSeconds:
-                description: Specifies the duration in seconds relative to the startTime
-                  that the job may be active before the system tries to terminate
-                  it; value must be positive integer.
-                format: int64
-                type: integer
-              backoffLimit:
-                description: Optional number of retries before marking this job failed.
-                format: int32
-                type: integer
-              cleanPodPolicy:
-                description: CleanPodPolicy defines the policy to kill pods after
-                  the job completes. Default to Running.
-                type: string
               enableDynamicWorker:
                 description: A switch to enable dynamic worker
                 type: boolean
-              schedulingPolicy:
-                description: SchedulingPolicy defines the policy related to scheduling,
-                  e.g. gang-scheduling
+              runPolicy:
+                description: RunPolicy encapsulates various runtime policies of the
+                  distributed training job, for example how to clean up resources
+                  and how long the job can stay active.
                 properties:
-                  minAvailable:
+                  activeDeadlineSeconds:
+                    description: Specifies the duration in seconds relative to the
+                      startTime that the job may be active before the system tries
+                      to terminate it; value must be a positive integer.
+                    format: int64
+                    type: integer
+                  backoffLimit:
+                    description: Optional number of retries before marking this job
+                      failed.
                     format: int32
                     type: integer
-                  minResources:
-                    additionalProperties:
-                      anyOf:
-                      - type: integer
-                      - type: string
-                      pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
-                      x-kubernetes-int-or-string: true
-                    description: ResourceList is a set of (resource name, quantity)
-                      pairs.
-                    type: object
-                  priorityClass:
-                    type: string
-                  queue:
+                  cleanPodPolicy:
+                    description: CleanPodPolicy defines the policy to kill pods after
+                      the job completes. Defaults to Running.
                     type: string
+                  schedulingPolicy:
+                    description: SchedulingPolicy defines the policy related to scheduling,
+                      e.g. gang-scheduling
+                    properties:
+                      minAvailable:
+                        format: int32
+                        type: integer
+                      minResources:
+                        additionalProperties:
+                          anyOf:
+                          - type: integer
+                          - type: string
+                          pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
+                          x-kubernetes-int-or-string: true
+                        description: ResourceList is a set of (resource name, quantity)
+                          pairs.
+                        type: object
+                      priorityClass:
+                        type: string
+                      queue:
+                        type: string
+                    type: object
+                  ttlSecondsAfterFinished:
+                    description: TTLSecondsAfterFinished is the TTL to clean up jobs.
+                      It may take extra ReconcilePeriod seconds for the cleanup, since
+                      reconcile gets called periodically. Defaults to infinite.
+                    format: int32
+                    type: integer
                 type: object
               successPolicy:
                 description: SuccessPolicy defines the policy to mark the TFJob as
@@ -6795,12 +6811,6 @@ spec:
                   Specifies the TF cluster configuration. For example,   {     "PS":
                   ReplicaSpec,     "Worker": ReplicaSpec,   }'
                 type: object
-              ttlSecondsAfterFinished:
-                description: TTLSecondsAfterFinished is the TTL to clean up jobs.
-                  It may take extra ReconcilePeriod seconds for the cleanup, since
-                  reconcile gets called periodically. Default to infinite.
-                format: int32
-                type: integer
             required:
             - tfReplicaSpecs
             type: object