kubeflow · google-oss-prow · Nov 7, 2022 · Oct 21, 2022 · Oct 21, 2022 · Oct 21, 2022
diff --git a/README.md b/README.md
@@ -17,6 +17,7 @@ run distributed or non-distributed TensorFlow/PyTorch/Apache MXNet/XGBoost/MPI j
   - [Apache MXNet API Definition](pkg/apis/kubeflow.org/v1/mxnet_types.go)
   - [XGBoost API Definition](pkg/apis/kubeflow.org/v1/xgboost_types.go)
   - [MPI API Definition](pkg/apis/kubeflow.org/v1/mpi_types.go)
+  - [PaddlePaddle API Definition](pkg/apis/kubeflow.org/v1/paddlepaddle_types.go)
 - For details on API design, please refer to the [v1alpha2 design doc](https://github.com/kubeflow/community/blob/master/proposals/tf-operator-design-v1alpha2.md).
 - For details of all-in-one operator design, please refer to the [All-in-one Kubeflow Training Operator](https://docs.google.com/document/d/1x1JPDQfDMIbnoQRftDH1IzGU0qvHGSU4W6Jl4rJLPhI/edit#heading=h.e33ufidnl8z6)
 - For details on its observability, please refer to the [monitoring design doc](docs/monitoring/README.md).

diff --git a/cmd/training-operator.v1/main.go b/cmd/training-operator.v1/main.go
@@ -67,7 +67,7 @@ func main() {
 			"Enabling this will ensure there is only one active controller manager.")
 	flag.StringVar(&leaderElectionID, "leader-election-id", "1ca428e5.training-operator.kubeflow.org", "The ID for leader election.")
 	flag.Var(&enabledSchemes, "enable-scheme", "Enable scheme(s) as --enable-scheme=tfjob --enable-scheme=pytorchjob, case insensitive."+
-		" Now supporting TFJob, PyTorchJob, MXNetJob, XGBoostJob. By default, all supported schemes will be enabled.")
+		" Now supporting TFJob, PyTorchJob, MXNetJob, XGBoostJob, PaddleJob. By default, all supported schemes will be enabled.")
 	flag.BoolVar(&enableGangScheduling, "enable-gang-scheduling", false, "Set true to enable gang scheduling")
 	flag.StringVar(&gangSchedulerName, "gang-scheduler-name", "volcano", "The scheduler to gang-schedule kubeflow jobs, defaults to volcano")
 	flag.StringVar(&namespace, "namespace", os.Getenv(commonutil.EnvKubeflowNamespace), "The namespace to monitor kubeflow jobs. If unset, it monitors all namespaces cluster-wide."+

diff --git a/docs/api/kubeflow.org_v1_generated.asciidoc b/docs/api/kubeflow.org_v1_generated.asciidoc
@@ -20,6 +20,8 @@ Package v1 contains API Schema definitions for the kubeflow.org v1 API group
 - xref:{anchor_prefix}-github-com-kubeflow-training-operator-pkg-apis-kubeflow-org-v1-mpijoblist[$$MPIJobList$$]
 - xref:{anchor_prefix}-github-com-kubeflow-training-operator-pkg-apis-kubeflow-org-v1-mxjob[$$MXJob$$]
 - xref:{anchor_prefix}-github-com-kubeflow-training-operator-pkg-apis-kubeflow-org-v1-mxjoblist[$$MXJobList$$]
+- xref:{anchor_prefix}-github-com-kubeflow-training-operator-pkg-apis-kubeflow-org-v1-paddlejob[$$PaddleJob$$]
+- xref:{anchor_prefix}-github-com-kubeflow-training-operator-pkg-apis-kubeflow-org-v1-paddlejoblist[$$PaddleJobList$$]
 - xref:{anchor_prefix}-github-com-kubeflow-training-operator-pkg-apis-kubeflow-org-v1-pytorchjob[$$PyTorchJob$$]
 - xref:{anchor_prefix}-github-com-kubeflow-training-operator-pkg-apis-kubeflow-org-v1-pytorchjoblist[$$PyTorchJobList$$]
 - xref:{anchor_prefix}-github-com-kubeflow-training-operator-pkg-apis-kubeflow-org-v1-tfjob[$$TFJob$$]
@@ -195,6 +197,87 @@ MXJobSpec defines the desired state of MXJob
 
 
 
+[id="{anchor_prefix}-github-com-kubeflow-training-operator-pkg-apis-kubeflow-org-v1-paddleelasticpolicy"]
+==== PaddleElasticPolicy 
+
+
+
+.Appears In:
+****
+- xref:{anchor_prefix}-github-com-kubeflow-training-operator-pkg-apis-kubeflow-org-v1-paddlejobspec[$$PaddleJobSpec$$]
+****
+
+[cols="25a,75a", options="header"]
+|===
+| Field | Description
+| *`minReplicas`* __integer__ | minReplicas is the lower limit for the number of replicas to which the training job can scale down.  It defaults to null.
+| *`maxReplicas`* __integer__ | maxReplicas is the upper limit for the number of pods that can be set by the autoscaler; it cannot be smaller than minReplicas.  It defaults to null.
+| *`maxRestarts`* __integer__ | MaxRestarts is the limit for restart times of pods in elastic mode.
+| *`metrics`* __link:https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.22/#metricspec-v2beta2-autoscaling[$$MetricSpec$$] array__ | Metrics contains the specifications which are used to calculate the desired replica count (the maximum replica count across all metrics will be used).  The desired replica count is calculated by multiplying the ratio between the target value and the current value by the current number of pods. Ergo, metrics used must decrease as the pod count is increased, and vice-versa.  See the individual metric source types for more information about how each type of metric must respond. If not set, the HPA will not be created.
+|===
+
+
+[id="{anchor_prefix}-github-com-kubeflow-training-operator-pkg-apis-kubeflow-org-v1-paddlejob"]
+==== PaddleJob 
+
+PaddleJob represents a PaddleJob resource.
+
+.Appears In:
+****
+- xref:{anchor_prefix}-github-com-kubeflow-training-operator-pkg-apis-kubeflow-org-v1-paddlejoblist[$$PaddleJobList$$]
+****
+
+[cols="25a,75a", options="header"]
+|===
+| Field | Description
+| *`apiVersion`* __string__ | `kubeflow.org/v1`
+| *`kind`* __string__ | `PaddleJob`
+| *`TypeMeta`* __link:https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.22/#typemeta-v1-meta[$$TypeMeta$$]__ | Standard Kubernetes type metadata.
+| *`metadata`* __link:https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.22/#objectmeta-v1-meta[$$ObjectMeta$$]__ | Refer to Kubernetes API documentation for fields of `metadata`.
+
+| *`spec`* __xref:{anchor_prefix}-github-com-kubeflow-training-operator-pkg-apis-kubeflow-org-v1-paddlejobspec[$$PaddleJobSpec$$]__ | Specification of the desired state of the PaddleJob.
+| *`status`* __xref:{anchor_prefix}-github-com-kubeflow-common-pkg-apis-common-v1-jobstatus[$$JobStatus$$]__ | Most recently observed status of the PaddleJob. Read-only (modified by the system).
+|===
+
+
+[id="{anchor_prefix}-github-com-kubeflow-training-operator-pkg-apis-kubeflow-org-v1-paddlejoblist"]
+==== PaddleJobList 
+
+PaddleJobList is a list of PaddleJobs.
+
+
+
+[cols="25a,75a", options="header"]
+|===
+| Field | Description
+| *`apiVersion`* __string__ | `kubeflow.org/v1`
+| *`kind`* __string__ | `PaddleJobList`
+| *`TypeMeta`* __link:https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.22/#typemeta-v1-meta[$$TypeMeta$$]__ | Standard type metadata.
+| *`metadata`* __link:https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.22/#listmeta-v1-meta[$$ListMeta$$]__ | Refer to Kubernetes API documentation for fields of `metadata`.
+
+| *`items`* __xref:{anchor_prefix}-github-com-kubeflow-training-operator-pkg-apis-kubeflow-org-v1-paddlejob[$$PaddleJob$$] array__ | List of PaddleJobs.
+|===
+
+
+[id="{anchor_prefix}-github-com-kubeflow-training-operator-pkg-apis-kubeflow-org-v1-paddlejobspec"]
+==== PaddleJobSpec 
+
+PaddleJobSpec is a desired state description of the PaddleJob.
+
+.Appears In:
+****
+- xref:{anchor_prefix}-github-com-kubeflow-training-operator-pkg-apis-kubeflow-org-v1-paddlejob[$$PaddleJob$$]
+****
+
+[cols="25a,75a", options="header"]
+|===
+| Field | Description
+| *`runPolicy`* __xref:{anchor_prefix}-github-com-kubeflow-common-pkg-apis-common-v1-runpolicy[$$RunPolicy$$]__ | RunPolicy encapsulates various runtime policies of the distributed training job, for example how to clean up resources and how long the job can stay active.
+| *`elasticPolicy`* __xref:{anchor_prefix}-github-com-kubeflow-training-operator-pkg-apis-kubeflow-org-v1-paddleelasticpolicy[$$PaddleElasticPolicy$$]__ | ElasticPolicy holds the elastic policy for the PaddleJob.
+| *`paddleReplicaSpecs`* __object (keys:ReplicaType, values:ReplicaSpec)__ | A map of PaddleReplicaType (type) to ReplicaSpec (value). Specifies the Paddle cluster configuration. For example,   {     "Master": PaddleReplicaSpec,     "Worker": PaddleReplicaSpec,   }
+|===
+
+
 [id="{anchor_prefix}-github-com-kubeflow-training-operator-pkg-apis-kubeflow-org-v1-pytorchjob"]
 ==== PyTorchJob 
 

diff --git a/examples/paddlepaddle/simple-cpu.yaml b/examples/paddlepaddle/simple-cpu.yaml
@@ -0,0 +1,25 @@
+apiVersion: "kubeflow.org/v1"
+kind: PaddleJob
+metadata:
+  name: paddle-simple-cpu
+  namespace: kubeflow
+spec:
+  paddleReplicaSpecs:
+    Worker:
+      replicas: 2
+      restartPolicy: OnFailure
+      template:
+        spec:
+          containers:
+            - name: paddle
+              image: registry.baidubce.com/paddlepaddle/paddle:2.4.0rc0-cpu
+              command:
+                - python
+              args:
+                - "-m"
+                - paddle.distributed.launch
+                - "run_check"
+              ports:
+                - containerPort: 37777
+                  name: master
+              imagePullPolicy: Always
diff --git a/examples/paddlepaddle/simple-gpu.yaml b/examples/paddlepaddle/simple-gpu.yaml
@@ -0,0 +1,36 @@
+apiVersion: "kubeflow.org/v1"
+kind: PaddleJob
+metadata:
+  name: paddle-simple-gpu
+  namespace: kubeflow
+spec:
+  paddleReplicaSpecs:
+    Worker:
+      replicas: 2
+      restartPolicy: OnFailure
+      template:
+        spec:
+          containers:
+            - name: paddle
+              image: registry.baidubce.com/paddlepaddle/paddle:2.4.0rc0-gpu-cuda11.2-cudnn8.1-trt8.0
+              command:
+                - python
+              args:
+                - "-m"
+                - paddle.distributed.launch
+                - "run_check"
+              ports:
+                - containerPort: 37777
+                  name: master
+              imagePullPolicy: Always
+              resources:
+                  limits:
+                      nvidia.com/gpu: 2
+              volumeMounts:
+                  - mountPath: /dev/shm
+                    name: dshm
+          volumes:
+            - name: dshm
+              emptyDir:
+                medium: Memory
+
diff --git a/hack/python-sdk/post_gen.py b/hack/python-sdk/post_gen.py
@@ -58,6 +58,7 @@ def add_imports() -> None:
         init_file.write("from kubeflow.training.api.xgboost_job_client import XGBoostJobClient\n")
         init_file.write("from kubeflow.training.api.mpi_job_client import MPIJobClient\n")
         init_file.write("from kubeflow.training.api.mx_job_client import MXJobClient\n")
+        init_file.write("from kubeflow.training.api.paddle_job_client import PaddleJobClient\n")
     with open(os.path.join(sdk_dir, "kubeflow/__init__.py"), "a") as init_file:
         init_file.write("__path__ = __import__('pkgutil').extend_path(__path__, __name__)")
 

diff --git a/hack/python-sdk/swagger.json b/hack/python-sdk/swagger.json
@@ -235,6 +235,118 @@
       "description": "MXJobStatus defines the observed state of MXJob",
       "type": "object"
     },
+    "kubeflow.org.v1.PaddleElasticPolicy": {
+      "type": "object",
+      "properties": {
+        "maxReplicas": {
+          "description": "upper limit for the number of pods that can be set by the autoscaler; cannot be smaller than MinReplicas, defaults to null.",
+          "type": "integer",
+          "format": "int32"
+        },
+        "maxRestarts": {
+          "description": "MaxRestarts is the limit for restart times of pods in elastic mode.",
+          "type": "integer",
+          "format": "int32"
+        },
+        "metrics": {
+          "description": "Metrics contains the specifications which are used to calculate the desired replica count (the maximum replica count across all metrics will be used).  The desired replica count is calculated with multiplying the ratio between the target value and the current value by the current number of pods. Ergo, metrics used must decrease as the pod count is increased, and vice-versa.  See the individual metric source types for more information about how each type of metric must respond. If not set, the HPA will not be created.",
+          "type": "array",
+          "items": {
+            "default": {},
+            "$ref": "#/definitions/k8s.io.api.autoscaling.v2beta2.MetricSpec"
+          }
+        },
+        "minReplicas": {
+          "description": "minReplicas is the lower limit for the number of replicas to which the training job can scale down.  It defaults to null.",
+          "type": "integer",
+          "format": "int32"
+        }
+      }
+    },
+    "kubeflow.org.v1.PaddleJob": {
+      "description": "PaddleJob Represents a PaddleJob resource.",
+      "type": "object",
+      "properties": {
+        "apiVersion": {
+          "description": "APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources",
+          "type": "string"
+        },
+        "kind": {
+          "description": "Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds",
+          "type": "string"
+        },
+        "metadata": {
+          "default": {},
+          "$ref": "#/definitions/v1.ObjectMeta"
+        },
+        "spec": {
+          "description": "Specification of the desired state of the PaddleJob.",
+          "default": {},
+          "$ref": "#/definitions/kubeflow.org.v1.PaddleJobSpec"
+        },
+        "status": {
+          "description": "Most recently observed status of the PaddleJob. Read-only (modified by the system).",
+          "default": {},
+          "$ref": "#/definitions/v1.JobStatus"
+        }
+      }
+    },
+    "kubeflow.org.v1.PaddleJobList": {
+      "description": "PaddleJobList is a list of PaddleJobs.",
+      "type": "object",
+      "required": [
+        "items"
+      ],
+      "properties": {
+        "apiVersion": {
+          "description": "APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources",
+          "type": "string"
+        },
+        "items": {
+          "description": "List of PaddleJobs.",
+          "type": "array",
+          "items": {
+            "default": {},
+            "$ref": "#/definitions/kubeflow.org.v1.PaddleJob"
+          }
+        },
+        "kind": {
+          "description": "Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds",
+          "type": "string"
+        },
+        "metadata": {
+          "description": "Standard list metadata.",
+          "default": {},
+          "$ref": "#/definitions/v1.ListMeta"
+        }
+      }
+    },
+    "kubeflow.org.v1.PaddleJobSpec": {
+      "description": "PaddleJobSpec is a desired state description of the PaddleJob.",
+      "type": "object",
+      "required": [
+        "runPolicy",
+        "paddleReplicaSpecs"
+      ],
+      "properties": {
+        "elasticPolicy": {
+          "description": "ElasticPolicy holds the elastic policy for paddle job.",
+          "$ref": "#/definitions/kubeflow.org.v1.PaddleElasticPolicy"
+        },
+        "paddleReplicaSpecs": {
+          "description": "A map of PaddleReplicaType (type) to ReplicaSpec (value). Specifies the Paddle cluster configuration. For example,\n  {\n    \"Master\": PaddleReplicaSpec,\n    \"Worker\": PaddleReplicaSpec,\n  }",
+          "type": "object",
+          "additionalProperties": {
+            "$ref": "#/definitions/v1.ReplicaSpec"
+          }
+        },
+        "runPolicy": {
+          "description": "RunPolicy encapsulates various runtime policies of the distributed training job, for example how to clean up resources and how long the job can stay active.",
+          "default": {},
+          "$ref": "#/definitions/v1.RunPolicy"
+        }
+      }
+    },
     "kubeflow.org.v1.PyTorchJob": {
       "description": "PyTorchJob Represents a PyTorchJob resource.",
       "type": "object",

diff --git a/hack/violation_exception.list b/hack/violation_exception.list
@@ -1,5 +1,6 @@
 API rule violation: list_type_missing,github.com/kubeflow/common/pkg/apis/common/v1,JobStatus,Conditions
 API rule violation: list_type_missing,github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1,ElasticPolicy,Metrics
 API rule violation: list_type_missing,github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1,ElasticPolicy,RDZVConf
+API rule violation: list_type_missing,github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1,PaddleElasticPolicy,Metrics
 API rule violation: names_match,github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1,ElasticPolicy,RDZVID
 API rule violation: names_match,github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1,PyTorchJobSpec,PyTorchReplicaSpecs
diff --git a/manifests/base/cluster-role.yaml b/manifests/base/cluster-role.yaml
@@ -14,16 +14,19 @@ rules:
       - mxjobs
       - pytorchjobs
       - xgboostjobs
+      - paddlejobs
       - mpijobs/status
       - tfjobs/status
       - pytorchjobs/status
       - mxjobs/status
       - xgboostjobs/status
+      - paddlejobs/status
       - mpijobs/finalizers
       - tfjobs/finalizers
       - pytorchjobs/finalizers
       - mxjobs/finalizers
       - xgboostjobs/finalizers
+      - paddlejobs/finalizers
     verbs:
       - create
       - delete