kubeflow · k8s-ci-robot · Nov 14, 2019 · Nov 8, 2019
diff --git a/components/kubeflow/katib-launcher/Dockerfile b/components/kubeflow/katib-launcher/Dockerfile
@@ -14,16 +14,17 @@
 FROM ubuntu:16.04
 
+# Install the Python toolchain and the launcher's pip dependencies in one layer;
+# the apt package lists are deleted in the same layer so they do not bloat the image.
+# NOTE(review): kubernetes is unpinned while pyyaml is pinned - consider pinning it.
 RUN apt-get update -y && \
-    apt-get install --no-install-recommends -y -q ca-certificates python-dev python-setuptools wget unzip git && \
+    apt-get install --no-install-recommends -y -q ca-certificates python-dev python-setuptools wget && \
     easy_install pip && \
-    pip install pyyaml==3.12 six==1.11.0 requests==2.18.4 grpcio gcloud google-api-python-client protobuf kubernetes && \
-    wget https://github.com/kubeflow/katib/archive/master.zip && unzip master.zip
-
-ENV PYTHONPATH $PYTHONPATH:/katib-master/pkg/api/v1alpha1/python:/katib-master/py
+    pip install pyyaml==3.12 kubernetes && \
+    rm -rf /var/lib/apt/lists/*
 
 ADD build /ml
 
 RUN mkdir /usr/licenses && \
     /ml/license.sh /ml/third_party_licenses.csv /usr/licenses
 
-ENTRYPOINT ["python", "/ml/launch_study_job.py"]
+ENTRYPOINT ["python", "/ml/launch_experiment.py"]
diff --git a/components/kubeflow/katib-launcher/build_image.sh b/components/kubeflow/katib-launcher/build_image.sh
@@ -38,7 +38,7 @@ rsync -arvp ./src/ ./build/
 cp ../../license.sh ./build
 cp ../../third_party_licenses.csv ./build
 
-LOCAL_LAUNCHER_IMAGE_NAME=ml-pipeline-kubeflow-studyjob
+LOCAL_LAUNCHER_IMAGE_NAME=ml-pipeline-kubeflow-experiment
 
 docker build -t ${LOCAL_LAUNCHER_IMAGE_NAME} .
 if [ -z "${LAUNCHER_IMAGE_NAME}" ]; then

diff --git a/components/kubeflow/katib-launcher/component.yaml b/components/kubeflow/katib-launcher/component.yaml
@@ -1,40 +1,38 @@
-name: Kubeflow - Launch StudyJob
-description: Kubeflow StudyJob launcher
+name: Kubeflow - Launch Experiment
+description: Kubeflow Experiment launcher
 inputs:
-- {name: StudyJob name,             type: String,                       description: 'Job name.'}
-- {name: Namespace,                 type: String,   default: kubeflow,  description: 'Namespace.'}
-- {name: Optimization type,         type: String,   default: minimize,  description: 'Direction of optimization. minimize or maximize.'}
-- {name: Objective value name,      type: String,                       description: 'Objective value name which trainer optimizes.'}
-- {name: Optimization goal,         type: Float,                        description: 'Stop studying once objectivevaluename value exceeds optimizationgoal.'}
-- {name: Request count,             type: Integer,  default: 1,         description: 'Number of requests to the suggestion service.'}
-- {name: Metrics names,             type: String,                       description: 'List of metric names (comma-delimited).'}
-- {name: Parameter configs,         type: YAML,     default: '',        description: 'Parameter configs (YAML/JSON format).'}
-- {name: NAS config,                type: YAML,     default: '',        description: 'NAS config (YAML/JSON format).'}
-- {name: Worker template path,      type: String,   default: '',        description: 'Worker spec.'}
-- {name: Metrics collector template path, type: String, default: '',    description: 'Metrics collector spec.'}
-- {name: Suggestion spec,           type: YAML,     default: '',        description: 'Suggestion spec (YAML/JSON format).'}
-- {name: StudyJob timeout minutes,  type: Integer,  default: '10',      description: 'Time in minutes to wait for the StudyJob to complete.'}
-- {name: Delete finished job,       type: Bool,  default: 'True',    description: 'Whether to delete the job after it is finished.'}
+- {name: Experiment Name,           type: String,                       description: 'Experiment name.'}  # forwarded to the launcher as --name
+- {name: Experiment Namespace,      type: String,   default: kubeflow,  description: 'Experiment namespace.'}
+- {name: Experiment Version,        type: String,   default: v1alpha3,  description: 'Experiment version.'}  # forwarded to the launcher as --version
+- {name: Max Trial Count,           type: Integer,                      description: 'How many trials can be executed at most.'}  # no default declared, unlike the two trial-count inputs below
+- {name: Max Failed Trial Count,    type: Integer,  default: 3,         description: 'How many trials can fail at most.'}
+- {name: Parallel Trial Count,      type: Integer,  default: 3,         description: 'How many trials can be running in parallel at most.'}
+- {name: Objective,                 type: JSON,                         description: 'Experiment objective.'}  # the four JSON inputs from here to Parameters declare no default
+- {name: Algorithm,                 type: JSON,                         description: 'Experiment algorithm.'}
+- {name: Trial Template,            type: JSON,                         description: 'Experiment trialTemplate.'}
+- {name: Parameters,                type: JSON,                         description: 'Experiment Parameter configuration.'}
+- {name: Metrics Collector,         type: JSON,   default: '{}',        description: 'Experiment metricsCollector.'}  # defaults to an empty JSON object
+- {name: Experiment Timeout Minutes, type: Integer,  default: 1440,     description: 'Time in minutes to wait for the Experiment to complete.'}  # 1440 minutes = 24 hours
+- {name: Delete Finished Experiment, type: Bool,     default: 'True',   description: 'Whether to delete the experiment after it is finished.'}  # NOTE(review): default is the string 'True', not a YAML boolean - confirm the launcher parses it
 outputs:
-- {name: Best parameter set,        type: JSON,                         description: 'The parameter set of the best StudyJob trial.'}
+- {name: Best Parameter Set,        type: JSON,                         description: 'The parameter set of the best Experiment trial.'}  # its output path is handed to the launcher via --outputFile
 implementation:
   container:
-    image: gcr.io/ml-pipeline/ml-pipeline-kubeflow-studyjob:2c2445df83fa879387a200747cc20f72a7ee9727
-    command: [python, /ml/launch_study_job.py]
+    image: liuhougangxa/katib-experiment-launcher:latest  # NOTE(review): mutable :latest tag on a non-gcr.io repository; the replaced image was pinned to a commit hash - consider pinning
+    command: [python, /ml/launch_experiment.py]  # same script the Dockerfile sets as ENTRYPOINT
     args: [
-      --name,                   {inputValue: StudyJob name},
-      --namespace,              {inputValue: Namespace},
-      --optimizationtype,       {inputValue: Optimization type},
-      --objectivevaluename,     {inputValue: Objective value name},
-      --optimizationgoal,       {inputValue: Optimization goal},
-      --requestcount,           {inputValue: Request count},
-      --metricsnames,           {inputValue: Metrics names},
-      --parameterconfigs,       {inputValue: Parameter configs},
-      --nasConfig,              {inputValue: NAS config},
-      --workertemplatepath,     {inputValue: Worker template path},
-      --mcollectortemplatepath, {inputValue: Metrics collector template path},
-      --suggestionspec,         {inputValue: Suggestion spec},
-      --studyjobtimeoutminutes, {inputValue: StudyJob timeout minutes},
-      --deleteAfterDone,        {inputValue: Delete finished job},
-      --outputfile,             {outputPath: Best parameter set},
+      --name,                     {inputValue: Experiment Name},  # each flag below is paired with one declared input
+      --namespace,                {inputValue: Experiment Namespace},
+      --version,                  {inputValue: Experiment Version},
+      --maxTrialCount,            {inputValue: Max Trial Count},
+      --maxFailedTrialCount,      {inputValue: Max Failed Trial Count},
+      --parallelTrialCount,       {inputValue: Parallel Trial Count},
+      --objectiveConfig,          {inputValue: Objective},
+      --algorithmConfig,          {inputValue: Algorithm},
+      --trialTemplate,            {inputValue: Trial Template},
+      --parameters,               {inputValue: Parameters},
+      --metricsCollector,         {inputValue: Metrics Collector},
+      --experimentTimeoutMinutes, {inputValue: Experiment Timeout Minutes},
+      --deleteAfterDone,          {inputValue: Delete Finished Experiment},
+      --outputFile,               {outputPath: Best Parameter Set},  # the only outputPath entry: a path for the declared output rather than an input value
     ]
diff --git a/components/kubeflow/katib-launcher/kubeflow_katib_launcher_op.py b/components/kubeflow/katib-launcher/kubeflow_katib_launcher_op.py
diff --git a/components/kubeflow/katib-launcher/sample.py b/components/kubeflow/katib-launcher/sample.py
@@ -0,0 +1,114 @@
+import json
+import kfp.dsl as dsl
+
+@dsl.pipeline(  # pipeline metadata: name and description
+    name="Launch katib experiment",
+    description="An example to launch katib experiment."
+)
+def mnist_hpo(  # sample pipeline: run the Katib experiment launcher, then echo its output
+        name="mnist",
+        namespace="kubeflow",
+        goal=0.99,
+        parallelTrialCount=3,
+        maxTrialCount=12,
+        experimentTimeoutMinutes=60,
+        deleteAfterDone=True):
+    objectiveConfig = {  # objective settings; "goal" comes from the pipeline argument
+      "type": "maximize",
+      "goal": goal,
+      "objectiveMetricName": "Validation-accuracy",
+      "additionalMetricNames": ["accuracy"]
+    }
+    algorithmConfig = {"algorithmName" : "random"}  # search algorithm selection
+    parameters = [  # one entry per tuned argument: name, type and feasible values
+      {"name": "--lr", "parameterType": "double", "feasibleSpace": {"min": "0.01","max": "0.03"}},
+      {"name": "--num-layers", "parameterType": "int", "feasibleSpace": {"min": "2", "max": "5"}},
+      {"name": "--optimizer", "parameterType": "categorical", "feasibleSpace": {"list": ["sgd", "adam", "ftrl"]}}
+    ]
+    rawTemplate = {  # batch/v1 Job manifest for one trial; the {{...}} placeholders are presumably expanded by Katib - confirm
+      "apiVersion": "batch/v1",
+      "kind": "Job",
+      "metadata": {
+         "name": "{{.Trial}}",
+         "namespace": "{{.NameSpace}}"
+      },
+      "spec": {
+        "template": {
+          "spec": {
+            "restartPolicy": "Never",
+            "containers": [
+              {"name": "{{.Trial}}",
+               "image": "docker.io/katib/mxnet-mnist-example",
+               "command": [
+                   "python /mxnet/example/image-classification/train_mnist.py --batch-size=64 {{- with .HyperParameters}} {{- range .}} {{.Name}}={{.Value}} {{- end}} {{- end}}"
+               ]
+              }
+            ]
+          }
+        }
+      }
+    }
+    trialTemplate = {
+      "goTemplate": {
+        "rawTemplate": json.dumps(rawTemplate)  # the manifest is embedded as a JSON string
+      }
+    }
+    op1 = katib_experiment_launcher_op(  # helper defined later in this file
+            name,
+            namespace,
+            parallelTrialCount=parallelTrialCount,
+            maxTrialCount=maxTrialCount,
+            objectiveConfig=str(objectiveConfig),  # NOTE(review): str() yields the Python repr, not JSON - confirm the launcher accepts it
+            algorithmConfig=str(algorithmConfig),
+            trialTemplate=str(trialTemplate),
+            parameters=str(parameters),
+            experimentTimeoutMinutes=experimentTimeoutMinutes,
+            deleteAfterDone=deleteAfterDone
+    )
+
+    op_out = dsl.ContainerOp(  # second step: prints the launcher step's output
+        name="my-out-cop",
+        image="library/bash:4.4.23",
+        command=["sh", "-c"],
+        arguments=["echo hyperparameter: %s" % op1.output],
+    )
+
+
+def katib_experiment_launcher_op(  # builds a dsl.ContainerOp for the launcher image, mapping each argument to a CLI flag
+      name,
+      namespace,
+      maxTrialCount=100,
+      parallelTrialCount=3,
+      maxFailedTrialCount=3,
+      objectiveConfig='{}',
+      algorithmConfig='{}',
+      metricsCollector='{}',
+      trialTemplate='{}',
+      parameters='[]',
+      experimentTimeoutMinutes=60,
+      deleteAfterDone=True,
+      outputFile='/output.txt'):  # passed as --outputFile and reused as the file_outputs path
+    return dsl.ContainerOp(
+        name = "mnist-hpo",  # NOTE(review): step name is fixed; the `name` argument only feeds --name
+        image = 'liuhougangxa/katib-experiment-launcher:latest',  # NOTE(review): mutable :latest tag - consider pinning
+        arguments = [
+            '--name', name,
+            '--namespace', namespace,
+            '--maxTrialCount', maxTrialCount,
+            '--maxFailedTrialCount', maxFailedTrialCount,
+            '--parallelTrialCount', parallelTrialCount,
+            '--objectiveConfig', objectiveConfig,
+            '--algorithmConfig', algorithmConfig,
+            '--metricsCollector', metricsCollector,
+            '--trialTemplate', trialTemplate,
+            '--parameters', parameters,
+            '--outputFile', outputFile,
+            '--deleteAfterDone', deleteAfterDone,
+            '--experimentTimeoutMinutes', experimentTimeoutMinutes,
+        ],
+        file_outputs = {'bestHyperParameter': outputFile}  # maps the output name 'bestHyperParameter' to that file
+    )
+
+if __name__ == "__main__":  # run as a script: compile the pipeline defined above
+    import kfp.compiler as compiler
+    compiler.Compiler().compile(mnist_hpo, __file__ + ".tar.gz")  # package path is this file's path plus ".tar.gz"
diff --git a/components/kubeflow/katib-launcher/sample2.py b/components/kubeflow/katib-launcher/sample2.py
@@ -0,0 +1,80 @@
+import json
+from kfp import components
+import kfp.dsl as dsl
+
+@dsl.pipeline(  # pipeline metadata: name and description
+    name="Launch katib experiment",
+    description="An example to launch katib experiment."
+)
+def mnist_hpo(  # sample pipeline: load the launcher from component.yaml, run it, then echo its output
+        name="mnist",
+        namespace="kubeflow",
+        goal=0.99,
+        parallelTrialCount=3,
+        maxTrialCount=12,
+        experimentTimeoutMinutes=60,
+        deleteAfterDone=True):
+    objectiveConfig = {  # objective settings; "goal" comes from the pipeline argument
+      "type": "maximize",
+      "goal": goal,
+      "objectiveMetricName": "Validation-accuracy",
+      "additionalMetricNames": ["accuracy"]
+    }
+    algorithmConfig = {"algorithmName" : "random"}  # search algorithm selection
+    parameters = [  # one entry per tuned argument: name, type and feasible values
+      {"name": "--lr", "parameterType": "double", "feasibleSpace": {"min": "0.01","max": "0.03"}},
+      {"name": "--num-layers", "parameterType": "int", "feasibleSpace": {"min": "2", "max": "5"}},
+      {"name": "--optimizer", "parameterType": "categorical", "feasibleSpace": {"list": ["sgd", "adam", "ftrl"]}}
+    ]
+    rawTemplate = {  # batch/v1 Job manifest for one trial; the {{...}} placeholders are presumably expanded by Katib - confirm
+      "apiVersion": "batch/v1",
+      "kind": "Job",
+      "metadata": {
+         "name": "{{.Trial}}",
+         "namespace": "{{.NameSpace}}"
+      },
+      "spec": {
+        "template": {
+          "spec": {
+            "restartPolicy": "Never",
+            "containers": [
+              {"name": "{{.Trial}}",
+               "image": "docker.io/katib/mxnet-mnist-example",
+               "command": [
+                   "python /mxnet/example/image-classification/train_mnist.py --batch-size=64 {{- with .HyperParameters}} {{- range .}} {{.Name}}={{.Value}} {{- end}} {{- end}}"
+               ]
+              }
+            ]
+          }
+        }
+      }
+    }
+    trialTemplate = {
+      "goTemplate": {
+        "rawTemplate": json.dumps(rawTemplate)  # the manifest is embedded as a JSON string
+      }
+    }
+    katib_experiment_launcher_op = components.load_component_from_file("./component.yaml")  # relative path - presumably resolved against the current working directory
+    # katib_experiment_launcher_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/master/components/kubeflow/katib-launcher/component.yaml')
+    op1 = katib_experiment_launcher_op(  # keyword names mirror the component.yaml input names in lower snake_case
+            experiment_name=name,
+            experiment_namespace=namespace,
+            parallel_trial_count=parallelTrialCount,
+            max_trial_count=maxTrialCount,
+            objective=str(objectiveConfig),  # NOTE(review): str() yields the Python repr, not JSON - confirm the launcher accepts it
+            algorithm=str(algorithmConfig),
+            trial_template=str(trialTemplate),
+            parameters=str(parameters),
+            experiment_timeout_minutes=experimentTimeoutMinutes,
+            delete_finished_experiment=deleteAfterDone)
+
+    op_out = dsl.ContainerOp(  # second step: prints the launcher step's output
+        name="my-out-cop",
+        image="library/bash:4.4.23",
+        command=["sh", "-c"],
+        arguments=["echo hyperparameter: %s" % op1.output],
+    )
+
+if __name__ == "__main__":  # run as a script: compile the pipeline defined above
+    import kfp.compiler as compiler
+    compiler.Compiler().compile(mnist_hpo, __file__ + ".tar.gz")  # package path is this file's path plus ".tar.gz"
diff --git a/components/kubeflow/katib-launcher/src/__init__.py b/components/kubeflow/katib-launcher/src/__init__.py
@@ -11,5 +11,3 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-from .kubeflow_katib_launcher_op import kubeflow_studyjob_launcher_op
diff --git a/components/kubeflow/katib-launcher/src/hp.template.yaml b/components/kubeflow/katib-launcher/src/hp.template.yaml