Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Katib experiment launcher #2577

Merged
merged 1 commit into from
Nov 14, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 3 additions & 6 deletions components/kubeflow/katib-launcher/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,13 @@
FROM ubuntu:16.04

RUN apt-get update -y && \
apt-get install --no-install-recommends -y -q ca-certificates python-dev python-setuptools wget unzip git && \
apt-get install --no-install-recommends -y -q ca-certificates python-dev python-setuptools wget && \
easy_install pip && \
pip install pyyaml==3.12 six==1.11.0 requests==2.18.4 grpcio gcloud google-api-python-client protobuf kubernetes && \
wget https://github.com/kubeflow/katib/archive/master.zip && unzip master.zip

ENV PYTHONPATH $PYTHONPATH:/katib-master/pkg/api/v1alpha1/python:/katib-master/py
pip install pyyaml==3.12 kubernetes

ADD build /ml

RUN mkdir /usr/licenses && \
/ml/license.sh /ml/third_party_licenses.csv /usr/licenses

ENTRYPOINT ["python", "/ml/launch_study_job.py"]
ENTRYPOINT ["python", "/ml/launch_experiment.py"]
2 changes: 1 addition & 1 deletion components/kubeflow/katib-launcher/build_image.sh
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ rsync -arvp ./src/ ./build/
cp ../../license.sh ./build
cp ../../third_party_licenses.csv ./build

LOCAL_LAUNCHER_IMAGE_NAME=ml-pipeline-kubeflow-studyjob
LOCAL_LAUNCHER_IMAGE_NAME=ml-pipeline-kubeflow-experiment

docker build -t ${LOCAL_LAUNCHER_IMAGE_NAME} .
if [ -z "${LAUNCHER_IMAGE_NAME}" ]; then
Expand Down
66 changes: 32 additions & 34 deletions components/kubeflow/katib-launcher/component.yaml
Original file line number Diff line number Diff line change
@@ -1,40 +1,38 @@
name: Kubeflow - Launch StudyJob
description: Kubeflow StudyJob launcher
name: Kubeflow - Launch Experiment
description: Kubeflow Experiment launcher
inputs:
- {name: StudyJob name, type: String, description: 'Job name.'}
- {name: Namespace, type: String, default: kubeflow, description: 'Namespace.'}
- {name: Optimization type, type: String, default: minimize, description: 'Direction of optimization. minimize or maximize.'}
- {name: Objective value name, type: String, description: 'Objective value name which trainer optimizes.'}
- {name: Optimization goal, type: Float, description: 'Stop studying once objectivevaluename value exceeds optimizationgoal.'}
- {name: Request count, type: Integer, default: 1, description: 'Number of requests to the suggestion service.'}
- {name: Metrics names, type: String, description: 'List of metric names (comma-delimited).'}
- {name: Parameter configs, type: YAML, default: '', description: 'Parameter configs (YAML/JSON format).'}
- {name: NAS config, type: YAML, default: '', description: 'NAS config (YAML/JSON format).'}
- {name: Worker template path, type: String, default: '', description: 'Worker spec.'}
- {name: Metrics collector template path, type: String, default: '', description: 'Metrics collector spec.'}
- {name: Suggestion spec, type: YAML, default: '', description: 'Suggestion spec (YAML/JSON format).'}
- {name: StudyJob timeout minutes, type: Integer, default: '10', description: 'Time in minutes to wait for the StudyJob to complete.'}
- {name: Delete finished job, type: Bool, default: 'True', description: 'Whether to delete the job after it is finished.'}
- {name: Experiment Name, type: String, description: 'Experiment name.'}
- {name: Experiment Namespace, type: String, default: kubeflow, description: 'Experiment namespace.'}
- {name: Experiment Version, type: String, default: v1alpha3, description: 'Experiment version.'}
- {name: Max Trial Count, type: Integer, description: 'How many trials can be executed at most.'}
- {name: Max Failed Trial Count, type: Integer, default: 3, description: 'How many trials can fail at most.'}
- {name: Parallel Trial Count, type: Integer, default: 3, description: 'How many trials can be running in parallel at most.'}
- {name: Objective, type: JSON, description: 'Experiment objective.'}
- {name: Algorithm, type: JSON, description: 'Experiment algorithm.'}
- {name: Trial Template, type: JSON, description: 'Experiment trialTemplate.'}
- {name: Parameters, type: JSON, description: 'Experiment Parameter configuration.'}
- {name: Metrics Collector, type: JSON, default: '{}', description: 'Experiment metricsCollector.'}
- {name: Experiment Timeout Minutes, type: Integer, default: 1440, description: 'Time in minutes to wait for the Experiment to complete.'}
- {name: Delete Finished Experiment, type: Bool, default: 'True', description: 'Whether to delete the experiment after it is finished.'}
outputs:
- {name: Best parameter set, type: JSON, description: 'The parameter set of the best StudyJob trial.'}
- {name: Best Parameter Set, type: JSON, description: 'The parameter set of the best Experiment trial.'}
implementation:
container:
image: gcr.io/ml-pipeline/ml-pipeline-kubeflow-studyjob:2c2445df83fa879387a200747cc20f72a7ee9727
command: [python, /ml/launch_study_job.py]
image: liuhougangxa/katib-experiment-launcher:latest
command: [python, /ml/launch_experiment.py]
args: [
--name, {inputValue: StudyJob name},
--namespace, {inputValue: Namespace},
--optimizationtype, {inputValue: Optimization type},
--objectivevaluename, {inputValue: Objective value name},
--optimizationgoal, {inputValue: Optimization goal},
--requestcount, {inputValue: Request count},
--metricsnames, {inputValue: Metrics names},
--parameterconfigs, {inputValue: Parameter configs},
--nasConfig, {inputValue: NAS config},
--workertemplatepath, {inputValue: Worker template path},
--mcollectortemplatepath, {inputValue: Metrics collector template path},
--suggestionspec, {inputValue: Suggestion spec},
--studyjobtimeoutminutes, {inputValue: StudyJob timeout minutes},
--deleteAfterDone, {inputValue: Delete finished job},
--outputfile, {outputPath: Best parameter set},
--name, {inputValue: Experiment Name},
--namespace, {inputValue: Experiment Namespace},
--version, {inputValue: Experiment Version},
--maxTrialCount, {inputValue: Max Trial Count},
--maxFailedTrialCount, {inputValue: Max Failed Trial Count},
--parallelTrialCount, {inputValue: Parallel Trial Count},
--objectiveConfig, {inputValue: Objective},
--algorithmConfig, {inputValue: Algorithm},
--trialTemplate, {inputValue: Trial Template},
--parameters, {inputValue: Parameters},
--metricsCollector, {inputValue: Metrics Collector},
--experimentTimeoutMinutes, {inputValue: Experiment Timeout Minutes},
--deleteAfterDone, {inputValue: Delete Finished Experiment},
--outputFile, {outputPath: Best Parameter Set},
]
41 changes: 0 additions & 41 deletions components/kubeflow/katib-launcher/kubeflow_katib_launcher_op.py

This file was deleted.

114 changes: 114 additions & 0 deletions components/kubeflow/katib-launcher/sample.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
import json
import kfp.dsl as dsl

@dsl.pipeline(
    name="Launch katib experiment",
    description="An example to launch katib experiment."
)
def mnist_hpo(
        name="mnist",
        namespace="kubeflow",
        goal=0.99,
        parallelTrialCount=3,
        maxTrialCount=12,
        experimentTimeoutMinutes=60,
        deleteAfterDone=True):
    # Pipeline: run the Katib experiment launcher op, then echo its output
    # from a small bash step.

    # Objective block; `goal` comes straight from the pipeline argument.
    objective_spec = {
        "type": "maximize",
        "goal": goal,
        "objectiveMetricName": "Validation-accuracy",
        "additionalMetricNames": ["accuracy"],
    }

    # Search algorithm selection.
    algorithm_spec = {"algorithmName": "random"}

    # Search space: one entry per trainer command-line flag.
    search_space = [
        {
            "name": "--lr",
            "parameterType": "double",
            "feasibleSpace": {"min": "0.01", "max": "0.03"},
        },
        {
            "name": "--num-layers",
            "parameterType": "int",
            "feasibleSpace": {"min": "2", "max": "5"},
        },
        {
            "name": "--optimizer",
            "parameterType": "categorical",
            "feasibleSpace": {"list": ["sgd", "adam", "ftrl"]},
        },
    ]

    # Trial Job manifest. The {{...}} placeholders are left verbatim in the
    # strings; nothing in this function expands them.
    trainer_container = {
        "name": "{{.Trial}}",
        "image": "docker.io/katib/mxnet-mnist-example",
        "command": [
            "python /mxnet/example/image-classification/train_mnist.py --batch-size=64 {{- with .HyperParameters}} {{- range .}} {{.Name}}={{.Value}} {{- end}} {{- end}}"
        ],
    }
    trial_job = {
        "apiVersion": "batch/v1",
        "kind": "Job",
        "metadata": {
            "name": "{{.Trial}}",
            "namespace": "{{.NameSpace}}",
        },
        "spec": {
            "template": {
                "spec": {
                    "restartPolicy": "Never",
                    "containers": [trainer_container],
                },
            },
        },
    }
    # The manifest is embedded as a JSON string under goTemplate.rawTemplate.
    trial_template_spec = {"goTemplate": {"rawTemplate": json.dumps(trial_job)}}

    # Each structured config is handed to the launcher in its str() form.
    launcher_options = dict(
        parallelTrialCount=parallelTrialCount,
        maxTrialCount=maxTrialCount,
        objectiveConfig=str(objective_spec),
        algorithmConfig=str(algorithm_spec),
        trialTemplate=str(trial_template_spec),
        parameters=str(search_space),
        experimentTimeoutMinutes=experimentTimeoutMinutes,
        deleteAfterDone=deleteAfterDone,
    )
    launch_task = katib_experiment_launcher_op(name, namespace, **launcher_options)

    # Follow-up step that prints whatever the launcher emitted as its output.
    dsl.ContainerOp(
        name="my-out-cop",
        image="library/bash:4.4.23",
        command=["sh", "-c"],
        arguments=["echo hyperparameter: %s" % launch_task.output],
    )


def katib_experiment_launcher_op(
        name,
        namespace,
        maxTrialCount=100,
        parallelTrialCount=3,
        maxFailedTrialCount=3,
        objectiveConfig='{}',
        algorithmConfig='{}',
        metricsCollector='{}',
        trialTemplate='{}',
        parameters='[]',
        experimentTimeoutMinutes=60,
        deleteAfterDone=True,
        outputFile='/output.txt'):
    # Build the ContainerOp that runs the Katib experiment launcher image.
    # Every keyword is forwarded as a "--flag value" pair; the file written at
    # `outputFile` is exposed as the op output named 'bestHyperParameter'.
    flag_value_pairs = [
        ('--name', name),
        ('--namespace', namespace),
        ('--maxTrialCount', maxTrialCount),
        ('--maxFailedTrialCount', maxFailedTrialCount),
        ('--parallelTrialCount', parallelTrialCount),
        ('--objectiveConfig', objectiveConfig),
        ('--algorithmConfig', algorithmConfig),
        ('--metricsCollector', metricsCollector),
        ('--trialTemplate', trialTemplate),
        ('--parameters', parameters),
        ('--outputFile', outputFile),
        ('--deleteAfterDone', deleteAfterDone),
        ('--experimentTimeoutMinutes', experimentTimeoutMinutes),
    ]
    # Flatten [(flag, value), ...] into [flag, value, flag, value, ...].
    launcher_arguments = [token for pair in flag_value_pairs for token in pair]

    return dsl.ContainerOp(
        name="mnist-hpo",
        image='liuhougangxa/katib-experiment-launcher:latest',
        arguments=launcher_arguments,
        file_outputs={'bestHyperParameter': outputFile},
    )

if __name__ == "__main__":
    # Compile the pipeline into an archive written next to this script.
    from kfp import compiler

    package_path = __file__ + ".tar.gz"
    compiler.Compiler().compile(mnist_hpo, package_path)
80 changes: 80 additions & 0 deletions components/kubeflow/katib-launcher/sample2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import json
from kfp import components
import kfp.dsl as dsl

@dsl.pipeline(
    name="Launch katib experiment",
    description="An example to launch katib experiment."
)
def mnist_hpo(
        name="mnist",
        namespace="kubeflow",
        goal=0.99,
        parallelTrialCount=3,
        maxTrialCount=12,
        experimentTimeoutMinutes=60,
        deleteAfterDone=True):
    # Pipeline: load the launcher from component.yaml, run it, then echo its
    # output from a small bash step.

    # Objective block; `goal` comes straight from the pipeline argument.
    objective_spec = {
        "type": "maximize",
        "goal": goal,
        "objectiveMetricName": "Validation-accuracy",
        "additionalMetricNames": ["accuracy"],
    }

    # Search algorithm selection.
    algorithm_spec = {"algorithmName": "random"}

    # Search space: one entry per trainer command-line flag.
    search_space = [
        {
            "name": "--lr",
            "parameterType": "double",
            "feasibleSpace": {"min": "0.01", "max": "0.03"},
        },
        {
            "name": "--num-layers",
            "parameterType": "int",
            "feasibleSpace": {"min": "2", "max": "5"},
        },
        {
            "name": "--optimizer",
            "parameterType": "categorical",
            "feasibleSpace": {"list": ["sgd", "adam", "ftrl"]},
        },
    ]

    # Trial Job manifest. The {{...}} placeholders are left verbatim in the
    # strings; nothing in this function expands them.
    trainer_container = {
        "name": "{{.Trial}}",
        "image": "docker.io/katib/mxnet-mnist-example",
        "command": [
            "python /mxnet/example/image-classification/train_mnist.py --batch-size=64 {{- with .HyperParameters}} {{- range .}} {{.Name}}={{.Value}} {{- end}} {{- end}}"
        ],
    }
    trial_job = {
        "apiVersion": "batch/v1",
        "kind": "Job",
        "metadata": {
            "name": "{{.Trial}}",
            "namespace": "{{.NameSpace}}",
        },
        "spec": {
            "template": {
                "spec": {
                    "restartPolicy": "Never",
                    "containers": [trainer_container],
                },
            },
        },
    }
    # The manifest is embedded as a JSON string under goTemplate.rawTemplate.
    trial_template_spec = {"goTemplate": {"rawTemplate": json.dumps(trial_job)}}

    # Launcher factory built from the component definition next to this file.
    katib_experiment_launcher_op = components.load_component_from_file("./component.yaml")
    # katib_experiment_launcher_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/master/components/kubeflow/katib-launcher/component.yaml')

    # Each structured config is handed to the launcher in its str() form.
    launcher_inputs = dict(
        experiment_name=name,
        experiment_namespace=namespace,
        parallel_trial_count=parallelTrialCount,
        max_trial_count=maxTrialCount,
        objective=str(objective_spec),
        algorithm=str(algorithm_spec),
        trial_template=str(trial_template_spec),
        parameters=str(search_space),
        experiment_timeout_minutes=experimentTimeoutMinutes,
        delete_finished_experiment=deleteAfterDone,
    )
    launch_task = katib_experiment_launcher_op(**launcher_inputs)

    # Follow-up step that prints whatever the launcher emitted as its output.
    dsl.ContainerOp(
        name="my-out-cop",
        image="library/bash:4.4.23",
        command=["sh", "-c"],
        arguments=["echo hyperparameter: %s" % launch_task.output],
    )

if __name__ == "__main__":
    # Compile the pipeline into an archive written next to this script.
    from kfp import compiler

    package_path = __file__ + ".tar.gz"
    compiler.Compiler().compile(mnist_hpo, package_path)
2 changes: 0 additions & 2 deletions components/kubeflow/katib-launcher/src/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .kubeflow_katib_launcher_op import kubeflow_studyjob_launcher_op
49 changes: 0 additions & 49 deletions components/kubeflow/katib-launcher/src/hp.template.yaml

This file was deleted.

Loading