diff --git a/.github/workflows/publish-trial-images.yaml b/.github/workflows/publish-trial-images.yaml index 6ade7aafb7a..9b400bfa8d9 100644 --- a/.github/workflows/publish-trial-images.yaml +++ b/.github/workflows/publish-trial-images.yaml @@ -32,6 +32,8 @@ jobs: dockerfile: examples/v1beta1/trial-images/mxnet-mnist/Dockerfile - trial-name: pytorch-mnist dockerfile: examples/v1beta1/trial-images/pytorch-mnist/Dockerfile + - trial-name: tf-mnist-with-summaries + dockerfile: examples/v1beta1/trial-images/tf-mnist-with-summaries/Dockerfile - trial-name: enas-cnn-cifar10-gpu dockerfile: examples/v1beta1/trial-images/enas-cnn-cifar10/Dockerfile.gpu - trial-name: enas-cnn-cifar10-cpu diff --git a/.github/workflows/test-python.yaml b/.github/workflows/test-python.yaml index da883afa2d1..c6e4ac41b76 100644 --- a/.github/workflows/test-python.yaml +++ b/.github/workflows/test-python.yaml @@ -16,24 +16,7 @@ jobs: - name: Setup Python uses: actions/setup-python@v2 with: - python-version: 3.7 - - - name: Install Packages - run: | - pip install -r test/unit/v1beta1/requirements.txt - - pip install -r cmd/suggestion/chocolate/v1beta1/requirements.txt - pip install -r cmd/suggestion/hyperopt/v1beta1/requirements.txt - pip install -r cmd/suggestion/skopt/v1beta1/requirements.txt - pip install -r cmd/suggestion/optuna/v1beta1/requirements.txt - pip install -r cmd/suggestion/nas/enas/v1beta1/requirements.txt - pip install -r cmd/suggestion/hyperband/v1beta1/requirements.txt - pip install -r cmd/suggestion/nas/darts/v1beta1/requirements.txt - - pip install -r cmd/earlystopping/medianstop/v1beta1/requirements.txt + python-version: 3.9 - name: Run Python test - run: | - export PYTHONPATH=$(pwd):$(pwd)/pkg/apis/manager/v1beta1/python:$(pwd)/pkg/apis/manager/health/python - pytest ./test/unit/v1beta1/suggestion - pytest ./test/unit/v1beta1/earlystopping + run: make pytest diff --git a/.gitignore b/.gitignore index d90d0d215c8..6241a98bb08 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ __pycache__/ *.egg-info build/ *.charm +test/unit/v1beta1/metricscollector/testdata # SDK generator JAR file hack/gen-python-sdk/openapi-generator-cli.jar diff --git a/Makefile b/Makefile index 3cf60071b70..315aa1b23e7 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,12 @@ HAS_LINT := $(shell command -v golangci-lint;) COMMIT := v1beta1-$(shell git rev-parse --short=7 HEAD) KATIB_REGISTRY := docker.io/kubeflowkatib +CPU_ARCH ?= amd64 + +# for pytest +PYTHONPATH := $(PYTHONPATH):$(CURDIR)/pkg/apis/manager/v1beta1/python:$(CURDIR)/pkg/apis/manager/health/python +PYTHONPATH := $(PYTHONPATH):$(CURDIR)/pkg/metricscollector/v1beta1/common:$(CURDIR)/pkg/metricscollector/v1beta1/tfevent-metricscollector +TEST_TENSORFLOW_EVENT_FILE_PATH ?= $(CURDIR)/test/unit/v1beta1/metricscollector/testdata/tfevent-metricscollector/logs # Run tests .PHONY: test @@ -49,10 +55,10 @@ endif # Build images for the Katib v1beta1 components. build: generate -ifeq ($(and $(REGISTRY),$(TAG)),) - $(error REGISTRY and TAG must be set. Usage: make build REGISTRY= TAG=) +ifeq ($(and $(REGISTRY),$(TAG),$(CPU_ARCH)),) + $(error REGISTRY and TAG must be set. Usage: make build REGISTRY= TAG= CPU_ARCH=) endif - bash scripts/v1beta1/build.sh $(REGISTRY) $(TAG) + bash scripts/v1beta1/build.sh $(REGISTRY) $(TAG) $(CPU_ARCH) # Build and push Katib images from the latest master commit. push-latest: generate @@ -94,3 +100,25 @@ prettier-check: # Update boilerplate for the source code. 
update-boilerplate: ./hack/boilerplate/update-boilerplate.sh + +prepare-pytest: + pip install -r test/unit/v1beta1/requirements.txt + pip install -r cmd/suggestion/chocolate/v1beta1/requirements.txt + pip install -r cmd/suggestion/hyperopt/v1beta1/requirements.txt + pip install -r cmd/suggestion/skopt/v1beta1/requirements.txt + pip install -r cmd/suggestion/optuna/v1beta1/requirements.txt + pip install -r cmd/suggestion/hyperband/v1beta1/requirements.txt + pip install -r cmd/suggestion/nas/enas/v1beta1/requirements.txt + pip install -r cmd/suggestion/nas/darts/v1beta1/requirements.txt + pip install -r cmd/earlystopping/medianstop/v1beta1/requirements.txt + pip install -r cmd/metricscollector/v1beta1/tfevent-metricscollector/requirements.txt + +prepare-pytest-testdata: +ifeq ("$(wildcard $(TEST_TENSORFLOW_EVENT_FILE_PATH))", "") + python examples/v1beta1/trial-images/tf-mnist-with-summaries/mnist.py --epochs 5 --batch-size 200 --log-path $(TEST_TENSORFLOW_EVENT_FILE_PATH) +endif + +pytest: prepare-pytest prepare-pytest-testdata + PYTHONPATH=$(PYTHONPATH) pytest ./test/unit/v1beta1/suggestion + PYTHONPATH=$(PYTHONPATH) pytest ./test/unit/v1beta1/earlystopping + PYTHONPATH=$(PYTHONPATH) pytest ./test/unit/v1beta1/metricscollector diff --git a/cmd/earlystopping/medianstop/v1beta1/Dockerfile b/cmd/earlystopping/medianstop/v1beta1/Dockerfile index 0054713f12c..6e661aaecf2 100644 --- a/cmd/earlystopping/medianstop/v1beta1/Dockerfile +++ b/cmd/earlystopping/medianstop/v1beta1/Dockerfile @@ -1,12 +1,13 @@ -FROM python:3.6 +FROM python:3.9 ENV TARGET_DIR /opt/katib ENV EARLY_STOPPING_DIR cmd/earlystopping/medianstop/v1beta1 RUN if [ "$(uname -m)" = "ppc64le" ] || [ "$(uname -m)" = "aarch64" ]; then \ - apt-get -y update && \ - apt-get -y install gfortran libopenblas-dev liblapack-dev && \ - pip install cython; \ + apt-get -y update && \ + apt-get -y install gfortran libopenblas-dev liblapack-dev && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/*; \ fi ADD ./pkg/ ${TARGET_DIR}/pkg/ diff --git a/cmd/earlystopping/medianstop/v1beta1/requirements.txt b/cmd/earlystopping/medianstop/v1beta1/requirements.txt index 2f85202c257..78475d4f80d 100644 --- a/cmd/earlystopping/medianstop/v1beta1/requirements.txt +++ b/cmd/earlystopping/medianstop/v1beta1/requirements.txt @@ -1,4 +1,5 @@ -grpcio==1.23.0 -protobuf==3.9.1 +grpcio==1.41.1 +protobuf==3.19.1 googleapis-common-protos==1.6.0 kubernetes==11.0.0 +cython>=0.29.24 diff --git a/cmd/metricscollector/v1beta1/tfevent-metricscollector/Dockerfile b/cmd/metricscollector/v1beta1/tfevent-metricscollector/Dockerfile index b5d3c807d23..f94e7be7ca8 100644 --- a/cmd/metricscollector/v1beta1/tfevent-metricscollector/Dockerfile +++ b/cmd/metricscollector/v1beta1/tfevent-metricscollector/Dockerfile @@ -1,7 +1,25 @@ -FROM tensorflow/tensorflow:1.11.0 -RUN pip install rfc3339 grpcio googleapis-common-protos -ADD . 
/usr/src/app/github.com/kubeflow/katib
-WORKDIR /usr/src/app/github.com/kubeflow/katib/cmd/metricscollector/v1beta1/tfevent-metricscollector/
+FROM python:3.9
+
+ENV TARGET_DIR /opt/katib
+ENV METRICS_COLLECTOR_DIR cmd/metricscollector/v1beta1/tfevent-metricscollector
+# tensorflow community build for aarch64
+# https://github.com/tensorflow/build#tensorflow-builds
+ENV PIP_EXTRA_INDEX_URL https://snapshots.linaro.org/ldcg/python-cache/
+
+ADD ./pkg/ ${TARGET_DIR}/pkg/
+ADD ./${METRICS_COLLECTOR_DIR}/ ${TARGET_DIR}/${METRICS_COLLECTOR_DIR}/
+WORKDIR ${TARGET_DIR}/${METRICS_COLLECTOR_DIR}
+
+RUN if [ "$(uname -m)" = "aarch64" ]; then \
+        pip install tensorflow-aarch64==2.7.0; \
+    else \
+        pip install tensorflow==2.7.0; \
+    fi;
 RUN pip install --no-cache-dir -r requirements.txt
-ENV PYTHONPATH /usr/src/app/github.com/kubeflow/katib:/usr/src/app/github.com/kubeflow/katib/pkg/apis/manager/v1beta1/python:/usr/src/app/github.com/kubeflow/katib/pkg/metricscollector/v1beta1/tfevent-metricscollector/:/usr/src/app/github.com/kubeflow/katib/pkg/metricscollector/v1beta1/common/
+
+RUN chgrp -R 0 ${TARGET_DIR} \
+    && chmod -R g+rwX ${TARGET_DIR}
+
+ENV PYTHONPATH ${TARGET_DIR}:${TARGET_DIR}/pkg/apis/manager/v1beta1/python:${TARGET_DIR}/pkg/metricscollector/v1beta1/tfevent-metricscollector/:${TARGET_DIR}/pkg/metricscollector/v1beta1/common/
+
 ENTRYPOINT ["python", "main.py"]
diff --git a/cmd/metricscollector/v1beta1/tfevent-metricscollector/Dockerfile.aarch64 b/cmd/metricscollector/v1beta1/tfevent-metricscollector/Dockerfile.aarch64
deleted file mode 100644
index 44746a40e0b..00000000000
--- a/cmd/metricscollector/v1beta1/tfevent-metricscollector/Dockerfile.aarch64
+++ /dev/null
@@ -1,28 +0,0 @@
-FROM ubuntu:18.04
-
-RUN apt-get update \
-    && apt-get -y install software-properties-common \
-    autoconf \
-    automake \
-    build-essential \
-    cmake \
-    pkg-config \
-    wget \
-    python-pip \
-    libhdf5-dev \
-    libhdf5-serial-dev \
-    hdf5-tools\
-    && apt-get clean \
-    && rm -rf /var/lib/apt/lists/*
-
-RUN wget https://github.com/lhelontra/tensorflow-on-arm/releases/download/v1.11.0/tensorflow-1.11.0-cp27-none-linux_aarch64.whl \
-    && pip install tensorflow-1.11.0-cp27-none-linux_aarch64.whl \
-    && rm tensorflow-1.11.0-cp27-none-linux_aarch64.whl \
-    && rm -rf .cache
-
-RUN pip install rfc3339 grpcio googleapis-common-protos jupyter
-ADD . /usr/src/app/github.com/kubeflow/katib
-WORKDIR /usr/src/app/github.com/kubeflow/katib/cmd/metricscollector/v1beta1/tfevent-metricscollector/
-RUN pip install --no-cache-dir -r requirements.txt
-ENV PYTHONPATH /usr/src/app/github.com/kubeflow/katib:/usr/src/app/github.com/kubeflow/katib/pkg/apis/manager/v1beta1/python:/usr/src/app/github.com/kubeflow/katib/pkg/metricscollector/v1beta1/tfevent-metricscollector/:/usr/src/app/github.com/kubeflow/katib/pkg/metricscollector/v1beta1/common/
-ENTRYPOINT ["python", "main.py"]
diff --git a/cmd/metricscollector/v1beta1/tfevent-metricscollector/Dockerfile.ppc64le b/cmd/metricscollector/v1beta1/tfevent-metricscollector/Dockerfile.ppc64le
index b8d2b637607..fbc819dce37
--- a/cmd/metricscollector/v1beta1/tfevent-metricscollector/Dockerfile.ppc64le
+++ b/cmd/metricscollector/v1beta1/tfevent-metricscollector/Dockerfile.ppc64le
@@ -1,5 +1,4 @@
-FROM ibmcom/tensorflow-ppc64le:1.14.0-py3
-RUN pip install rfc3339 grpcio googleapis-common-protos
+FROM ibmcom/tensorflow-ppc64le:2.2.0-py3
 ADD .
/usr/src/app/github.com/kubeflow/katib WORKDIR /usr/src/app/github.com/kubeflow/katib/cmd/metricscollector/v1beta1/tfevent-metricscollector/ RUN pip install --no-cache-dir -r requirements.txt diff --git a/cmd/metricscollector/v1beta1/tfevent-metricscollector/requirements.txt b/cmd/metricscollector/v1beta1/tfevent-metricscollector/requirements.txt index d2ec0c34de0..ab8a014d528 100644 --- a/cmd/metricscollector/v1beta1/tfevent-metricscollector/requirements.txt +++ b/cmd/metricscollector/v1beta1/tfevent-metricscollector/requirements.txt @@ -1 +1,4 @@ -psutil==5.6.6 +psutil==5.8.0 +rfc3339>=6.2 +grpcio==1.41.1 +googleapis-common-protos==1.6.0 diff --git a/cmd/suggestion/chocolate/v1beta1/Dockerfile b/cmd/suggestion/chocolate/v1beta1/Dockerfile index 52bb736fd24..407f8a6852b 100644 --- a/cmd/suggestion/chocolate/v1beta1/Dockerfile +++ b/cmd/suggestion/chocolate/v1beta1/Dockerfile @@ -1,15 +1,16 @@ -FROM python:3.6 +FROM python:3.9 ENV TARGET_DIR /opt/katib ENV SUGGESTION_DIR cmd/suggestion/chocolate/v1beta1 +ENV GRPC_HEALTH_PROBE_VERSION v0.3.1 RUN if [ "$(uname -m)" = "ppc64le" ] || [ "$(uname -m)" = "aarch64" ]; then \ apt-get -y update && \ apt-get -y install gfortran libopenblas-dev liblapack-dev && \ - pip install cython 'numpy>=1.13.3'; \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/*; \ fi -RUN GRPC_HEALTH_PROBE_VERSION=v0.3.1 && \ - if [ "$(uname -m)" = "ppc64le" ]; then \ +RUN if [ "$(uname -m)" = "ppc64le" ]; then \ wget -qO/bin/grpc_health_probe https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/${GRPC_HEALTH_PROBE_VERSION}/grpc_health_probe-linux-ppc64le; \ elif [ "$(uname -m)" = "aarch64" ]; then \ wget -qO/bin/grpc_health_probe https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/${GRPC_HEALTH_PROBE_VERSION}/grpc_health_probe-linux-arm64; \ @@ -21,6 +22,9 @@ RUN GRPC_HEALTH_PROBE_VERSION=v0.3.1 && \ ADD ./pkg/ ${TARGET_DIR}/pkg/ ADD ./${SUGGESTION_DIR}/ ${TARGET_DIR}/${SUGGESTION_DIR}/ WORKDIR ${TARGET_DIR}/${SUGGESTION_DIR} +RUN if [ "$(uname -m)" = "aarch64" ]; then \ + sed -i -e '$a git+https://github.com/fmder/ghalton@master' -e '/^ghalton/d' requirements.txt; \ + fi; RUN pip install --no-cache-dir -r requirements.txt RUN chgrp -R 0 ${TARGET_DIR} \ diff --git a/cmd/suggestion/chocolate/v1beta1/requirements.txt b/cmd/suggestion/chocolate/v1beta1/requirements.txt index 1b72ac01900..8466aa7d7c4 100644 --- a/cmd/suggestion/chocolate/v1beta1/requirements.txt +++ b/cmd/suggestion/chocolate/v1beta1/requirements.txt @@ -1,11 +1,12 @@ -grpcio==1.23.0 +grpcio==1.41.1 cloudpickle==0.5.6 -numpy>=1.13.3 -scikit-learn>=0.19.0 -scipy>=0.19.1 +numpy>=1.20.0 +scikit-learn>=0.24.0 +scipy>=1.5.4 forestci==0.3 -protobuf==3.9.1 +protobuf==3.19.1 googleapis-common-protos==1.6.0 -SQLAlchemy==1.3.8 +SQLAlchemy==1.4.26 git+https://github.com/AIworx-Labs/chocolate@master -ghalton>=0.6 +ghalton>=0.6.2 +cython>=0.29.24 diff --git a/cmd/suggestion/goptuna/v1beta1/Dockerfile b/cmd/suggestion/goptuna/v1beta1/Dockerfile index aad3a699205..5f3040622ec 100644 --- a/cmd/suggestion/goptuna/v1beta1/Dockerfile +++ b/cmd/suggestion/goptuna/v1beta1/Dockerfile @@ -1,6 +1,8 @@ # Build the Goptuna Suggestion. FROM golang:alpine AS build-env +ENV GRPC_HEALTH_PROBE_VERSION v0.3.1 + WORKDIR /go/src/github.com/kubeflow/katib # Download packages. @@ -22,8 +24,7 @@ RUN if [ "$(uname -m)" = "ppc64le" ]; then \ fi # Add GRPC health probe. 
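+# The probe version now comes from the GRPC_HEALTH_PROBE_VERSION env var set
+# at the top of this Dockerfile, so each architecture branch no longer
+# redefines it inline in the RUN instruction below.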
-RUN GRPC_HEALTH_PROBE_VERSION=v0.3.1 && \ - if [ "$(uname -m)" = "ppc64le" ]; then \ +RUN if [ "$(uname -m)" = "ppc64le" ]; then \ wget -qO/bin/grpc_health_probe https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/${GRPC_HEALTH_PROBE_VERSION}/grpc_health_probe-linux-ppc64le; \ elif [ "$(uname -m)" = "aarch64" ]; then \ wget -qO/bin/grpc_health_probe https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/${GRPC_HEALTH_PROBE_VERSION}/grpc_health_probe-linux-arm64; \ diff --git a/cmd/suggestion/hyperband/v1beta1/Dockerfile b/cmd/suggestion/hyperband/v1beta1/Dockerfile index 58f92d842fd..c1c1991044f 100644 --- a/cmd/suggestion/hyperband/v1beta1/Dockerfile +++ b/cmd/suggestion/hyperband/v1beta1/Dockerfile @@ -1,16 +1,17 @@ -FROM python:3.6 +FROM python:3.9 ENV TARGET_DIR /opt/katib ENV SUGGESTION_DIR cmd/suggestion/hyperband/v1beta1 +ENV GRPC_HEALTH_PROBE_VERSION v0.3.1 RUN if [ "$(uname -m)" = "ppc64le" ] || [ "$(uname -m)" = "aarch64" ]; then \ apt-get -y update && \ apt-get -y install gfortran libopenblas-dev liblapack-dev && \ - pip install cython; \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/*; \ fi -RUN GRPC_HEALTH_PROBE_VERSION=v0.3.1 && \ - if [ "$(uname -m)" = "ppc64le" ]; then \ +RUN if [ "$(uname -m)" = "ppc64le" ]; then \ wget -qO/bin/grpc_health_probe https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/${GRPC_HEALTH_PROBE_VERSION}/grpc_health_probe-linux-ppc64le; \ elif [ "$(uname -m)" = "aarch64" ]; then \ wget -qO/bin/grpc_health_probe https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/${GRPC_HEALTH_PROBE_VERSION}/grpc_health_probe-linux-arm64; \ diff --git a/cmd/suggestion/hyperband/v1beta1/requirements.txt b/cmd/suggestion/hyperband/v1beta1/requirements.txt index 6677b67f90e..0b0b7450685 100644 --- a/cmd/suggestion/hyperband/v1beta1/requirements.txt +++ b/cmd/suggestion/hyperband/v1beta1/requirements.txt @@ -1,8 +1,9 @@ -grpcio==1.23.0 +grpcio==1.41.1 cloudpickle==0.5.6 -numpy>=1.13.3 -scikit-learn>=0.19.0 -scipy>=0.19.1 +numpy>=1.20.0 +scikit-learn>=0.24.0 +scipy>=1.5.4 forestci==0.3 -protobuf==3.9.1 +protobuf==3.19.1 googleapis-common-protos==1.6.0 +cython>=0.29.24 diff --git a/cmd/suggestion/hyperopt/v1beta1/Dockerfile b/cmd/suggestion/hyperopt/v1beta1/Dockerfile index 2c1d227b160..e23d58d7538 100644 --- a/cmd/suggestion/hyperopt/v1beta1/Dockerfile +++ b/cmd/suggestion/hyperopt/v1beta1/Dockerfile @@ -1,16 +1,17 @@ -FROM python:3.6 +FROM python:3.9 ENV TARGET_DIR /opt/katib ENV SUGGESTION_DIR cmd/suggestion/hyperopt/v1beta1 +ENV GRPC_HEALTH_PROBE_VERSION v0.3.1 RUN if [ "$(uname -m)" = "ppc64le" ] || [ "$(uname -m)" = "aarch64" ]; then \ apt-get -y update && \ apt-get -y install gfortran libopenblas-dev liblapack-dev && \ - pip install cython; \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/*; \ fi -RUN GRPC_HEALTH_PROBE_VERSION=v0.3.1 && \ - if [ "$(uname -m)" = "ppc64le" ]; then \ +RUN if [ "$(uname -m)" = "ppc64le" ]; then \ wget -qO/bin/grpc_health_probe https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/${GRPC_HEALTH_PROBE_VERSION}/grpc_health_probe-linux-ppc64le; \ elif [ "$(uname -m)" = "aarch64" ]; then \ wget -qO/bin/grpc_health_probe https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/${GRPC_HEALTH_PROBE_VERSION}/grpc_health_probe-linux-arm64; \ diff --git a/cmd/suggestion/hyperopt/v1beta1/requirements.txt b/cmd/suggestion/hyperopt/v1beta1/requirements.txt index a0a8fb20e4c..f34047827b5 100644 --- a/cmd/suggestion/hyperopt/v1beta1/requirements.txt +++ 
b/cmd/suggestion/hyperopt/v1beta1/requirements.txt @@ -1,9 +1,10 @@ -grpcio==1.23.0 +grpcio==1.41.1 cloudpickle==0.5.6 -numpy>=1.13.3 -scikit-learn>=0.19.0 -scipy>=0.19.1 +numpy>=1.20.0 +scikit-learn>=0.24.0 +scipy>=1.5.4 forestci==0.3 -protobuf==3.9.1 +protobuf==3.19.1 googleapis-common-protos==1.6.0 -hyperopt==0.2.3 +hyperopt==0.2.5 +cython>=0.29.24 diff --git a/cmd/suggestion/nas/darts/v1beta1/Dockerfile b/cmd/suggestion/nas/darts/v1beta1/Dockerfile index d95a12dad0b..318fc4af86e 100644 --- a/cmd/suggestion/nas/darts/v1beta1/Dockerfile +++ b/cmd/suggestion/nas/darts/v1beta1/Dockerfile @@ -1,16 +1,17 @@ -FROM python:3.6 +FROM python:3.9 ENV TARGET_DIR /opt/katib ENV SUGGESTION_DIR cmd/suggestion/nas/darts/v1beta1 +ENV GRPC_HEALTH_PROBE_VERSION v0.3.1 RUN if [ "$(uname -m)" = "ppc64le" ] || [ "$(uname -m)" = "aarch64" ]; then \ apt-get -y update && \ apt-get -y install gfortran libopenblas-dev liblapack-dev && \ - pip install cython; \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/*; \ fi -RUN GRPC_HEALTH_PROBE_VERSION=v0.3.1 && \ - if [ "$(uname -m)" = "ppc64le" ]; then \ +RUN if [ "$(uname -m)" = "ppc64le" ]; then \ wget -qO/bin/grpc_health_probe https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/${GRPC_HEALTH_PROBE_VERSION}/grpc_health_probe-linux-ppc64le; \ elif [ "$(uname -m)" = "aarch64" ]; then \ wget -qO/bin/grpc_health_probe https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/${GRPC_HEALTH_PROBE_VERSION}/grpc_health_probe-linux-arm64; \ @@ -30,4 +31,3 @@ RUN chgrp -R 0 ${TARGET_DIR} \ ENV PYTHONPATH ${TARGET_DIR}:${TARGET_DIR}/pkg/apis/manager/v1beta1/python:${TARGET_DIR}/pkg/apis/manager/health/python ENTRYPOINT ["python", "main.py"] - diff --git a/cmd/suggestion/nas/darts/v1beta1/requirements.txt b/cmd/suggestion/nas/darts/v1beta1/requirements.txt index 92bd5706e11..f5b413a47d4 100644 --- a/cmd/suggestion/nas/darts/v1beta1/requirements.txt +++ b/cmd/suggestion/nas/darts/v1beta1/requirements.txt @@ -1,3 +1,4 @@ -grpcio==1.23.0 -protobuf==3.9.1 +grpcio==1.41.1 +protobuf==3.19.1 googleapis-common-protos==1.6.0 +cython>=0.29.24 diff --git a/cmd/suggestion/nas/enas/v1beta1/Dockerfile b/cmd/suggestion/nas/enas/v1beta1/Dockerfile index c5a77c87091..2584138f766 100644 --- a/cmd/suggestion/nas/enas/v1beta1/Dockerfile +++ b/cmd/suggestion/nas/enas/v1beta1/Dockerfile @@ -1,16 +1,23 @@ -FROM python:3.6 +FROM python:3.9 ENV TARGET_DIR /opt/katib ENV SUGGESTION_DIR cmd/suggestion/nas/enas/v1beta1 +ENV GRPC_HEALTH_PROBE_VERSION v0.3.1 +# tensorflow community build for aarch64 +# https://github.com/tensorflow/build#tensorflow-builds +ENV PIP_EXTRA_INDEX_URL https://snapshots.linaro.org/ldcg/python-cache/ -RUN if [ "$(uname -m)" = "ppc64le" ]; then \ +RUN if [ "$(uname -m)" = "ppc64le" ] || [ "$(uname -m)" = "aarch64" ]; then \ apt-get -y update && \ apt-get -y install gfortran libopenblas-dev liblapack-dev && \ - pip install cython; \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/*; \ fi -RUN GRPC_HEALTH_PROBE_VERSION=v0.3.1 && \ - if [ "$(uname -m)" = "ppc64le" ]; then \ + +RUN if [ "$(uname -m)" = "ppc64le" ]; then \ wget -qO/bin/grpc_health_probe https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/${GRPC_HEALTH_PROBE_VERSION}/grpc_health_probe-linux-ppc64le; \ + elif [ "$(uname -m)" = "aarch64" ]; then \ + wget -qO/bin/grpc_health_probe https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/${GRPC_HEALTH_PROBE_VERSION}/grpc_health_probe-linux-arm64; \ else \ wget -qO/bin/grpc_health_probe 
https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/${GRPC_HEALTH_PROBE_VERSION}/grpc_health_probe-linux-amd64; \ fi && \ @@ -19,6 +26,10 @@ RUN GRPC_HEALTH_PROBE_VERSION=v0.3.1 && \ ADD ./pkg/ ${TARGET_DIR}/pkg/ ADD ./${SUGGESTION_DIR}/ ${TARGET_DIR}/${SUGGESTION_DIR}/ WORKDIR ${TARGET_DIR}/${SUGGESTION_DIR} + +RUN if [ "$(uname -m)" = "aarch64" ]; then \ + sed -i 's/tensorflow==/tensorflow-aarch64==/' requirements.txt; \ + fi; RUN pip install --no-cache-dir -r requirements.txt RUN chgrp -R 0 ${TARGET_DIR} \ diff --git a/cmd/suggestion/nas/enas/v1beta1/Dockerfile.aarch64 b/cmd/suggestion/nas/enas/v1beta1/Dockerfile.aarch64 deleted file mode 100644 index 045bc1a1c8e..00000000000 --- a/cmd/suggestion/nas/enas/v1beta1/Dockerfile.aarch64 +++ /dev/null @@ -1,58 +0,0 @@ -FROM golang:alpine AS build-env -# The GOPATH in the image is /go. -ADD . /go/src/github.com/kubeflow/katib -RUN if [ "$(uname -m)" = "ppc64le" ] || [ "$(uname -m)" = "aarch64" ]; then \ - apk --update add git gcc musl-dev && \ - go get github.com/grpc-ecosystem/grpc-health-probe && \ - mv $GOPATH/bin/grpc-health-probe /bin/grpc_health_probe && \ - chmod +x /bin/grpc_health_probe; \ - else \ - GRPC_HEALTH_PROBE_VERSION=v0.3.1 && \ - wget -qO/bin/grpc_health_probe https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/${GRPC_HEALTH_PROBE_VERSION}/grpc_health_probe-linux-amd64 && \ - chmod +x /bin/grpc_health_probe; \ - fi - -FROM python:3.7-slim-buster - -ENV TARGET_DIR /opt/katib -ENV SUGGESTION_DIR cmd/suggestion/nas/enas/v1beta1 - -RUN apt-get update \ - && apt-get -y install software-properties-common \ - autoconf \ - automake \ - build-essential \ - cmake \ - libtool \ - pkg-config \ - wget \ - gfortran \ - libopenblas-dev \ - liblapack-dev \ - libhdf5-dev \ - libhdf5-serial-dev \ - hdf5-tools \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/* - -RUN pip install cython numpy - -RUN wget https://github.com/lhelontra/tensorflow-on-arm/releases/download/v1.14.0-buster/tensorflow-1.14.0-cp37-none-linux_aarch64.whl \ - && pip install tensorflow-1.14.0-cp37-none-linux_aarch64.whl \ - && rm tensorflow-1.14.0-cp37-none-linux_aarch64.whl \ - && rm -rf .cache - -RUN pip install 'grpcio==1.23.0' 'protobuf==3.9.1' 'googleapis-common-protos==1.6.0' - -COPY --from=build-env /bin/grpc_health_probe /bin/ - -ADD ./pkg/ ${TARGET_DIR}/pkg/ -ADD ./${SUGGESTION_DIR}/ ${TARGET_DIR}/${SUGGESTION_DIR}/ -WORKDIR ${TARGET_DIR}/${SUGGESTION_DIR} - -RUN chgrp -R 0 ${TARGET_DIR} \ - && chmod -R g+rwX ${TARGET_DIR} - -ENV PYTHONPATH ${TARGET_DIR}:${TARGET_DIR}/pkg/apis/manager/v1beta1/python:${TARGET_DIR}/pkg/apis/manager/health/python - -ENTRYPOINT ["python", "main.py"] diff --git a/cmd/suggestion/nas/enas/v1beta1/requirements.txt b/cmd/suggestion/nas/enas/v1beta1/requirements.txt index bde25645c41..87d839b0ece 100644 --- a/cmd/suggestion/nas/enas/v1beta1/requirements.txt +++ b/cmd/suggestion/nas/enas/v1beta1/requirements.txt @@ -1,4 +1,5 @@ -grpcio==1.23.0 -protobuf==3.9.1 +grpcio==1.41.1 +protobuf==3.19.1 googleapis-common-protos==1.6.0 -tensorflow==1.15.4 +tensorflow==2.7.0 +cython>=0.29.24 diff --git a/cmd/suggestion/optuna/v1beta1/Dockerfile b/cmd/suggestion/optuna/v1beta1/Dockerfile index bd7e43ecaf2..274bc94a6d6 100644 --- a/cmd/suggestion/optuna/v1beta1/Dockerfile +++ b/cmd/suggestion/optuna/v1beta1/Dockerfile @@ -2,14 +2,15 @@ FROM python:3.9 ENV TARGET_DIR /opt/katib ENV SUGGESTION_DIR cmd/suggestion/optuna/v1beta1 +ENV GRPC_HEALTH_PROBE_VERSION v0.3.1 RUN if [ "$(uname -m)" = "ppc64le" ] || [ "$(uname 
-m)" = "aarch64" ]; then \ apt-get -y update && \ - apt-get -y install gfortran libopenblas-dev liblapack-dev; \ + apt-get -y install gfortran libopenblas-dev liblapack-dev && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/*; \ fi - -RUN GRPC_HEALTH_PROBE_VERSION=v0.3.1 && \ - if [ "$(uname -m)" = "ppc64le" ]; then \ +RUN if [ "$(uname -m)" = "ppc64le" ]; then \ wget -qO/bin/grpc_health_probe https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/${GRPC_HEALTH_PROBE_VERSION}/grpc_health_probe-linux-ppc64le; \ elif [ "$(uname -m)" = "aarch64" ]; then \ wget -qO/bin/grpc_health_probe https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/${GRPC_HEALTH_PROBE_VERSION}/grpc_health_probe-linux-arm64; \ diff --git a/cmd/suggestion/optuna/v1beta1/requirements.txt b/cmd/suggestion/optuna/v1beta1/requirements.txt index 09b0692fa06..5e35741485d 100644 --- a/cmd/suggestion/optuna/v1beta1/requirements.txt +++ b/cmd/suggestion/optuna/v1beta1/requirements.txt @@ -1,4 +1,4 @@ -grpcio==1.39.0 -protobuf==3.17.3 +grpcio==1.41.1 +protobuf==3.19.1 googleapis-common-protos==1.53.0 -optuna>=2.8.0 \ No newline at end of file +optuna>=2.8.0 diff --git a/cmd/suggestion/skopt/v1beta1/Dockerfile b/cmd/suggestion/skopt/v1beta1/Dockerfile index 2962715e53b..b71347ca3a7 100644 --- a/cmd/suggestion/skopt/v1beta1/Dockerfile +++ b/cmd/suggestion/skopt/v1beta1/Dockerfile @@ -1,16 +1,16 @@ -FROM python:3.6 +FROM python:3.9 ENV TARGET_DIR /opt/katib ENV SUGGESTION_DIR cmd/suggestion/skopt/v1beta1 +ENV GRPC_HEALTH_PROBE_VERSION v0.3.1 RUN if [ "$(uname -m)" = "ppc64le" ] || [ "$(uname -m)" = "aarch64" ]; then \ apt-get -y update && \ apt-get -y install gfortran libopenblas-dev liblapack-dev && \ - pip install cython; \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/*; \ fi - -RUN GRPC_HEALTH_PROBE_VERSION=v0.3.1 && \ - if [ "$(uname -m)" = "ppc64le" ]; then \ +RUN if [ "$(uname -m)" = "ppc64le" ]; then \ wget -qO/bin/grpc_health_probe https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/${GRPC_HEALTH_PROBE_VERSION}/grpc_health_probe-linux-ppc64le; \ elif [ "$(uname -m)" = "aarch64" ]; then \ wget -qO/bin/grpc_health_probe https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/${GRPC_HEALTH_PROBE_VERSION}/grpc_health_probe-linux-arm64; \ diff --git a/cmd/suggestion/skopt/v1beta1/requirements.txt b/cmd/suggestion/skopt/v1beta1/requirements.txt index 3734706b97d..2cd9502447d 100644 --- a/cmd/suggestion/skopt/v1beta1/requirements.txt +++ b/cmd/suggestion/skopt/v1beta1/requirements.txt @@ -1,9 +1,10 @@ -grpcio==1.23.0 +grpcio==1.41.1 cloudpickle==0.5.6 -numpy>=1.13.3 -scikit-learn==0.22.0 -scipy>=0.19.1 +numpy>=1.20.0 +scikit-learn>=0.24.0 +scipy>=1.5.4 forestci==0.3 -protobuf==3.9.1 +protobuf==3.19.1 googleapis-common-protos==1.6.0 -scikit-optimize==0.5.2 +scikit-optimize>=0.9.0 +cython>=0.29.24 diff --git a/docs/developer-guide.md b/docs/developer-guide.md index 052fee5e79f..c5d8a375bc3 100644 --- a/docs/developer-guide.md +++ b/docs/developer-guide.md @@ -13,9 +13,9 @@ see the following user guides: ## Requirements - [Go](https://golang.org/) (1.17 or later) -- [Docker](https://docs.docker.com/) (17.05 or later) +- [Docker](https://docs.docker.com/) (20.10 or later) - [Java](https://docs.oracle.com/javase/8/docs/technotes/guides/install/install_overview.html) (8 or later) -- [Python](https://www.python.org/) (3.7 or later) +- [Python](https://www.python.org/) (3.9 or later) - [kustomize](https://kustomize.io/) (4.0.5 or later) ## Build from source code diff 
--git a/docs/images-location.md b/docs/images-location.md index f96e8ffc1b5..d6c9eb82ea8 100644 --- a/docs/images-location.md +++ b/docs/images-location.md @@ -284,13 +284,13 @@ The following table shows images for training containers which are used in the - gcr.io/kubeflow-ci/tf-mnist-with-summaries + docker.io/kubeflowkatib/tf-mnist-with-summaries Tensorflow MNIST example with saving metrics in the summaries - Dockerfile + Dockerfile diff --git a/examples/v1beta1/README.md b/examples/v1beta1/README.md index 2b3a6804f45..11e3508efa3 100644 --- a/examples/v1beta1/README.md +++ b/examples/v1beta1/README.md @@ -100,6 +100,8 @@ Check the following examples: Check the following images for the Trial containers: +- [Tensorflow MNIST with summaries](./trial-images/tf-mnist-with-summaries) + - [MXNet MNIST](./trial-images/mxnet-mnist) - [PyTorch MNIST](./trial-images/pytorch-mnist) diff --git a/examples/v1beta1/kubeflow-training-operator/tfjob-mnist-with-summaries.yaml b/examples/v1beta1/kubeflow-training-operator/tfjob-mnist-with-summaries.yaml index f68668da657..90c3dc81a2e 100644 --- a/examples/v1beta1/kubeflow-training-operator/tfjob-mnist-with-summaries.yaml +++ b/examples/v1beta1/kubeflow-training-operator/tfjob-mnist-with-summaries.yaml @@ -10,13 +10,13 @@ spec: objective: type: maximize goal: 0.99 - objectiveMetricName: accuracy_1 + objectiveMetricName: accuracy algorithm: algorithmName: random metricsCollectorSpec: source: fileSystemPath: - path: /train + path: /mnist-with-summaries-logs/test kind: Directory collector: kind: TensorFlowEvent @@ -29,8 +29,8 @@ spec: - name: batch_size parameterType: int feasibleSpace: - min: "100" - max: "200" + min: "10" + max: "20" trialTemplate: primaryContainerName: tensorflow trialParameters: @@ -52,10 +52,11 @@ spec: spec: containers: - name: tensorflow - image: gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0 + image: docker.io/kubeflowkatib/tf-mnist-with-summaries:latest command: - "python" - - "/var/tf_mnist/mnist_with_summaries.py" - - "--log_dir=/train/metrics" - - "--learning_rate=${trialParameters.learningRate}" - - "--batch_size=${trialParameters.batchSize}" + - "/opt/tf-mnist-with-summaries/mnist.py" + - "--epochs=1" + - "--learning-rate=${trialParameters.learningRate}" + - "--batch-size=${trialParameters.batchSize}" + - "--log-path=/mnist-with-summaries-logs" diff --git a/examples/v1beta1/trial-images/enas-cnn-cifar10/Dockerfile.cpu b/examples/v1beta1/trial-images/enas-cnn-cifar10/Dockerfile.cpu index 3710a59f8fc..30af4f77020 100644 --- a/examples/v1beta1/trial-images/enas-cnn-cifar10/Dockerfile.cpu +++ b/examples/v1beta1/trial-images/enas-cnn-cifar10/Dockerfile.cpu @@ -1,12 +1,11 @@ -FROM tensorflow/tensorflow:1.15.4-py3 +FROM tensorflow/tensorflow:2.7.0 ENV TARGET_DIR /opt/enas-cnn-cifar10 ADD examples/v1beta1/trial-images/enas-cnn-cifar10 ${TARGET_DIR} WORKDIR ${TARGET_DIR} -RUN pip3 install --upgrade pip -RUN pip3 install --upgrade -r requirements.txt +RUN pip3 install --no-cache-dir -r requirements.txt ENV PYTHONPATH ${TARGET_DIR} RUN chgrp -R 0 ${TARGET_DIR} \ diff --git a/examples/v1beta1/trial-images/enas-cnn-cifar10/Dockerfile.gpu b/examples/v1beta1/trial-images/enas-cnn-cifar10/Dockerfile.gpu index 5020d01ad36..316ddf8a8fe 100644 --- a/examples/v1beta1/trial-images/enas-cnn-cifar10/Dockerfile.gpu +++ b/examples/v1beta1/trial-images/enas-cnn-cifar10/Dockerfile.gpu @@ -1,12 +1,10 @@ -FROM tensorflow/tensorflow:1.15.4-gpu-py3 +FROM tensorflow/tensorflow:2.7.0-gpu ENV TARGET_DIR /opt/enas-cnn-cifar10 ADD 
examples/v1beta1/trial-images/enas-cnn-cifar10 ${TARGET_DIR} WORKDIR ${TARGET_DIR} -RUN pip3 install --upgrade pip -RUN pip3 install --upgrade -r requirements.txt ENV PYTHONPATH ${TARGET_DIR} RUN chgrp -R 0 ${TARGET_DIR} \ diff --git a/examples/v1beta1/trial-images/enas-cnn-cifar10/RunTrial.py b/examples/v1beta1/trial-images/enas-cnn-cifar10/RunTrial.py index 4672e079a27..4b5b8ab327e 100644 --- a/examples/v1beta1/trial-images/enas-cnn-cifar10/RunTrial.py +++ b/examples/v1beta1/trial-images/enas-cnn-cifar10/RunTrial.py @@ -1,12 +1,10 @@ -import keras -import numpy as np +from tensorflow import keras from keras.datasets import cifar10 from ModelConstructor import ModelConstructor -from keras.utils import to_categorical -from keras.utils import multi_gpu_model +from tensorflow.keras.utils import to_categorical +from tensorflow.python.keras.utils.multi_gpu_utils import multi_gpu_model from keras.preprocessing.image import ImageDataGenerator import argparse -import time if __name__ == "__main__": parser = argparse.ArgumentParser(description='TrainingContainer') @@ -46,7 +44,7 @@ test_model.summary() test_model.compile(loss=keras.losses.categorical_crossentropy, - optimizer=keras.optimizers.Adam(lr=1e-3, decay=1e-4), + optimizer=keras.optimizers.Adam(learning_rate=1e-3, decay=1e-4), metrics=['accuracy']) (x_train, y_train), (x_test, y_test) = cifar10.load_data() @@ -67,12 +65,12 @@ print(">>> Data Loaded. Training starts.") for e in range(num_epochs): - print("\nTotal Epoch {}/{}".format(e+1, num_epochs)) - history = test_model.fit_generator(generator=aug_data_flow, - steps_per_epoch=int(len(x_train)/128)+1, - epochs=1, verbose=1, - validation_data=(x_test, y_test)) - print("Training-Accuracy={}".format(history.history['acc'][-1])) + print("\nTotal Epoch {}/{}".format(e + 1, num_epochs)) + history = test_model.fit(aug_data_flow, + steps_per_epoch=int(len(x_train) / 128) + 1, + epochs=1, verbose=1, + validation_data=(x_test, y_test)) + print("Training-Accuracy={}".format(history.history['accuracy'][-1])) print("Training-Loss={}".format(history.history['loss'][-1])) - print("Validation-Accuracy={}".format(history.history['val_acc'][-1])) + print("Validation-Accuracy={}".format(history.history['val_accuracy'][-1])) print("Validation-Loss={}".format(history.history['val_loss'][-1])) diff --git a/examples/v1beta1/trial-images/enas-cnn-cifar10/requirements.txt b/examples/v1beta1/trial-images/enas-cnn-cifar10/requirements.txt index 1a23c027782..497c40a9811 100644 --- a/examples/v1beta1/trial-images/enas-cnn-cifar10/requirements.txt +++ b/examples/v1beta1/trial-images/enas-cnn-cifar10/requirements.txt @@ -1 +1 @@ -keras==2.2.4 +scipy>=1.7.2 diff --git a/examples/v1beta1/trial-images/tf-mnist-with-summaries/Dockerfile b/examples/v1beta1/trial-images/tf-mnist-with-summaries/Dockerfile new file mode 100644 index 00000000000..e54e4c80698 --- /dev/null +++ b/examples/v1beta1/trial-images/tf-mnist-with-summaries/Dockerfile @@ -0,0 +1,9 @@ +FROM tensorflow/tensorflow:2.7.0 + +ADD examples/v1beta1/trial-images/tf-mnist-with-summaries /opt/tf-mnist-with-summaries +WORKDIR /opt/tf-mnist-with-summaries + +RUN chgrp -R 0 /opt/tf-mnist-with-summaries \ + && chmod -R g+rwX /opt/tf-mnist-with-summaries + +ENTRYPOINT ["python3", "/opt/tf-mnist-with-summaries/mnist.py"] diff --git a/examples/v1beta1/trial-images/tf-mnist-with-summaries/README.md b/examples/v1beta1/trial-images/tf-mnist-with-summaries/README.md new file mode 100644 index 00000000000..8f8fb4e5182 --- /dev/null +++ 
b/examples/v1beta1/trial-images/tf-mnist-with-summaries/README.md
@@ -0,0 +1,11 @@
+# TensorFlow MNIST Classification With Summaries Example
+
+This is a TensorFlow MNIST image classification training container that outputs TF summaries.
+It uses a convolutional neural network to train the model.
+
+If you want to read more about this example, visit the official
+[TensorFlow](https://www.tensorflow.org/tutorials/quickstart/advanced)
+documentation.
+
+Katib uses this training container in some Experiments, for instance in the
+[TFJob example](../../kubeflow-training-operator/tfjob-mnist-with-summaries.yaml#L54-L62).
diff --git a/examples/v1beta1/trial-images/tf-mnist-with-summaries/mnist.py b/examples/v1beta1/trial-images/tf-mnist-with-summaries/mnist.py
new file mode 100644
index 00000000000..9795aef1e92
--- /dev/null
+++ b/examples/v1beta1/trial-images/tf-mnist-with-summaries/mnist.py
@@ -0,0 +1,144 @@
+# Copyright 2021 The Kubeflow Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import os
+import tensorflow as tf
+
+from tensorflow.keras.layers import Dense, Flatten, Conv2D
+from tensorflow.keras import Model
+
+
+class MyModel(Model):
+    def __init__(self):
+        super(MyModel, self).__init__()
+        self.conv1 = Conv2D(32, 3, activation='relu')
+        self.flatten = Flatten()
+        self.d1 = Dense(128, activation='relu')
+        self.d2 = Dense(10)
+
+    def call(self, x):
+        x = self.conv1(x)
+        x = self.flatten(x)
+        x = self.d1(x)
+        return self.d2(x)
+
+
+def train_step(args, model, optimizer, train_ds, epoch, loss_object, train_summary_writer, train_loss, train_accuracy):
+    for step, (images, labels) in enumerate(train_ds):
+        with tf.GradientTape() as tape:
+            # training=True is only needed if there are layers with different
+            # behavior during training versus inference (e.g. Dropout).
+            predictions = model(images, training=True)
+            loss = loss_object(labels, predictions)
+        gradients = tape.gradient(loss, model.trainable_variables)
+        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
+
+        train_loss(loss)
+        train_accuracy(labels, predictions)
+
+        if step % args.log_interval == 0:
+            print("Train Epoch: {} [{}/60000 ({:.0f}%)]\tloss={:.4f}, accuracy={:.4f}".format(
+                epoch + 1, step * args.batch_size, 100. * step * args.batch_size / 60000,
+                train_loss.result(), train_accuracy.result() * 100)
+            )
+
+    with train_summary_writer.as_default():
+        tf.summary.scalar('loss', train_loss.result(), step=epoch)
+        tf.summary.scalar('accuracy', train_accuracy.result(), step=epoch)
+
+
+def test_step(model, test_ds, epoch, loss_object, test_summary_writer, test_loss, test_accuracy):
+    for (images, labels) in test_ds:
+        # training=False is only needed if there are layers with different
+        # behavior during training versus inference (e.g. Dropout).
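+        # Forward pass only: there is no GradientTape here because the test
+        # step never updates the model weights; each batch feeds the streaming
+        # test_loss and test_accuracy metrics created in main().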
+        predictions = model(images, training=False)
+        t_loss = loss_object(labels, predictions)
+
+        test_loss(t_loss)
+        test_accuracy(labels, predictions)
+
+    with test_summary_writer.as_default():
+        tf.summary.scalar('loss', test_loss.result(), step=epoch)
+        tf.summary.scalar('accuracy', test_accuracy.result(), step=epoch)
+
+    print("Test Loss: {:.4f}, Test Accuracy: {:.4f}\n".format(
+        test_loss.result(), test_accuracy.result() * 100)
+    )
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--batch-size', type=int, default=64,
+                        help='input batch size for training (default: 64)')
+    parser.add_argument('--learning-rate', type=float, default=0.001,
+                        help='learning rate (default: 0.001)')
+    parser.add_argument("--epochs", type=int, default=10, metavar="N",
+                        help="number of epochs to train (default: 10)")
+    parser.add_argument("--log-interval", type=int, default=100, metavar="N",
+                        help="how many batches to wait before logging training status (default: 100)")
+    parser.add_argument(
+        '--log-path',
+        type=str,
+        default=os.path.join(os.getenv('TEST_TMPDIR', '/tmp'),
+                             'tensorflow/mnist/logs/mnist_with_summaries'),
+        help='Summaries log PATH')
+    args = parser.parse_args()
+
+    # Setup dataset
+    mnist = tf.keras.datasets.mnist
+    (x_train, y_train), (x_test, y_test) = mnist.load_data()
+    x_train, x_test = x_train / 255.0, x_test / 255.0
+    # Add a channels dimension
+    x_train = x_train[..., tf.newaxis].astype("float32")
+    x_test = x_test[..., tf.newaxis].astype("float32")
+    train_ds = tf.data.Dataset.from_tensor_slices((x_train, y_train)).shuffle(10000).batch(args.batch_size)
+    test_ds = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(args.batch_size)
+
+    # Setup tensorflow summaries
+    train_log_dir = os.path.join(args.log_path, 'train')
+    test_log_dir = os.path.join(args.log_path, 'test')
+    train_summary_writer = tf.summary.create_file_writer(train_log_dir)
+    test_summary_writer = tf.summary.create_file_writer(test_log_dir)
+
+    # Create an instance of the model
+    model = MyModel()
+    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
+    optimizer = tf.keras.optimizers.Adam(learning_rate=args.learning_rate)
+
+    train_loss = tf.keras.metrics.Mean(name='train_loss')
+    train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')
+
+    test_loss = tf.keras.metrics.Mean(name='test_loss')
+    test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='test_accuracy')
+
+    for epoch in range(args.epochs):
+        # Reset the metrics and flush the summary writers at the start of each epoch
+        train_loss.reset_states()
+        train_accuracy.reset_states()
+        test_loss.reset_states()
+        test_accuracy.reset_states()
+        train_summary_writer.flush()
+        test_summary_writer.flush()
+
+        train_step(args, model, optimizer, train_ds, epoch, loss_object, train_summary_writer,
+                   train_loss, train_accuracy)
+        test_step(model, test_ds, epoch, loss_object, test_summary_writer, test_loss, test_accuracy)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pkg/metricscollector/v1beta1/tfevent-metricscollector/tfevent_loader.py b/pkg/metricscollector/v1beta1/tfevent-metricscollector/tfevent_loader.py
index 5018fc82237..5377f854d62 100644
--- a/pkg/metricscollector/v1beta1/tfevent-metricscollector/tfevent_loader.py
+++ b/pkg/metricscollector/v1beta1/tfevent-metricscollector/tfevent_loader.py
@@ -14,48 +14,55 @@
 # TFEventFileParser parses tfevent files and returns an ObservationLog of the metrics specified.
 # When the event file is under a directory(e.g. test dir), please specify "{{dirname}}/{{metrics name}}"
-# For example, in the Kubeflow Training Operator TFJob tutorial for mnist with summary:
-# https://github.com/kubeflow/training-operator/blob/master/examples/tensorflow/mnist_with_summaries/mnist_with_summaries.py.
-# The "accuracy" metric is saved under "train" and "test" directories.
+# For example, in the TensorFlow MNIST Classification With Summaries example:
+# https://github.com/kubeflow/katib/blob/master/examples/v1beta1/trial-images/tf-mnist-with-summaries/mnist.py.
+# The "accuracy" and "loss" metrics are saved under the "train" and "test" directories.
 # So in the Metrics Collector specification, please specify name of "train" or "test" directory.
 # Check TFJob example for more information:
 # https://github.com/kubeflow/katib/blob/master/examples/v1beta1/kubeflow-training-operator/tfjob-mnist-with-summaries.yaml#L16-L22
-
 import tensorflow as tf
+from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
 import os
 from datetime import datetime
 import rfc3339
 import api_pb2
 from logging import getLogger, StreamHandler, INFO
-import const
+from pkg.metricscollector.v1beta1.common import const


 class TFEventFileParser:
-    def find_all_files(self, directory):
+    def __init__(self, metric_names):
+        self.metric_names = metric_names
+
+    @staticmethod
+    def find_all_files(directory):
         for root, dirs, files in os.walk(directory):
-            yield root
             for f in files:
                 yield os.path.join(root, f)

-    def parse_summary(self, tfefile, metrics):
+    def parse_summary(self, tfefile):
         metric_logs = []
-        for summary in tf.train.summary_iterator(tfefile):
-            paths = tfefile.split("/")
-            for v in summary.summary.value:
-                for m in metrics:
-                    tag = str(v.tag)
-                    if len(paths) >= 2 and len(m.split("/")) >= 2:
-                        tag = str(paths[-2] + "/" + v.tag)
-                    if tag.startswith(m):
-                        ml = api_pb2.MetricLog(
-                            time_stamp=rfc3339.rfc3339(datetime.fromtimestamp(summary.wall_time)),
-                            metric=api_pb2.Metric(
-                                name=m,
-                                value=str(v.simple_value)
-                            )
-                        )
-                        metric_logs.append(ml)
+        event_accumulator = EventAccumulator(tfefile, size_guidance={'tensors': 0})
+        event_accumulator.Reload()
+        for tag in event_accumulator.Tags()['tensors']:
+            for m in self.metric_names:
+                tfefile_parent_dir = os.path.dirname(m) if len(m.split("/")) >= 2 else os.path.dirname(tfefile)
+                basedir_name = os.path.dirname(tfefile)
+                if not tag.startswith(m.split("/")[-1]) or not basedir_name.endswith(tfefile_parent_dir):
+                    continue
+
+                for wall_time, step, tensor in event_accumulator.Tensors(tag):
+                    ml = api_pb2.MetricLog(
+                        time_stamp=rfc3339.rfc3339(datetime.fromtimestamp(wall_time)),
+                        metric=api_pb2.Metric(
+                            name=m,
+                            value=str(tf.make_ndarray(tensor))
+                        )
+                    )
+                    metric_logs.append(ml)
+
         return metric_logs

@@ -68,7 +75,7 @@ def __init__(self, metric_names):
         self.logger.addHandler(handler)
         self.logger.propagate = False
         self.metrics = metric_names
-        self.parser = TFEventFileParser()
+        self.parser = TFEventFileParser(self.metrics)

     def parse_file(self, directory):
         mls = []
@@ -77,7 +84,7 @@ def parse_file(self, directory):
                 continue
             try:
                 self.logger.info(f + " will be parsed.")
-                mls.extend(self.parser.parse_summary(f, self.metrics))
+                mls.extend(self.parser.parse_summary(f))
             except Exception as e:
                 self.logger.warning("Unexpected error: " + str(e))
                 continue
diff --git a/pkg/suggestion/v1beta1/nas/enas/Controller.py b/pkg/suggestion/v1beta1/nas/enas/Controller.py
old mode 100755
new mode 100644
index c3f231d8045..11d31b038de
--- a/pkg/suggestion/v1beta1/nas/enas/Controller.py
+++
b/pkg/suggestion/v1beta1/nas/enas/Controller.py @@ -54,7 +54,7 @@ def __init__(self, def _build_params(self): """Create TF parameters""" self.logger.info(">>> Building Controller Parameters\n") - initializer = tf.random_uniform_initializer(minval=-0.01, maxval=0.01) + initializer = tf.compat.v1.random_uniform_initializer(minval=-0.01, maxval=0.01) hidden_size = self.controller_hidden_size with tf.compat.v1.variable_scope(self.controller_name, initializer=initializer): @@ -127,7 +127,7 @@ def _build_sampler(self): entropy = log_prob * tf.exp(-log_prob) entropy = tf.stop_gradient(entropy) sample_entropies.append(entropy) - inputs = tf.nn.embedding_lookup(self.w_emb, func) + inputs = tf.nn.embedding_lookup(params=self.w_emb, ids=func) next_c, next_h = _lstm(inputs, prev_c, prev_h, self.w_lstm) prev_c, prev_h = next_c, next_h @@ -154,26 +154,26 @@ def _build_sampler(self): skip_prob = tf.sigmoid(logits) kl = skip_prob * tf.math.log(skip_prob/skip_targets) - kl = tf.reduce_sum(kl) + kl = tf.reduce_sum(input_tensor=kl) skip_penalties.append(kl) log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=logits, labels=skip_index) - sample_log_probs.append(tf.reduce_sum(log_prob, keepdims=True)) + sample_log_probs.append(tf.reduce_sum(input_tensor=log_prob, keepdims=True)) entropy = tf.stop_gradient( - tf.reduce_sum(log_prob * tf.exp(-log_prob), keepdims=True)) + tf.reduce_sum(input_tensor=log_prob * tf.exp(-log_prob), keepdims=True)) sample_entropies.append(entropy) skip_index = tf.dtypes.cast(skip_index, tf.float32) skip_index = tf.reshape(skip_index, [1, layer_id]) - skip_count.append(tf.reduce_sum(skip_index)) + skip_count.append(tf.reduce_sum(input_tensor=skip_index)) inputs = tf.matmul(skip_index, tf.concat(all_h, axis=0)) - inputs /= (1.0 + tf.reduce_sum(skip_index)) + inputs /= (1.0 + tf.reduce_sum(input_tensor=skip_index)) else: inputs = self.g_emb @@ -184,16 +184,16 @@ def _build_sampler(self): self.sample_arc = tf.reshape(arc_seq, [-1]) sample_entropies = tf.stack(sample_entropies) - self.sample_entropy = tf.reduce_sum(sample_entropies) + self.sample_entropy = tf.reduce_sum(input_tensor=sample_entropies) sample_log_probs = tf.stack(sample_log_probs, axis=0) - self.sample_log_probs = tf.reduce_sum(sample_log_probs) + self.sample_log_probs = tf.reduce_sum(input_tensor=sample_log_probs) skip_penalties = tf.stack(skip_penalties) - self.skip_penalties = tf.reduce_mean(skip_penalties) + self.skip_penalties = tf.reduce_mean(input_tensor=skip_penalties) skip_count = tf.stack(skip_count) - self.skip_count = tf.reduce_sum(skip_count) + self.skip_count = tf.reduce_sum(input_tensor=skip_count) def build_trainer(self): """Build the train ops by connecting Controller with candidate.""" @@ -207,7 +207,7 @@ def build_trainer(self): if self.controller_entropy_weight is not None: self.reward += self.controller_entropy_weight * self.sample_entropy - self.sample_log_probs = tf.reduce_sum(self.sample_log_probs) + self.sample_log_probs = tf.reduce_sum(input_tensor=self.sample_log_probs) self.baseline = tf.Variable(0.0, dtype=tf.float32, trainable=False) baseline_update = tf.compat.v1.assign_sub( self.baseline, (1 - self.controller_baseline_decay) * (self.baseline - self.reward)) @@ -249,7 +249,7 @@ def _lstm(x, prev_c, prev_h, w_lstm): def _build_train_op(loss, tf_variables, train_step, learning_rate): """Build training ops from `loss` tensor.""" optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate) - grads = tf.gradients(loss, tf_variables) + grads = tf.gradients(ys=loss, xs=tf_variables) 
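+    # Spelled-out keyword arguments (ys=, xs=) match the tf.gradients
+    # signature; the computation itself is unchanged in the compat.v1 graph
+    # this controller builds.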
grad_norm = tf.linalg.global_norm(grads) train_op = optimizer.apply_gradients(zip(grads, tf_variables), global_step=train_step) diff --git a/scripts/v1beta1/build.sh b/scripts/v1beta1/build.sh index abc020f1286..f657843eae0 100755 --- a/scripts/v1beta1/build.sh +++ b/scripts/v1beta1/build.sh @@ -21,103 +21,120 @@ set -e REGISTRY=$1 TAG=$2 +ARCH=$3 -if [[ -z "$REGISTRY" || -z "$TAG" ]]; then - echo "Image registry and tag must be set" - echo "Usage: $0 " 1>&2 +if [[ -z "$REGISTRY" || -z "$TAG" || -z "$ARCH" ]]; then + echo "Image registry, tag and cpu-architecture must be set" + echo "Usage: $0 " 1>&2 exit 1 fi +SUPPORTED_CPU_ARCHS=(amd64 arm64 ppc64le) +function check_specified_cpu_arch() { + for SUPPORTED_ARCH in "${SUPPORTED_CPU_ARCHS[@]}"; do \ + if [ "$ARCH" = "$SUPPORTED_ARCH" ]; then \ + return 0 + fi; + done + echo "CPU architecture '$ARCH' is not supported" + echo "You can use '${SUPPORTED_CPU_ARCHS[*]}'" + echo "To get machine architecture run: uname -m" + return 1 +} +check_specified_cpu_arch + VERSION="v1beta1" CMD_PREFIX="cmd" -MACHINE_ARCH=$(uname -m) echo "Building images for Katib ${VERSION}..." echo "Image registry: ${REGISTRY}" echo "Image tag: ${TAG}" -SCRIPT_ROOT=$(dirname ${BASH_SOURCE})/../.. -cd ${SCRIPT_ROOT} +SCRIPT_ROOT=$(dirname "$0")/../.. +cd "${SCRIPT_ROOT}" # Katib core images echo -e "\nBuilding Katib controller image...\n" -docker build -t ${REGISTRY}/katib-controller:${TAG} -f ${CMD_PREFIX}/katib-controller/${VERSION}/Dockerfile . +docker build --platform "linux/$ARCH" -t "${REGISTRY}/katib-controller:${TAG}" -f ${CMD_PREFIX}/katib-controller/${VERSION}/Dockerfile . echo -e "\nBuilding Katib DB manager image...\n" -docker build -t ${REGISTRY}/katib-db-manager:${TAG} -f ${CMD_PREFIX}/db-manager/${VERSION}/Dockerfile . +docker build --platform "linux/$ARCH" -t "${REGISTRY}/katib-db-manager:${TAG}" -f ${CMD_PREFIX}/db-manager/${VERSION}/Dockerfile . # TODO (andreyvelich): Switch to ${CMD_PREFIX}/ui/${VERSION}/Dockerfile once old UI is deprecated. echo -e "\nBuilding Katib UI image...\n" -docker build -t ${REGISTRY}/katib-ui:${TAG} -f ${CMD_PREFIX}/new-ui/${VERSION}/Dockerfile . +docker build --platform "linux/$ARCH" -t "${REGISTRY}/katib-ui:${TAG}" -f ${CMD_PREFIX}/new-ui/${VERSION}/Dockerfile . echo -e "\nBuilding Katib cert generator image...\n" -docker build -t ${REGISTRY}/cert-generator:${TAG} -f ${CMD_PREFIX}/cert-generator/${VERSION}/Dockerfile . +docker build --platform "linux/$ARCH" -t "${REGISTRY}/cert-generator:${TAG}" -f ${CMD_PREFIX}/cert-generator/${VERSION}/Dockerfile . echo -e "\nBuilding file metrics collector image...\n" -docker build -t ${REGISTRY}/file-metrics-collector:${TAG} -f ${CMD_PREFIX}/metricscollector/${VERSION}/file-metricscollector/Dockerfile . +docker build --platform "linux/$ARCH" -t "${REGISTRY}/file-metrics-collector:${TAG}" -f ${CMD_PREFIX}/metricscollector/${VERSION}/file-metricscollector/Dockerfile . echo -e "\nBuilding TF Event metrics collector image...\n" -if [ $MACHINE_ARCH == "aarch64" ]; then - docker build -t ${REGISTRY}/tfevent-metrics-collector:${TAG} -f ${CMD_PREFIX}/metricscollector/${VERSION}/tfevent-metricscollector/Dockerfile.aarch64 . -elif [ $MACHINE_ARCH == "ppc64le" ]; then - docker build -t ${REGISTRY}/tfevent-metrics-collector:${TAG} -f ${CMD_PREFIX}/metricscollector/${VERSION}/tfevent-metricscollector/Dockerfile.ppc64le . -else - docker build -t ${REGISTRY}/tfevent-metrics-collector:${TAG} -f ${CMD_PREFIX}/metricscollector/${VERSION}/tfevent-metricscollector/Dockerfile . 
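+# Only ppc64le still needs a dedicated Dockerfile; the default Dockerfile now
+# covers aarch64 as well by installing the tensorflow-aarch64 wheel.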
+if [ "$ARCH" == "ppc64le" ]; then \
+  docker build --platform "linux/$ARCH" -t "${REGISTRY}/tfevent-metrics-collector:${TAG}" -f ${CMD_PREFIX}/metricscollector/${VERSION}/tfevent-metricscollector/Dockerfile.ppc64le .
+else \
+  docker build --platform "linux/$ARCH" -t "${REGISTRY}/tfevent-metrics-collector:${TAG}" -f ${CMD_PREFIX}/metricscollector/${VERSION}/tfevent-metricscollector/Dockerfile .
 fi

 # Suggestion images
 echo -e "\nBuilding suggestion images..."

 echo -e "\nBuilding hyperopt suggestion...\n"
-docker build -t ${REGISTRY}/suggestion-hyperopt:${TAG} -f ${CMD_PREFIX}/suggestion/hyperopt/${VERSION}/Dockerfile .
+docker build --platform "linux/$ARCH" -t "${REGISTRY}/suggestion-hyperopt:${TAG}" -f ${CMD_PREFIX}/suggestion/hyperopt/${VERSION}/Dockerfile .

 echo -e "\nBuilding chocolate suggestion...\n"
-docker build -t ${REGISTRY}/suggestion-chocolate:${TAG} -f ${CMD_PREFIX}/suggestion/chocolate/${VERSION}/Dockerfile .
+docker build --platform "linux/$ARCH" -t "${REGISTRY}/suggestion-chocolate:${TAG}" -f ${CMD_PREFIX}/suggestion/chocolate/${VERSION}/Dockerfile .

 echo -e "\nBuilding hyperband suggestion...\n"
-docker build -t ${REGISTRY}/suggestion-hyperband:${TAG} -f ${CMD_PREFIX}/suggestion/hyperband/${VERSION}/Dockerfile .
+docker build --platform "linux/$ARCH" -t "${REGISTRY}/suggestion-hyperband:${TAG}" -f ${CMD_PREFIX}/suggestion/hyperband/${VERSION}/Dockerfile .

 echo -e "\nBuilding skopt suggestion...\n"
-docker build -t ${REGISTRY}/suggestion-skopt:${TAG} -f ${CMD_PREFIX}/suggestion/skopt/${VERSION}/Dockerfile .
+docker build --platform "linux/$ARCH" -t "${REGISTRY}/suggestion-skopt:${TAG}" -f ${CMD_PREFIX}/suggestion/skopt/${VERSION}/Dockerfile .

 echo -e "\nBuilding goptuna suggestion...\n"
-docker build -t ${REGISTRY}/suggestion-goptuna:${TAG} -f ${CMD_PREFIX}/suggestion/goptuna/${VERSION}/Dockerfile .
+docker build --platform "linux/$ARCH" -t "${REGISTRY}/suggestion-goptuna:${TAG}" -f ${CMD_PREFIX}/suggestion/goptuna/${VERSION}/Dockerfile .

 echo -e "\nBuilding optuna suggestion...\n"
-docker build -t ${REGISTRY}/suggestion-optuna:${TAG} -f ${CMD_PREFIX}/suggestion/optuna/${VERSION}/Dockerfile .
+docker build --platform "linux/$ARCH" -t "${REGISTRY}/suggestion-optuna:${TAG}" -f ${CMD_PREFIX}/suggestion/optuna/${VERSION}/Dockerfile .

 echo -e "\nBuilding ENAS suggestion...\n"
-if [ $MACHINE_ARCH == "aarch64" ]; then
-  docker build -t ${REGISTRY}/suggestion-enas:${TAG} -f ${CMD_PREFIX}/suggestion/nas/enas/${VERSION}/Dockerfile.aarch64 .
-else
-  docker build -t ${REGISTRY}/suggestion-enas:${TAG} -f ${CMD_PREFIX}/suggestion/nas/enas/${VERSION}/Dockerfile .
-fi
+docker build --platform "linux/$ARCH" -t "${REGISTRY}/suggestion-enas:${TAG}" -f ${CMD_PREFIX}/suggestion/nas/enas/${VERSION}/Dockerfile .

 echo -e "\nBuilding DARTS suggestion...\n"
-docker build -t ${REGISTRY}/suggestion-darts:${TAG} -f ${CMD_PREFIX}/suggestion/nas/darts/${VERSION}/Dockerfile .
+docker build --platform "linux/$ARCH" -t "${REGISTRY}/suggestion-darts:${TAG}" -f ${CMD_PREFIX}/suggestion/nas/darts/${VERSION}/Dockerfile .

 # Early stopping images
 echo -e "\nBuilding early stopping images...\n"

 echo -e "\nBuilding median stopping rule...\n"
-docker build -t ${REGISTRY}/earlystopping-medianstop:${TAG} -f ${CMD_PREFIX}/earlystopping/medianstop/${VERSION}/Dockerfile .
+docker build --platform "linux/$ARCH" -t "${REGISTRY}/earlystopping-medianstop:${TAG}" -f ${CMD_PREFIX}/earlystopping/medianstop/${VERSION}/Dockerfile .

 # Training container images
-echo -e "\nBuilding training container images..."
+if [ ! "$ARCH" = "amd64" ]; then \
+  echo -e "\nTraining container images are supported only for amd64."
+else \
+
+  echo -e "\nBuilding training container images..."

-echo -e "\nBuilding mxnet mnist training container example...\n"
-docker build -t ${REGISTRY}/mxnet-mnist:${TAG} -f examples/${VERSION}/trial-images/mxnet-mnist/Dockerfile .
+  echo -e "\nBuilding mxnet mnist training container example...\n"
+  docker build --platform linux/amd64 -t "${REGISTRY}/mxnet-mnist:${TAG}" -f examples/${VERSION}/trial-images/mxnet-mnist/Dockerfile .

-echo -e "\nBuilding PyTorch mnist training container example...\n"
-docker build -t ${REGISTRY}/pytorch-mnist:${TAG} -f examples/${VERSION}/trial-images/pytorch-mnist/Dockerfile .
+  echo -e "\nBuilding Tensorflow with summaries mnist training container example...\n"
+  docker build --platform linux/amd64 -t "${REGISTRY}/tf-mnist-with-summaries:${TAG}" -f examples/${VERSION}/trial-images/tf-mnist-with-summaries/Dockerfile .

-echo -e "\nBuilding Keras CIFAR-10 CNN training container example for ENAS with GPU support...\n"
-docker build -t ${REGISTRY}/enas-cnn-cifar10-gpu:${TAG} -f examples/${VERSION}/trial-images/enas-cnn-cifar10/Dockerfile.gpu .
+  echo -e "\nBuilding PyTorch mnist training container example...\n"
+  docker build --platform linux/amd64 -t "${REGISTRY}/pytorch-mnist:${TAG}" -f examples/${VERSION}/trial-images/pytorch-mnist/Dockerfile .

-echo -e "\nBuilding Keras CIFAR-10 CNN training container example for ENAS with CPU support...\n"
-docker build -t ${REGISTRY}/enas-cnn-cifar10-cpu:${TAG} -f examples/${VERSION}/trial-images/enas-cnn-cifar10/Dockerfile.cpu .
+  echo -e "\nBuilding Keras CIFAR-10 CNN training container example for ENAS with GPU support...\n"
+  docker build --platform linux/amd64 -t "${REGISTRY}/enas-cnn-cifar10-gpu:${TAG}" -f examples/${VERSION}/trial-images/enas-cnn-cifar10/Dockerfile.gpu .

-echo -e "\nBuilding PyTorch CIFAR-10 CNN training container example for DARTS...\n"
-docker build -t ${REGISTRY}/darts-cnn-cifar10:${TAG} -f examples/${VERSION}/trial-images/darts-cnn-cifar10/Dockerfile .
+  echo -e "\nBuilding Keras CIFAR-10 CNN training container example for ENAS with CPU support...\n"
+  docker build --platform linux/amd64 -t "${REGISTRY}/enas-cnn-cifar10-cpu:${TAG}" -f examples/${VERSION}/trial-images/enas-cnn-cifar10/Dockerfile.cpu .
+
+  echo -e "\nBuilding PyTorch CIFAR-10 CNN training container example for DARTS...\n"
+  docker build --platform linux/amd64 -t "${REGISTRY}/darts-cnn-cifar10:${TAG}" -f examples/${VERSION}/trial-images/darts-cnn-cifar10/Dockerfile .
diff --git a/scripts/v1beta1/push.sh b/scripts/v1beta1/push.sh
index 321e3ad76ee..7f906399b1b 100755
--- a/scripts/v1beta1/push.sh
+++ b/scripts/v1beta1/push.sh
@@ -36,72 +36,75 @@ echo "Image tag: ${TAG}"

 # Katib core images
 echo -e "\nPushing Katib controller image...\n"
-docker push ${REGISTRY}/katib-controller:${TAG}
+docker push "${REGISTRY}/katib-controller:${TAG}"

 echo -e "\nPushing Katib DB manager image...\n"
-docker push ${REGISTRY}/katib-db-manager:${TAG}
+docker push "${REGISTRY}/katib-db-manager:${TAG}"

 echo -e "\nPushing Katib UI image...\n"
-docker push ${REGISTRY}/katib-ui:${TAG}
+docker push "${REGISTRY}/katib-ui:${TAG}"

 echo -e "\nPushing Katib cert generator image...\n"
-docker push ${REGISTRY}/cert-generator:${TAG}
+docker push "${REGISTRY}/cert-generator:${TAG}"

 echo -e "\nPushing file metrics collector image...\n"
-docker push ${REGISTRY}/file-metrics-collector:${TAG}
+docker push "${REGISTRY}/file-metrics-collector:${TAG}"

 echo -e "\nPushing TF Event metrics collector image...\n"
-docker push ${REGISTRY}/tfevent-metrics-collector:${TAG}
+docker push "${REGISTRY}/tfevent-metrics-collector:${TAG}"

 # Suggestion images
 echo -e "\nPushing suggestion images..."

 echo -e "\nPushing hyperopt suggestion...\n"
-docker push ${REGISTRY}/suggestion-hyperopt:${TAG}
+docker push "${REGISTRY}/suggestion-hyperopt:${TAG}"

 echo -e "\nPushing chocolate suggestion...\n"
-docker push ${REGISTRY}/suggestion-chocolate:${TAG}
+docker push "${REGISTRY}/suggestion-chocolate:${TAG}"

 echo -e "\nPushing hyperband suggestion...\n"
-docker push ${REGISTRY}/suggestion-hyperband:${TAG}
+docker push "${REGISTRY}/suggestion-hyperband:${TAG}"

 echo -e "\nPushing skopt suggestion...\n"
-docker push ${REGISTRY}/suggestion-skopt:${TAG}
+docker push "${REGISTRY}/suggestion-skopt:${TAG}"

 echo -e "\nPushing goptuna suggestion...\n"
-docker push ${REGISTRY}/suggestion-goptuna:${TAG}
+docker push "${REGISTRY}/suggestion-goptuna:${TAG}"

 echo -e "\nPushing optuna suggestion...\n"
-docker push ${REGISTRY}/suggestion-optuna:${TAG}
+docker push "${REGISTRY}/suggestion-optuna:${TAG}"

 echo -e "\nPushing ENAS suggestion...\n"
-docker push ${REGISTRY}/suggestion-enas:${TAG}
+docker push "${REGISTRY}/suggestion-enas:${TAG}"

 echo -e "\nPushing DARTS suggestion...\n"
-docker push ${REGISTRY}/suggestion-darts:${TAG}
+docker push "${REGISTRY}/suggestion-darts:${TAG}"

 # Early stopping images
 echo -e "\nPushing early stopping images...\n"

 echo -e "\nPushing median stopping rule...\n"
-docker push ${REGISTRY}/earlystopping-medianstop:${TAG}
+docker push "${REGISTRY}/earlystopping-medianstop:${TAG}"

 # Training container images
 echo -e "\nPushing training container images..."

 echo -e "\nPushing mxnet mnist training container example...\n"
-docker push ${REGISTRY}/mxnet-mnist:${TAG}
+docker push "${REGISTRY}/mxnet-mnist:${TAG}"
+
+echo -e "\nPushing TensorFlow mnist with summaries training container example...\n"
+docker push "${REGISTRY}/tf-mnist-with-summaries:${TAG}"

 echo -e "\nPushing PyTorch mnist training container example...\n"
-docker push ${REGISTRY}/pytorch-mnist:${TAG}
+docker push "${REGISTRY}/pytorch-mnist:${TAG}"

 echo -e "\nPushing Keras CIFAR-10 CNN training container example for ENAS with GPU support...\n"
-docker push ${REGISTRY}/enas-cnn-cifar10-gpu:${TAG}
+docker push "${REGISTRY}/enas-cnn-cifar10-gpu:${TAG}"

 echo -e "\nPushing Keras CIFAR-10 CNN training container example for ENAS with CPU support...\n"
-docker push ${REGISTRY}/enas-cnn-cifar10-cpu:${TAG}
+docker push "${REGISTRY}/enas-cnn-cifar10-cpu:${TAG}"

 echo -e "\nPushing PyTorch CIFAR-10 CNN training container example for DARTS...\n"
-docker push ${REGISTRY}/darts-cnn-cifar10:${TAG}
+docker push "${REGISTRY}/darts-cnn-cifar10:${TAG}"

 echo -e "\nAll Katib images with ${TAG} tag have been pushed successfully!\n"
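Note: push.sh assumes every image above was already built locally under the same ${REGISTRY} and ${TAG}. To mirror the images into a second registry, a hypothetical retag-and-push loop could be used; the destination and image list here are illustrative only:

    # Illustrative helper, not part of this patch.
    DEST="registry.example.com/katib"
    for image in katib-controller katib-db-manager katib-ui suggestion-hyperopt; do
      docker tag "${REGISTRY}/${image}:${TAG}" "${DEST}/${image}:${TAG}"
      docker push "${DEST}/${image}:${TAG}"
    done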
echo -e "\nPushing mxnet mnist training container example...\n" -docker push ${REGISTRY}/mxnet-mnist:${TAG} +docker push "${REGISTRY}/mxnet-mnist:${TAG}" + +echo -e "\nPushing Tensorflow with summaries mnist training container example...\n" +docker push "${REGISTRY}/tf-mnist-with-summaries:${TAG}" echo -e "\nPushing PyTorch mnist training container example...\n" -docker push ${REGISTRY}/pytorch-mnist:${TAG} +docker push "${REGISTRY}/pytorch-mnist:${TAG}" echo -e "\nPushing Keras CIFAR-10 CNN training container example for ENAS with GPU support...\n" -docker push ${REGISTRY}/enas-cnn-cifar10-gpu:${TAG} +docker push "${REGISTRY}/enas-cnn-cifar10-gpu:${TAG}" echo -e "\nPushing Keras CIFAR-10 CNN training container example for ENAS with CPU support...\n" -docker push ${REGISTRY}/enas-cnn-cifar10-cpu:${TAG} +docker push "${REGISTRY}/enas-cnn-cifar10-cpu:${TAG}" echo -e "\nPushing PyTorch CIFAR-10 CNN training container example for DARTS...\n" -docker push ${REGISTRY}/darts-cnn-cifar10:${TAG} +docker push "${REGISTRY}/darts-cnn-cifar10:${TAG}" echo -e "\nAll Katib images with ${TAG} tag have been pushed successfully!\n" diff --git a/scripts/v1beta1/update-images.sh b/scripts/v1beta1/update-images.sh index ed6d6c22d61..e9a6ea55f98 100755 --- a/scripts/v1beta1/update-images.sh +++ b/scripts/v1beta1/update-images.sh @@ -80,6 +80,7 @@ update_yaml_files "${CONFIG_PATH}" ":[^[:space:]].*\"" ":${TAG}\"" # Postfixes for the each Trial image. MXNET_MNIST="mxnet-mnist" PYTORCH_MNIST="pytorch-mnist" +TF_MNIST_WITH_SUMMARIES="tf-mnist-with-summaries" ENAS_GPU="enas-cnn-cifar10-gpu" ENAS_CPU="enas-cnn-cifar10-cpu" DARTS="darts-cnn-cifar10" @@ -87,6 +88,7 @@ DARTS="darts-cnn-cifar10" echo -e "Update Katib Trial training container images\n" update_yaml_files "./" "${OLD_PREFIX}${MXNET_MNIST}:.*" "${NEW_PREFIX}${MXNET_MNIST}:${TAG}" update_yaml_files "./" "${OLD_PREFIX}${PYTORCH_MNIST}:.*" "${NEW_PREFIX}${PYTORCH_MNIST}:${TAG}" +update_yaml_files "./" "${OLD_PREFIX}${TF_MNIST_WITH_SUMMARIES}:.*" "${NEW_PREFIX}${TF_MNIST_WITH_SUMMARIES}:${TAG}" update_yaml_files "./" "${OLD_PREFIX}${ENAS_GPU}:.*" "${NEW_PREFIX}${ENAS_GPU}:${TAG}" update_yaml_files "./" "${OLD_PREFIX}${ENAS_CPU}:.*" "${NEW_PREFIX}${ENAS_CPU}:${TAG}" update_yaml_files "./" "${OLD_PREFIX}${DARTS}:.*" "${NEW_PREFIX}${DARTS}:${TAG}" diff --git a/test/e2e/v1beta1/argo_workflow.py b/test/e2e/v1beta1/argo_workflow.py index 0345c262c9a..097515c14ef 100644 --- a/test/e2e/v1beta1/argo_workflow.py +++ b/test/e2e/v1beta1/argo_workflow.py @@ -41,27 +41,28 @@ # Dict with all Katib images. # Key - image name, Value - dockerfile location. KATIB_IMAGES = { - "katib-controller": "cmd/katib-controller/v1beta1/Dockerfile", - "katib-db-manager": "cmd/db-manager/v1beta1/Dockerfile", + "katib-controller": "cmd/katib-controller/v1beta1/Dockerfile", + "katib-db-manager": "cmd/db-manager/v1beta1/Dockerfile", # TODO (andreyvelich): Change it to /cmd/ui/v1beta1/Dockerfile once old UI is deprecated. 
- "katib-ui": "cmd/new-ui/v1beta1/Dockerfile", - "cert-generator": "cmd/cert-generator/v1beta1/Dockerfile", - "file-metrics-collector": "cmd/metricscollector/v1beta1/file-metricscollector/Dockerfile", - "tfevent-metrics-collector": "/cmd/metricscollector/v1beta1/tfevent-metricscollector/Dockerfile", - "suggestion-hyperopt": "cmd/suggestion/hyperopt/v1beta1/Dockerfile", - "suggestion-chocolate": "cmd/suggestion/chocolate/v1beta1/Dockerfile", - "suggestion-skopt": "cmd/suggestion/skopt/v1beta1/Dockerfile", - "suggestion-hyperband": "cmd/suggestion/hyperband/v1beta1/Dockerfile", - "suggestion-goptuna": "cmd/suggestion/goptuna/v1beta1/Dockerfile", - "suggestion-optuna": "cmd/suggestion/optuna/v1beta1/Dockerfile", - "suggestion-enas": "cmd/suggestion/nas/enas/v1beta1/Dockerfile", - "suggestion-darts": "cmd/suggestion/nas/darts/v1beta1/Dockerfile", - "earlystopping-medianstop": "cmd/earlystopping/medianstop/v1beta1/Dockerfile", - "trial-mxnet-mnist": "examples/v1beta1/trial-images/mxnet-mnist/Dockerfile", - "trial-pytorch-mnist": "examples/v1beta1/trial-images/pytorch-mnist/Dockerfile", - "trial-enas-cnn-cifar10-gpu": "examples/v1beta1/trial-images/enas-cnn-cifar10/Dockerfile.gpu", - "trial-enas-cnn-cifar10-cpu": "examples/v1beta1/trial-images/enas-cnn-cifar10/Dockerfile.cpu", - "trial-darts-cnn-cifar10": "examples/v1beta1/trial-images/darts-cnn-cifar10/Dockerfile", + "katib-ui": "cmd/new-ui/v1beta1/Dockerfile", + "cert-generator": "cmd/cert-generator/v1beta1/Dockerfile", + "file-metrics-collector": "cmd/metricscollector/v1beta1/file-metricscollector/Dockerfile", + "tfevent-metrics-collector": "cmd/metricscollector/v1beta1/tfevent-metricscollector/Dockerfile", + "suggestion-hyperopt": "cmd/suggestion/hyperopt/v1beta1/Dockerfile", + "suggestion-chocolate": "cmd/suggestion/chocolate/v1beta1/Dockerfile", + "suggestion-skopt": "cmd/suggestion/skopt/v1beta1/Dockerfile", + "suggestion-hyperband": "cmd/suggestion/hyperband/v1beta1/Dockerfile", + "suggestion-goptuna": "cmd/suggestion/goptuna/v1beta1/Dockerfile", + "suggestion-optuna": "cmd/suggestion/optuna/v1beta1/Dockerfile", + "suggestion-enas": "cmd/suggestion/nas/enas/v1beta1/Dockerfile", + "suggestion-darts": "cmd/suggestion/nas/darts/v1beta1/Dockerfile", + "earlystopping-medianstop": "cmd/earlystopping/medianstop/v1beta1/Dockerfile", + "trial-mxnet-mnist": "examples/v1beta1/trial-images/mxnet-mnist/Dockerfile", + "trial-pytorch-mnist": "examples/v1beta1/trial-images/pytorch-mnist/Dockerfile", + "trial-tf-mnist-with-summaries": "examples/v1beta1/trial-images/tf-mnist-with-summaries/Dockerfile", + "trial-enas-cnn-cifar10-gpu": "examples/v1beta1/trial-images/enas-cnn-cifar10/Dockerfile.gpu", + "trial-enas-cnn-cifar10-cpu": "examples/v1beta1/trial-images/enas-cnn-cifar10/Dockerfile.cpu", + "trial-darts-cnn-cifar10": "examples/v1beta1/trial-images/darts-cnn-cifar10/Dockerfile", } # Dict with Katib Experiments to run during the test. diff --git a/test/unit/v1beta1/metricscollector/test_tfevent_metricscollector.py b/test/unit/v1beta1/metricscollector/test_tfevent_metricscollector.py new file mode 100644 index 00000000000..b694cbc9adb --- /dev/null +++ b/test/unit/v1beta1/metricscollector/test_tfevent_metricscollector.py @@ -0,0 +1,47 @@ +# Copyright 2021 The Kubeflow Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
diff --git a/test/unit/v1beta1/metricscollector/test_tfevent_metricscollector.py b/test/unit/v1beta1/metricscollector/test_tfevent_metricscollector.py
new file mode 100644
index 00000000000..b694cbc9adb
--- /dev/null
+++ b/test/unit/v1beta1/metricscollector/test_tfevent_metricscollector.py
@@ -0,0 +1,47 @@
+# Copyright 2021 The Kubeflow Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import unittest
+import utils
+
+
+class TestTFEventMetricsCollector(unittest.TestCase):
+    def test_parse_file(self):
+
+        current_dir = os.path.dirname(os.path.abspath(__file__))
+        logs_dir = os.path.join(current_dir, "testdata/tfevent-metricscollector/logs")
+
+        # Metric format is "{{dirname}}/{{metric name}}"
+        metric_names = ["train/accuracy", "train/loss", "test/loss", "test/accuracy"]
+        metric_logs = utils.get_metric_logs(logs_dir, metric_names)
+        self.assertEqual(20, len(metric_logs))
+
+        for log in metric_logs:
+            actual = log["metric"]["name"]
+            self.assertIn(actual, metric_names)
+
+        # Metric format is "{{metric name}}"
+        metric_names = ["accuracy", "loss"]
+        metrics_file_dir = os.path.join(logs_dir, "train")
+        metric_logs = utils.get_metric_logs(metrics_file_dir, metric_names)
+        self.assertEqual(10, len(metric_logs))
+
+        for log in metric_logs:
+            actual = log["metric"]["name"]
+            self.assertIn(actual, metric_names)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/unit/v1beta1/metricscollector/utils.py b/test/unit/v1beta1/metricscollector/utils.py
new file mode 100644
index 00000000000..228130c88fe
--- /dev/null
+++ b/test/unit/v1beta1/metricscollector/utils.py
@@ -0,0 +1,23 @@
+# Copyright 2021 The Kubeflow Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from tfevent_loader import MetricsCollector
+from google.protobuf import json_format
+
+
+def get_metric_logs(logs_dir, metric_names):
+    mc = MetricsCollector(metric_names)
+    observation_log = mc.parse_file(logs_dir)
+    dict_observation_log = json_format.MessageToDict(observation_log)
+    return dict_observation_log["metricLogs"]
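Note: utils.py imports tfevent_loader, so the metrics-collector packages must be importable, and the TF event files must already exist under testdata/ (that directory is gitignored and generated on demand). A sketch for running these tests by hand, assuming tfevent_loader lives under pkg/metricscollector in this repo:

    # PYTHONPATH entries below are an assumption about the repo layout.
    export PYTHONPATH="$(pwd)/pkg/metricscollector/v1beta1/common:$(pwd)/pkg/metricscollector/v1beta1/tfevent-metricscollector:${PYTHONPATH}"
    pytest test/unit/v1beta1/metricscollector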
diff --git a/test/unit/v1beta1/suggestion/test_chocolate_service.py b/test/unit/v1beta1/suggestion/test_chocolate_service.py
index 75151e92668..0d811c322f8 100644
--- a/test/unit/v1beta1/suggestion/test_chocolate_service.py
+++ b/test/unit/v1beta1/suggestion/test_chocolate_service.py
@@ -12,9 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import os
+
 import grpc
 import grpc_testing
 import unittest
+import pytest

 from pkg.apis.manager.v1beta1.python import api_pb2

@@ -282,5 +285,16 @@ def test_validate_algorithm_settings(self):
         self.assertEqual(details, 'Max Trial Count: 15 > all possible search space combinations: 12')


+@pytest.fixture(scope='function', autouse=True)
+def tear_down():
+    yield
+    working_dir = os.getcwd()
+    db_files = ["my_db.db", "my_db.db?check_same_thread=False.lock", "my_db.db-shm", "my_db.db-wal"]
+    for fname in db_files:
+        target_path = os.path.join(working_dir, fname)
+        if os.path.isfile(target_path):
+            os.remove(target_path)
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/test/unit/v1beta1/suggestion/test_enas_service.py b/test/unit/v1beta1/suggestion/test_enas_service.py
index e7e8a087e41..e8c51451dcf 100644
--- a/test/unit/v1beta1/suggestion/test_enas_service.py
+++ b/test/unit/v1beta1/suggestion/test_enas_service.py
@@ -12,10 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import os
+import shutil
+
 import grpc
 import grpc_testing
 import unittest

+import pytest
+
 from pkg.apis.manager.v1beta1.python import api_pb2
 from pkg.suggestion.v1beta1.nas.enas.service import EnasService

@@ -191,5 +196,14 @@ def test_get_suggestion(self):
         self.assertEqual(2, len(response.parameter_assignments))


+@pytest.fixture(scope='function', autouse=True)
+def tear_down():
+    yield
+    working_dir = os.getcwd()
+    target_path = os.path.join(working_dir, "ctrl_cache")
+    if os.path.isdir(target_path):
+        shutil.rmtree(target_path)
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/test/unit/v1beta1/suggestion/utils.py b/test/unit/v1beta1/suggestion/utils.py
index 99f02b76149..14f31ef80e0 100644
--- a/test/unit/v1beta1/suggestion/utils.py
+++ b/test/unit/v1beta1/suggestion/utils.py
@@ -1,3 +1,16 @@
+# Copyright 2021 The Kubeflow Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 from pkg.apis.manager.v1beta1.python import api_pb2
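Note: the two autouse tear_down fixtures remove artifacts that the Chocolate and ENAS suggestion services write into the working directory, so repeated test runs start from a clean state. The manual shell equivalent, with the file and directory names taken directly from the fixtures above:

    # Clean up suggestion test artifacts by hand; the "?" filename must be quoted.
    rm -f my_db.db 'my_db.db?check_same_thread=False.lock' my_db.db-shm my_db.db-wal
    rm -rf ctrl_cache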