From d5d6ed8c7ab4a569d3baafe3c30a316dd0481b60 Mon Sep 17 00:00:00 2001 From: tenzen-y Date: Fri, 12 Nov 2021 18:25:31 +0900 Subject: [PATCH] modify script to build container image --- Makefile | 7 +- cmd/suggestion/chocolate/v1beta1/Dockerfile | 3 + docs/developer-guide.md | 2 +- .../tfevent-metrics-collector.yaml | 49 ++++ .../enas-cnn-cifar10/Dockerfile.cpu | 4 +- .../enas-cnn-cifar10/Dockerfile.gpu | 4 +- .../trial-images/enas-cnn-cifar10/RunTrial.py | 4 +- .../enas-cnn-cifar10/requirements.txt | 1 - .../trial-images/tensorflow-mnist/Dockerfile | 14 ++ .../trial-images/tensorflow-mnist/README.md | 11 + .../tensorflow-mnist/mnist_with_summaries.py | 217 ++++++++++++++++++ scripts/v1beta1/build.sh | 89 ++++--- test/e2e/v1beta1/argo_workflow.py | 2 + 13 files changed, 359 insertions(+), 48 deletions(-) create mode 100644 examples/v1beta1/metrics-collector/tfevent-metrics-collector.yaml delete mode 100644 examples/v1beta1/trial-images/enas-cnn-cifar10/requirements.txt create mode 100644 examples/v1beta1/trial-images/tensorflow-mnist/Dockerfile create mode 100644 examples/v1beta1/trial-images/tensorflow-mnist/README.md create mode 100644 examples/v1beta1/trial-images/tensorflow-mnist/mnist_with_summaries.py diff --git a/Makefile b/Makefile index d036922f711..dcd39d057be 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,7 @@ HAS_LINT := $(shell command -v golangci-lint;) COMMIT := v1beta1-$(shell git rev-parse --short=7 HEAD) KATIB_REGISTRY := docker.io/kubeflowkatib +CPU_ARCH ?= amd64 # Run tests .PHONY: test @@ -49,10 +50,10 @@ endif # Build images for the Katib v1beta1 components. build: generate -ifeq ($(and $(REGISTRY),$(TAG)),) - $(error REGISTRY and TAG must be set. Usage: make build REGISTRY= TAG=) +ifeq ($(and $(REGISTRY),$(TAG),$(CPU_ARCH)),) + $(error REGISTRY and TAG must be set. 
Usage: make build REGISTRY= TAG= CPU_ARCH=) endif - bash scripts/v1beta1/build.sh $(REGISTRY) $(TAG) + bash scripts/v1beta1/build.sh $(REGISTRY) $(TAG) $(CPU_ARCH) # Build and push Katib images from the latest master commit. push-latest: generate diff --git a/cmd/suggestion/chocolate/v1beta1/Dockerfile b/cmd/suggestion/chocolate/v1beta1/Dockerfile index 7d623fcb5ce..d1f095e874b 100644 --- a/cmd/suggestion/chocolate/v1beta1/Dockerfile +++ b/cmd/suggestion/chocolate/v1beta1/Dockerfile @@ -23,6 +23,9 @@ RUN if [ "$(uname -m)" = "ppc64le" ]; then \ ADD ./pkg/ ${TARGET_DIR}/pkg/ ADD ./${SUGGESTION_DIR}/ ${TARGET_DIR}/${SUGGESTION_DIR}/ WORKDIR ${TARGET_DIR}/${SUGGESTION_DIR} +RUN if [ "$(uname -m)" = "aarch64" ]; then \ + sed -i -e '$a git+https://github.com/fmder/ghalton@master' -e '/^ghalton/d' requirements.txt; \ + fi; RUN pip install --no-cache-dir -r requirements.txt RUN chgrp -R 0 ${TARGET_DIR} \ diff --git a/docs/developer-guide.md b/docs/developer-guide.md index 32330c81231..f2282325413 100644 --- a/docs/developer-guide.md +++ b/docs/developer-guide.md @@ -13,7 +13,7 @@ see the following user guides: ## Requirements - [Go](https://golang.org/) (1.17 or later) -- [Docker](https://docs.docker.com/) (17.05 or later) +- [Docker](https://docs.docker.com/) (20.10 or later) - [Java](https://docs.oracle.com/javase/8/docs/technotes/guides/install/install_overview.html) (8 or later) - [Python](https://www.python.org/) (3.9 or later) - [kustomize](https://kustomize.io/) (4.0.5 or later) diff --git a/examples/v1beta1/metrics-collector/tfevent-metrics-collector.yaml b/examples/v1beta1/metrics-collector/tfevent-metrics-collector.yaml new file mode 100644 index 00000000000..7e77897fc66 --- /dev/null +++ b/examples/v1beta1/metrics-collector/tfevent-metrics-collector.yaml @@ -0,0 +1,49 @@ +apiVersion: kubeflow.org/v1beta1 +kind: Experiment +metadata: + namespace: kubeflow + name: tfevent-metrics-collector +spec: + parallelTrialCount: 3 + maxTrialCount: 12 + maxFailedTrialCount: 
3 + objective: + type: maximize + goal: 0.99 + objectiveMetricName: accuracy_1 + algorithm: + algorithmName: random + metricsCollectorSpec: + source: + fileSystemPath: + path: /train + kind: Directory + collector: + kind: TensorFlowEvent + parameters: + - name: learning_rate + parameterType: double + feasibleSpace: + min: "0.01" + max: "0.05" + trialTemplate: + primaryContainerName: training-container + trialParameters: + - name: learningRate + description: Learning rate for the training model + reference: learning_rate + trialSpec: + apiVersion: batch/v1 + kind: Job + spec: + template: + spec: + containers: + - name: training-container + image: docker.io/kubeflowkatib/tensorflow-mnist:latest + command: + - "python3" + - "/opt/tensorflow-mnist/mnist_with_summaries.py" + - "--log_dir=/train/metrics" + - "--learning_rate=${trialParameters.learningRate}" + restartPolicy: Never diff --git a/examples/v1beta1/trial-images/enas-cnn-cifar10/Dockerfile.cpu b/examples/v1beta1/trial-images/enas-cnn-cifar10/Dockerfile.cpu index 3710a59f8fc..524d08e2506 100644 --- a/examples/v1beta1/trial-images/enas-cnn-cifar10/Dockerfile.cpu +++ b/examples/v1beta1/trial-images/enas-cnn-cifar10/Dockerfile.cpu @@ -1,12 +1,10 @@ -FROM tensorflow/tensorflow:1.15.4-py3 +FROM tensorflow/tensorflow:2.7.0 ENV TARGET_DIR /opt/enas-cnn-cifar10 ADD examples/v1beta1/trial-images/enas-cnn-cifar10 ${TARGET_DIR} WORKDIR ${TARGET_DIR} -RUN pip3 install --upgrade pip -RUN pip3 install --upgrade -r requirements.txt ENV PYTHONPATH ${TARGET_DIR} RUN chgrp -R 0 ${TARGET_DIR} \ diff --git a/examples/v1beta1/trial-images/enas-cnn-cifar10/Dockerfile.gpu b/examples/v1beta1/trial-images/enas-cnn-cifar10/Dockerfile.gpu index 5020d01ad36..316ddf8a8fe 100644 --- a/examples/v1beta1/trial-images/enas-cnn-cifar10/Dockerfile.gpu +++ b/examples/v1beta1/trial-images/enas-cnn-cifar10/Dockerfile.gpu @@ -1,12 +1,10 @@ -FROM tensorflow/tensorflow:1.15.4-gpu-py3 +FROM tensorflow/tensorflow:2.7.0-gpu ENV TARGET_DIR 
/opt/enas-cnn-cifar10 ADD examples/v1beta1/trial-images/enas-cnn-cifar10 ${TARGET_DIR} WORKDIR ${TARGET_DIR} -RUN pip3 install --upgrade pip -RUN pip3 install --upgrade -r requirements.txt ENV PYTHONPATH ${TARGET_DIR} RUN chgrp -R 0 ${TARGET_DIR} \ diff --git a/examples/v1beta1/trial-images/enas-cnn-cifar10/RunTrial.py b/examples/v1beta1/trial-images/enas-cnn-cifar10/RunTrial.py index 4672e079a27..625f6174d62 100644 --- a/examples/v1beta1/trial-images/enas-cnn-cifar10/RunTrial.py +++ b/examples/v1beta1/trial-images/enas-cnn-cifar10/RunTrial.py @@ -2,8 +2,8 @@ import numpy as np from keras.datasets import cifar10 from ModelConstructor import ModelConstructor -from keras.utils import to_categorical -from keras.utils import multi_gpu_model +from tensorflow.keras.utils import to_categorical +from tensorflow.python.keras.utils.multi_gpu_utils import multi_gpu_model from keras.preprocessing.image import ImageDataGenerator import argparse import time diff --git a/examples/v1beta1/trial-images/enas-cnn-cifar10/requirements.txt b/examples/v1beta1/trial-images/enas-cnn-cifar10/requirements.txt deleted file mode 100644 index 1a23c027782..00000000000 --- a/examples/v1beta1/trial-images/enas-cnn-cifar10/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -keras==2.2.4 diff --git a/examples/v1beta1/trial-images/tensorflow-mnist/Dockerfile b/examples/v1beta1/trial-images/tensorflow-mnist/Dockerfile new file mode 100644 index 00000000000..24aa367a72f --- /dev/null +++ b/examples/v1beta1/trial-images/tensorflow-mnist/Dockerfile @@ -0,0 +1,14 @@ +FROM tensorflow/tensorflow:2.7.0 + +ADD examples/v1beta1/trial-images/tensorflow-mnist /opt/tensorflow-mnist +WORKDIR /opt/tensorflow-mnist + +# Add folder for the logs. 
+RUN mkdir /katib + +RUN chgrp -R 0 /opt/tensorflow-mnist \ + && chmod -R g+rwX /opt/tensorflow-mnist \ + && chgrp -R 0 /katib \ + && chmod -R g+rwX /katib + +ENTRYPOINT ["python3", "/opt/tensorflow-mnist/mnist_with_summaries.py"] diff --git a/examples/v1beta1/trial-images/tensorflow-mnist/README.md b/examples/v1beta1/trial-images/tensorflow-mnist/README.md new file mode 100644 index 00000000000..56c75d68665 --- /dev/null +++ b/examples/v1beta1/trial-images/tensorflow-mnist/README.md @@ -0,0 +1,11 @@ +# Tensorflow MNIST Classification With Summaries Example + +This is a TensorFlow MNIST image classification training container that outputs TF summaries. +It uses a simple fully connected neural network to train the model. + +If you want to read more about this example, visit the official +[tensorflow](https://github.com/tensorflow/tensorflow/blob/7462dcaae1e8cfe1dfd0c62dd6083f9749a9d827/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py) +GitHub repository. + +Katib uses this training container in some Experiments, for instance in the +[TF Event Metrics Collector](../../metrics-collector/tfevent-metrics-collector.yaml#L42-L48). diff --git a/examples/v1beta1/trial-images/tensorflow-mnist/mnist_with_summaries.py b/examples/v1beta1/trial-images/tensorflow-mnist/mnist_with_summaries.py new file mode 100644 index 00000000000..04315ad8a3f --- /dev/null +++ b/examples/v1beta1/trial-images/tensorflow-mnist/mnist_with_summaries.py @@ -0,0 +1,217 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""A simple MNIST classifier which displays summaries in TensorBoard. + +This is an unimpressive MNIST model, but it is a good example of using +tf.name_scope to make a graph legible in the TensorBoard graph explorer, and of +naming summary tags so that they are grouped meaningfully in TensorBoard. + +It demonstrates the functionality of every TensorBoard dashboard. +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import os +import sys + +import tensorflow as tf + +from tensorflow.examples.tutorials.mnist import input_data + +FLAGS = None + + +def train(): + # Import data + mnist = input_data.read_data_sets(FLAGS.data_dir, + fake_data=FLAGS.fake_data) + + sess = tf.compat.v1.InteractiveSession() + # Create a multilayer model. + + # Input placeholders + with tf.compat.v1.name_scope('input'): + x = tf.compat.v1.placeholder(tf.float32, [None, 784], name='x-input') + y_ = tf.compat.v1.placeholder(tf.int64, [None], name='y-input') + + with tf.compat.v1.name_scope('input_reshape'): + image_shaped_input = tf.reshape(x, [-1, 28, 28, 1]) + tf.compat.v1.summary.image('input', image_shaped_input, 10) + + # We can't initialize these variables to 0 - the network will get stuck. 
+ def weight_variable(shape): + """Create a weight variable with appropriate initialization.""" + initial = tf.random.truncated_normal(shape, stddev=0.1) + return tf.Variable(initial) + + def bias_variable(shape): + """Create a bias variable with appropriate initialization.""" + initial = tf.constant(0.1, shape=shape) + return tf.Variable(initial) + + def variable_summaries(var): + """Attach a lot of summaries to a Tensor (for TensorBoard visualization).""" + with tf.compat.v1.name_scope('summaries'): + mean = tf.reduce_mean(input_tensor=var) + tf.compat.v1.summary.scalar('mean', mean) + with tf.compat.v1.name_scope('stddev'): + stddev = tf.sqrt(tf.reduce_mean(input_tensor=tf.square(var - mean))) + tf.compat.v1.summary.scalar('stddev', stddev) + tf.compat.v1.summary.scalar('max', tf.reduce_max(input_tensor=var)) + tf.compat.v1.summary.scalar('min', tf.reduce_min(input_tensor=var)) + tf.compat.v1.summary.histogram('histogram', var) + + def nn_layer(input_tensor, input_dim, output_dim, layer_name, act=tf.nn.relu): + """Reusable code for making a simple neural net layer. + + It does a matrix multiply, bias add, and then uses ReLU to nonlinearize. + It also sets up name scoping so that the resultant graph is easy to read, + and adds a number of summary ops. + """ + # Adding a name scope ensures logical grouping of the layers in the graph. 
+ with tf.compat.v1.name_scope(layer_name): + # This Variable will hold the state of the weights for the layer + with tf.compat.v1.name_scope('weights'): + weights = weight_variable([input_dim, output_dim]) + variable_summaries(weights) + with tf.compat.v1.name_scope('biases'): + biases = bias_variable([output_dim]) + variable_summaries(biases) + with tf.compat.v1.name_scope('Wx_plus_b'): + preactivate = tf.matmul(input_tensor, weights) + biases + tf.compat.v1.summary.histogram('pre_activations', preactivate) + activations = act(preactivate, name='activation') + tf.compat.v1.summary.histogram('activations', activations) + return activations + + hidden1 = nn_layer(x, 784, 500, 'layer1') + + with tf.compat.v1.name_scope('dropout'): + keep_prob = tf.compat.v1.placeholder(tf.float32) + tf.compat.v1.summary.scalar('dropout_keep_probability', keep_prob) + dropped = tf.nn.dropout(hidden1, rate=(1 - keep_prob)) + + # Do not apply softmax activation yet, see below. + y = nn_layer(dropped, 500, 10, 'layer2', act=tf.identity) + + with tf.compat.v1.name_scope('cross_entropy'): + # The raw formulation of cross-entropy, + # + # tf.reduce_mean(-tf.reduce_sum(y_ * tf.math.log(tf.softmax(y)), + # reduction_indices=[1])) + # + # can be numerically unstable. + # + # So here we use tf.compat.v1.losses.sparse_softmax_cross_entropy on the + # raw logit outputs of the nn_layer above, and then average across + # the batch. 
+ with tf.compat.v1.name_scope('total'): + cross_entropy = tf.compat.v1.losses.sparse_softmax_cross_entropy( + labels=y_, logits=y) + tf.compat.v1.summary.scalar('cross_entropy', cross_entropy) + + with tf.compat.v1.name_scope('train'): + train_step = tf.compat.v1.train.AdamOptimizer(FLAGS.learning_rate).minimize( + cross_entropy) + + with tf.compat.v1.name_scope('accuracy'): + with tf.compat.v1.name_scope('correct_prediction'): + correct_prediction = tf.equal(tf.argmax(input=y, axis=1), y_) + with tf.compat.v1.name_scope('accuracy'): + accuracy = tf.reduce_mean(input_tensor=tf.cast(correct_prediction, + tf.float32)) + tf.compat.v1.summary.scalar('accuracy', accuracy) + + # Merge all the summaries and write them out to + # /tmp/tensorflow/mnist/logs/mnist_with_summaries (by default) + merged = tf.compat.v1.summary.merge_all() + train_writer = tf.compat.v1.summary.FileWriter(FLAGS.log_dir + '/train', + sess.graph) + test_writer = tf.compat.v1.summary.FileWriter(FLAGS.log_dir + '/test') + tf.compat.v1.global_variables_initializer().run() + + # Train the model, and also write summaries. 
+ # Every 10th step, measure test-set accuracy, and write test summaries + # All other steps, run train_step on training data, & add training summaries + + def feed_dict(train): + """Make a TensorFlow feed_dict: maps data onto Tensor placeholders.""" + if train or FLAGS.fake_data: + xs, ys = mnist.train.next_batch(100, fake_data=FLAGS.fake_data) + k = FLAGS.dropout + else: + xs, ys = mnist.test.images, mnist.test.labels + k = 1.0 + return {x: xs, y_: ys, keep_prob: k} + + for i in range(FLAGS.max_steps): + if i % 10 == 0: # Record summaries and test-set accuracy + summary, acc = sess.run([merged, accuracy], feed_dict=feed_dict(False)) + test_writer.add_summary(summary, i) + print('Accuracy at step %s: %s' % (i, acc)) + else: # Record train set summaries, and train + if i % 100 == 99: # Record execution stats + run_options = tf.compat.v1.RunOptions( + trace_level=tf.compat.v1.RunOptions.FULL_TRACE) + run_metadata = tf.compat.v1.RunMetadata() + summary, _ = sess.run([merged, train_step], + feed_dict=feed_dict(True), + options=run_options, + run_metadata=run_metadata) + train_writer.add_run_metadata(run_metadata, 'step%03d' % i) + train_writer.add_summary(summary, i) + print('Adding run metadata for', i) + else: # Record a summary + summary, _ = sess.run([merged, train_step], feed_dict=feed_dict(True)) + train_writer.add_summary(summary, i) + train_writer.close() + test_writer.close() + + +def main(_): + if tf.io.gfile.exists(FLAGS.log_dir): + tf.io.gfile.rmtree(FLAGS.log_dir) + tf.io.gfile.makedirs(FLAGS.log_dir) + with tf.Graph().as_default(): + train() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--fake_data', nargs='?', const=True, type=bool, + default=False, + help='If true, uses fake data for unit testing.') + parser.add_argument('--max_steps', type=int, default=1000, + help='Number of steps to run trainer.') + parser.add_argument('--learning_rate', type=float, default=0.001, + help='Initial learning rate') + 
parser.add_argument('--dropout', type=float, default=0.9, + help='Keep probability for training dropout.') + parser.add_argument( + '--data_dir', + type=str, + default=os.path.join(os.getenv('TEST_TMPDIR', '/tmp'), + 'tensorflow/mnist/input_data'), + help='Directory for storing input data') + parser.add_argument( + '--log_dir', + type=str, + default=os.path.join(os.getenv('TEST_TMPDIR', '/tmp'), + 'tensorflow/mnist/logs/mnist_with_summaries'), + help='Summaries log directory') + FLAGS, unparsed = parser.parse_known_args() + tf.compat.v1.app.run(main=main, argv=[sys.argv[0]] + unparsed) diff --git a/scripts/v1beta1/build.sh b/scripts/v1beta1/build.sh index 3773315b540..c40d3b7a643 100755 --- a/scripts/v1beta1/build.sh +++ b/scripts/v1beta1/build.sh @@ -21,97 +21,116 @@ set -e REGISTRY=$1 TAG=$2 +ARCH=$3 -if [[ -z "$REGISTRY" || -z "$TAG" ]]; then - echo "Image registry and tag must be set" - echo "Usage: $0 " 1>&2 +if [[ -z "$REGISTRY" || -z "$TAG" || -z "$ARCH" ]]; then + echo "Image registry, tag and architecture must be set" + echo "Usage: $0 " 1>&2 exit 1 fi +SUPPORTED_CPU_ARCHS=$(docker buildx inspect | grep 'Platforms' | sed -e 's|Platforms: ||' -e 's|,||g' -e 's|linux/||g') +function check_specified_cpu_arch() { + for SUPPORTED_ARCH in $SUPPORTED_CPU_ARCHS; do \ + if [ "$ARCH" = "$SUPPORTED_ARCH" ]; then \ + return 0 + fi; + done + echo "CPU architecture '$ARCH' is not supported" + echo "You can use '$SUPPORTED_CPU_ARCHS'" + return 1 +} +check_specified_cpu_arch + VERSION="v1beta1" CMD_PREFIX="cmd" -MACHINE_ARCH=$(uname -m) echo "Building images for Katib ${VERSION}..." echo "Image registry: ${REGISTRY}" echo "Image tag: ${TAG}" -SCRIPT_ROOT=$(dirname ${BASH_SOURCE})/../.. -cd ${SCRIPT_ROOT} +SCRIPT_ROOT=$(dirname "$0")/../.. +cd "${SCRIPT_ROOT}" # Katib core images echo -e "\nBuilding Katib controller image...\n" -docker build -t ${REGISTRY}/katib-controller:${TAG} -f ${CMD_PREFIX}/katib-controller/${VERSION}/Dockerfile . 
+docker buildx build --platform "linux/$ARCH" -t "${REGISTRY}/katib-controller:${TAG}" -f ${CMD_PREFIX}/katib-controller/${VERSION}/Dockerfile . echo -e "\nBuilding Katib DB manager image...\n" -docker build -t ${REGISTRY}/katib-db-manager:${TAG} -f ${CMD_PREFIX}/db-manager/${VERSION}/Dockerfile . +docker buildx build --platform "linux/$ARCH" -t "${REGISTRY}/katib-db-manager:${TAG}" -f ${CMD_PREFIX}/db-manager/${VERSION}/Dockerfile . # TODO (andreyvelich): Switch to ${CMD_PREFIX}/ui/${VERSION}/Dockerfile once old UI is deprecated. echo -e "\nBuilding Katib UI image...\n" -docker build -t ${REGISTRY}/katib-ui:${TAG} -f ${CMD_PREFIX}/new-ui/${VERSION}/Dockerfile . +docker buildx build --platform "linux/$ARCH" -t "${REGISTRY}/katib-ui:${TAG}" -f ${CMD_PREFIX}/new-ui/${VERSION}/Dockerfile . echo -e "\nBuilding Katib cert generator image...\n" -docker build -t ${REGISTRY}/cert-generator:${TAG} -f ${CMD_PREFIX}/cert-generator/${VERSION}/Dockerfile . +docker buildx build --platform "linux/$ARCH" -t "${REGISTRY}/cert-generator:${TAG}" -f ${CMD_PREFIX}/cert-generator/${VERSION}/Dockerfile . echo -e "\nBuilding file metrics collector image...\n" -docker build -t ${REGISTRY}/file-metrics-collector:${TAG} -f ${CMD_PREFIX}/metricscollector/${VERSION}/file-metricscollector/Dockerfile . +docker buildx build --platform "linux/$ARCH" -t "${REGISTRY}/file-metrics-collector:${TAG}" -f ${CMD_PREFIX}/metricscollector/${VERSION}/file-metricscollector/Dockerfile . echo -e "\nBuilding TF Event metrics collector image...\n" -if [ $MACHINE_ARCH == "ppc64le" ]; then - docker build -t ${REGISTRY}/tfevent-metrics-collector:${TAG} -f ${CMD_PREFIX}/metricscollector/${VERSION}/tfevent-metricscollector/Dockerfile.ppc64le . -else - docker build -t ${REGISTRY}/tfevent-metrics-collector:${TAG} -f ${CMD_PREFIX}/metricscollector/${VERSION}/tfevent-metricscollector/Dockerfile . 
+if [ "$ARCH" == "ppc64le" ]; then + docker buildx build --platform "linux/$ARCH" -t "${REGISTRY}/tfevent-metrics-collector:${TAG}" -f ${CMD_PREFIX}/metricscollector/${VERSION}/tfevent-metricscollector/Dockerfile.ppc64le . +else \ + docker buildx build --platform "linux/$ARCH" -t "${REGISTRY}/tfevent-metrics-collector:${TAG}" -f ${CMD_PREFIX}/metricscollector/${VERSION}/tfevent-metricscollector/Dockerfile . fi # Suggestion images echo -e "\nBuilding suggestion images..." echo -e "\nBuilding hyperopt suggestion...\n" -docker build -t ${REGISTRY}/suggestion-hyperopt:${TAG} -f ${CMD_PREFIX}/suggestion/hyperopt/${VERSION}/Dockerfile . +docker buildx build --platform "linux/$ARCH" -t "${REGISTRY}/suggestion-hyperopt:${TAG}" -f ${CMD_PREFIX}/suggestion/hyperopt/${VERSION}/Dockerfile . echo -e "\nBuilding chocolate suggestion...\n" -docker build -t ${REGISTRY}/suggestion-chocolate:${TAG} -f ${CMD_PREFIX}/suggestion/chocolate/${VERSION}/Dockerfile . +docker buildx build --platform "linux/$ARCH" -t "${REGISTRY}/suggestion-chocolate:${TAG}" -f ${CMD_PREFIX}/suggestion/chocolate/${VERSION}/Dockerfile . echo -e "\nBuilding hyperband suggestion...\n" -docker build -t ${REGISTRY}/suggestion-hyperband:${TAG} -f ${CMD_PREFIX}/suggestion/hyperband/${VERSION}/Dockerfile . +docker buildx build --platform "linux/$ARCH" -t "${REGISTRY}/suggestion-hyperband:${TAG}" -f ${CMD_PREFIX}/suggestion/hyperband/${VERSION}/Dockerfile . echo -e "\nBuilding skopt suggestion...\n" -docker build -t ${REGISTRY}/suggestion-skopt:${TAG} -f ${CMD_PREFIX}/suggestion/skopt/${VERSION}/Dockerfile . +docker buildx build --platform "linux/$ARCH" -t "${REGISTRY}/suggestion-skopt:${TAG}" -f ${CMD_PREFIX}/suggestion/skopt/${VERSION}/Dockerfile . echo -e "\nBuilding goptuna suggestion...\n" -docker build -t ${REGISTRY}/suggestion-goptuna:${TAG} -f ${CMD_PREFIX}/suggestion/goptuna/${VERSION}/Dockerfile . 
+docker buildx build --platform "linux/$ARCH" -t "${REGISTRY}/suggestion-goptuna:${TAG}" -f ${CMD_PREFIX}/suggestion/goptuna/${VERSION}/Dockerfile . echo -e "\nBuilding optuna suggestion...\n" -docker build -t ${REGISTRY}/suggestion-optuna:${TAG} -f ${CMD_PREFIX}/suggestion/optuna/${VERSION}/Dockerfile . +docker buildx build --platform "linux/$ARCH" -t "${REGISTRY}/suggestion-optuna:${TAG}" -f ${CMD_PREFIX}/suggestion/optuna/${VERSION}/Dockerfile . echo -e "\nBuilding ENAS suggestion...\n" -docker build -t ${REGISTRY}/suggestion-enas:${TAG} -f ${CMD_PREFIX}/suggestion/nas/enas/${VERSION}/Dockerfile . +docker buildx build --platform "linux/$ARCH" -t "${REGISTRY}/suggestion-enas:${TAG}" -f ${CMD_PREFIX}/suggestion/nas/enas/${VERSION}/Dockerfile . echo -e "\nBuilding DARTS suggestion...\n" -docker build -t ${REGISTRY}/suggestion-darts:${TAG} -f ${CMD_PREFIX}/suggestion/nas/darts/${VERSION}/Dockerfile . +docker buildx build --platform "linux/$ARCH" -t "${REGISTRY}/suggestion-darts:${TAG}" -f ${CMD_PREFIX}/suggestion/nas/darts/${VERSION}/Dockerfile . # Early stopping images echo -e "\nBuilding early stopping images...\n" echo -e "\nBuilding median stopping rule...\n" -docker build -t ${REGISTRY}/earlystopping-medianstop:${TAG} -f ${CMD_PREFIX}/earlystopping/medianstop/${VERSION}/Dockerfile . +docker buildx build --platform "linux/$ARCH" -t "${REGISTRY}/earlystopping-medianstop:${TAG}" -f ${CMD_PREFIX}/earlystopping/medianstop/${VERSION}/Dockerfile . # Training container images -echo -e "\nBuilding training container images..." +if [ ! "$ARCH" = "amd64" ]; then \ + echo -e "\nTraining container images is supported only amd64." +else \ + + echo -e "\nBuilding training container images..." -echo -e "\nBuilding mxnet mnist training container example...\n" -docker build -t ${REGISTRY}/mxnet-mnist:${TAG} -f examples/${VERSION}/trial-images/mxnet-mnist/Dockerfile . 
+ echo -e "\nBuilding mxnet mnist training container example...\n" + docker buildx build --platform linux/amd64 -t "${REGISTRY}/mxnet-mnist:${TAG}" -f examples/${VERSION}/trial-images/mxnet-mnist/Dockerfile . -echo -e "\nBuilding PyTorch mnist training container example...\n" -docker build -t ${REGISTRY}/pytorch-mnist:${TAG} -f examples/${VERSION}/trial-images/pytorch-mnist/Dockerfile . + echo -e "\nBuilding PyTorch mnist training container example...\n" + docker buildx build --platform linux/amd64 -t "${REGISTRY}/pytorch-mnist:${TAG}" -f examples/${VERSION}/trial-images/pytorch-mnist/Dockerfile . -echo -e "\nBuilding Keras CIFAR-10 CNN training container example for ENAS with GPU support...\n" -docker build -t ${REGISTRY}/enas-cnn-cifar10-gpu:${TAG} -f examples/${VERSION}/trial-images/enas-cnn-cifar10/Dockerfile.gpu . + echo -e "\nBuilding Keras CIFAR-10 CNN training container example for ENAS with GPU support...\n" + docker buildx build --platform linux/amd64 -t "${REGISTRY}/enas-cnn-cifar10-gpu:${TAG}" -f examples/${VERSION}/trial-images/enas-cnn-cifar10/Dockerfile.gpu . -echo -e "\nBuilding Keras CIFAR-10 CNN training container example for ENAS with CPU support...\n" -docker build -t ${REGISTRY}/enas-cnn-cifar10-cpu:${TAG} -f examples/${VERSION}/trial-images/enas-cnn-cifar10/Dockerfile.cpu . + echo -e "\nBuilding Keras CIFAR-10 CNN training container example for ENAS with CPU support...\n" + docker buildx build --platform linux/amd64 -t "${REGISTRY}/enas-cnn-cifar10-cpu:${TAG}" -f examples/${VERSION}/trial-images/enas-cnn-cifar10/Dockerfile.cpu . -echo -e "\nBuilding PyTorch CIFAR-10 CNN training container example for DARTS...\n" -docker build -t ${REGISTRY}/darts-cnn-cifar10:${TAG} -f examples/${VERSION}/trial-images/darts-cnn-cifar10/Dockerfile . 
+ echo -e "\nBuilding PyTorch CIFAR-10 CNN training container example for DARTS...\n" + docker buildx build --platform linux/amd64 -t "${REGISTRY}/darts-cnn-cifar10:${TAG}" -f examples/${VERSION}/trial-images/darts-cnn-cifar10/Dockerfile . + +fi echo -e "\nAll Katib images with ${TAG} tag have been built successfully!\n" diff --git a/test/e2e/v1beta1/argo_workflow.py b/test/e2e/v1beta1/argo_workflow.py index 0345c262c9a..ae7f309017e 100644 --- a/test/e2e/v1beta1/argo_workflow.py +++ b/test/e2e/v1beta1/argo_workflow.py @@ -59,6 +59,7 @@ "earlystopping-medianstop": "cmd/earlystopping/medianstop/v1beta1/Dockerfile", "trial-mxnet-mnist": "examples/v1beta1/trial-images/mxnet-mnist/Dockerfile", "trial-pytorch-mnist": "examples/v1beta1/trial-images/pytorch-mnist/Dockerfile", + "trial-tensorflow-mnist": "examples/v1beta1/trial-images/tensorflow-mnist/Dockerfile", "trial-enas-cnn-cifar10-gpu": "examples/v1beta1/trial-images/enas-cnn-cifar10/Dockerfile.gpu", "trial-enas-cnn-cifar10-cpu": "examples/v1beta1/trial-images/enas-cnn-cifar10/Dockerfile.cpu", "trial-darts-cnn-cifar10": "examples/v1beta1/trial-images/darts-cnn-cifar10/Dockerfile", @@ -79,6 +80,7 @@ "pytorchjob": "examples/v1beta1/kubeflow-training-operator/pytorchjob-mnist.yaml", "tfjob": "examples/v1beta1/kubeflow-training-operator/tfjob-mnist-with-summaries.yaml", "file-metricscollector": "examples/v1beta1/metrics-collector/file-metrics-collector.yaml", + "tfevent-metricscollector": "examples/v1beta1/metrics-collector/tfevent-metrics-collector.yaml", "never-resume": "examples/v1beta1/resume-experiment/never-resume.yaml", "from-volume-resume": "examples/v1beta1/resume-experiment/from-volume-resume.yaml", "median-stop": "examples/v1beta1/early-stopping/median-stop.yaml"