Commit
modify script to build container image
Showing 13 changed files with 359 additions and 48 deletions.
49 changes: 49 additions & 0 deletions
examples/v1beta1/metrics-collector/tfevent-metrics-collector.yaml

@@ -0,0 +1,49 @@
apiVersion: kubeflow.org/v1beta1
kind: Experiment
metadata:
  namespace: kubeflow
  name: tfevent-metrics-collector
spec:
  parallelTrialCount: 3
  maxTrialCount: 12
  maxFailedTrialCount: 3
  objective:
    type: maximize
    goal: 0.99
    objectiveMetricName: accuracy_1
  algorithm:
    algorithmName: random
  metricsCollectorSpec:
    source:
      fileSystemPath:
        path: /train
        kind: Directory
    collector:
      kind: TensorFlowEvent
  parameters:
    - name: learning_rate
      parameterType: double
      feasibleSpace:
        min: "0.01"
        max: "0.05"
  trialTemplate:
    primaryContainerName: training-container
    trialParameters:
      - name: learningRate
        description: Learning rate for the training model
        reference: learning_rate
    trialSpec:
      apiVersion: batch/v1
      kind: Job
      spec:
        template:
          spec:
            containers:
              - name: training-container
                image: docker.io/kubeflowkatib/tensorflow-mnist:latest
                command:
                  - "python3"
                  - "/opt/tensorflow-mnist/mnist_with_summaries.py"
                  - "--log_dir=/train/metrics"
                  - "--learning_rate=${trialParameters.learningRate}"
            restartPolicy: Never
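
The TensorFlowEvent collector configured above watches TF event files under /train and extracts scalar summaries whose tags match the Experiment's metric names (here, accuracy_1). As a minimal sketch of what that amounts to, assuming TensorFlow 2.x with the v1 compat APIs and an illustrative event-file path, the summaries written by the training container can be read back like this:

import tensorflow as tf

# Illustrative path: trials write summaries under /train/metrics (see --log_dir above);
# the exact event-file name is generated by TensorFlow at runtime.
EVENT_FILE = "/train/metrics/test/events.out.tfevents.example"

# summary_iterator yields Event protos; a scalar summary carries a tag and a simple_value.
for event in tf.compat.v1.train.summary_iterator(EVENT_FILE):
    for value in event.summary.value:
        if value.tag == "accuracy_1":
            print(event.step, value.simple_value)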
1 change: 0 additions & 1 deletion
examples/v1beta1/trial-images/enas-cnn-cifar10/requirements.txt

This file was deleted.
14 changes: 14 additions & 0 deletions
examples/v1beta1/trial-images/tensorflow-mnist/Dockerfile

@@ -0,0 +1,14 @@
FROM tensorflow/tensorflow:2.7.0

ADD examples/v1beta1/trial-images/tensorflow-mnist /opt/tensorflow-mnist
WORKDIR /opt/tensorflow-mnist

# Add folder for the logs.
RUN mkdir /katib

RUN chgrp -R 0 /opt/tensorflow-mnist \
  && chmod -R g+rwX /opt/tensorflow-mnist \
  && chgrp -R 0 /katib \
  && chmod -R g+rwX /katib

ENTRYPOINT ["python3", "/opt/tensorflow-mnist/mnist_with_summaries.py"]
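
Since the ADD path is relative to the repository root, the image presumably has to be built with the repo root as the build context; the tag below is illustrative, chosen to mirror the image name used in the Experiment above:

docker build -f examples/v1beta1/trial-images/tensorflow-mnist/Dockerfile -t docker.io/kubeflowkatib/tensorflow-mnist:latest .

The chgrp -R 0 / chmod -R g+rwX pair grants the root group write access to the code and log directories, the usual trick for letting the container run under an arbitrary non-root UID (as OpenShift-style security contexts assign) and still write its outputs.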
11 changes: 11 additions & 0 deletions
examples/v1beta1/trial-images/tensorflow-mnist/README.md

@@ -0,0 +1,11 @@
# TensorFlow MNIST Classification With Summaries Example

This is a TensorFlow MNIST image classification training container that outputs TF summaries.
It trains the model with a simple fully connected neural network.

If you want to read more about this example, visit the official
[tensorflow](https://github.com/tensorflow/tensorflow/blob/7462dcaae1e8cfe1dfd0c62dd6083f9749a9d827/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py)
GitHub repository.

Katib uses this training container in some Experiments, for instance in the
[TF Event Metrics Collector](../../metrics-collector/tfevent-metrics-collector.yaml#L55-L64).
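
Mirroring the trialSpec command in the Experiment above (the learning-rate value here is just one point inside the declared feasible space), the container's entrypoint boils down to:

python3 /opt/tensorflow-mnist/mnist_with_summaries.py --log_dir=/train/metrics --learning_rate=0.01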
217 changes: 217 additions & 0 deletions
examples/v1beta1/trial-images/tensorflow-mnist/mnist_with_summaries.py

@@ -0,0 +1,217 @@
# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an 'AS IS' BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A simple MNIST classifier which displays summaries in TensorBoard.
This is an unimpressive MNIST model, but it is a good example of using
tf.name_scope to make a graph legible in the TensorBoard graph explorer, and of
naming summary tags so that they are grouped meaningfully in TensorBoard.
It demonstrates the functionality of every TensorBoard dashboard.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import os
import sys

import tensorflow as tf

from tensorflow.examples.tutorials.mnist import input_data

FLAGS = None


def train():
    # Import data
    mnist = input_data.read_data_sets(FLAGS.data_dir,
                                      fake_data=FLAGS.fake_data)

    sess = tf.compat.v1.InteractiveSession()
    # Create a multilayer model.

    # Input placeholders
    with tf.compat.v1.name_scope('input'):
        x = tf.compat.v1.placeholder(tf.float32, [None, 784], name='x-input')
        y_ = tf.compat.v1.placeholder(tf.int64, [None], name='y-input')

    with tf.compat.v1.name_scope('input_reshape'):
        image_shaped_input = tf.reshape(x, [-1, 28, 28, 1])
        tf.compat.v1.summary.image('input', image_shaped_input, 10)

    # We can't initialize these variables to 0 - the network will get stuck.
    def weight_variable(shape):
        """Create a weight variable with appropriate initialization."""
        initial = tf.random.truncated_normal(shape, stddev=0.1)
        return tf.Variable(initial)

    def bias_variable(shape):
        """Create a bias variable with appropriate initialization."""
        initial = tf.constant(0.1, shape=shape)
        return tf.Variable(initial)

    def variable_summaries(var):
        """Attach a lot of summaries to a Tensor (for TensorBoard visualization)."""
        with tf.compat.v1.name_scope('summaries'):
            mean = tf.reduce_mean(input_tensor=var)
            tf.compat.v1.summary.scalar('mean', mean)
            with tf.compat.v1.name_scope('stddev'):
                stddev = tf.sqrt(tf.reduce_mean(input_tensor=tf.square(var - mean)))
            tf.compat.v1.summary.scalar('stddev', stddev)
            tf.compat.v1.summary.scalar('max', tf.reduce_max(input_tensor=var))
            tf.compat.v1.summary.scalar('min', tf.reduce_min(input_tensor=var))
            tf.compat.v1.summary.histogram('histogram', var)

    def nn_layer(input_tensor, input_dim, output_dim, layer_name, act=tf.nn.relu):
        """Reusable code for making a simple neural net layer.
        It does a matrix multiply, bias add, and then uses ReLU to nonlinearize.
        It also sets up name scoping so that the resultant graph is easy to read,
        and adds a number of summary ops.
        """
        # Adding a name scope ensures logical grouping of the layers in the graph.
        with tf.compat.v1.name_scope(layer_name):
            # This Variable will hold the state of the weights for the layer
            with tf.compat.v1.name_scope('weights'):
                weights = weight_variable([input_dim, output_dim])
                variable_summaries(weights)
            with tf.compat.v1.name_scope('biases'):
                biases = bias_variable([output_dim])
                variable_summaries(biases)
            with tf.compat.v1.name_scope('Wx_plus_b'):
                preactivate = tf.matmul(input_tensor, weights) + biases
                tf.compat.v1.summary.histogram('pre_activations', preactivate)
            activations = act(preactivate, name='activation')
            tf.compat.v1.summary.histogram('activations', activations)
            return activations

    hidden1 = nn_layer(x, 784, 500, 'layer1')

    with tf.compat.v1.name_scope('dropout'):
        keep_prob = tf.compat.v1.placeholder(tf.float32)
        tf.compat.v1.summary.scalar('dropout_keep_probability', keep_prob)
        dropped = tf.nn.dropout(hidden1, rate=(1 - keep_prob))

    # Do not apply softmax activation yet, see below.
    y = nn_layer(dropped, 500, 10, 'layer2', act=tf.identity)

    with tf.compat.v1.name_scope('cross_entropy'):
        # The raw formulation of cross-entropy,
        #
        # tf.reduce_mean(-tf.reduce_sum(y_ * tf.math.log(tf.softmax(y)),
        #                               reduction_indices=[1]))
        #
        # can be numerically unstable.
        #
        # So here we use tf.compat.v1.losses.sparse_softmax_cross_entropy on the
        # raw logit outputs of the nn_layer above, and then average across
        # the batch.
        with tf.compat.v1.name_scope('total'):
            cross_entropy = tf.compat.v1.losses.sparse_softmax_cross_entropy(
                labels=y_, logits=y)
    tf.compat.v1.summary.scalar('cross_entropy', cross_entropy)

    with tf.compat.v1.name_scope('train'):
        train_step = tf.compat.v1.train.AdamOptimizer(FLAGS.learning_rate).minimize(
            cross_entropy)

    with tf.compat.v1.name_scope('accuracy'):
        with tf.compat.v1.name_scope('correct_prediction'):
            correct_prediction = tf.equal(tf.argmax(input=y, axis=1), y_)
        with tf.compat.v1.name_scope('accuracy'):
            accuracy = tf.reduce_mean(input_tensor=tf.cast(correct_prediction,
                                                           tf.float32))
    tf.compat.v1.summary.scalar('accuracy', accuracy)

    # Merge all the summaries and write them out to
    # /tmp/tensorflow/mnist/logs/mnist_with_summaries (by default)
    merged = tf.compat.v1.summary.merge_all()
    train_writer = tf.compat.v1.summary.FileWriter(FLAGS.log_dir + '/train',
                                                   sess.graph)
    test_writer = tf.compat.v1.summary.FileWriter(FLAGS.log_dir + '/test')
    tf.compat.v1.global_variables_initializer().run()

    # Train the model, and also write summaries.
    # Every 10th step, measure test-set accuracy, and write test summaries
    # All other steps, run train_step on training data, & add training summaries

    def feed_dict(train):
        """Make a TensorFlow feed_dict: maps data onto Tensor placeholders."""
        if train or FLAGS.fake_data:
            xs, ys = mnist.train.next_batch(100, fake_data=FLAGS.fake_data)
            k = FLAGS.dropout
        else:
            xs, ys = mnist.test.images, mnist.test.labels
            k = 1.0
        return {x: xs, y_: ys, keep_prob: k}

    for i in range(FLAGS.max_steps):
        if i % 10 == 0:  # Record summaries and test-set accuracy
            summary, acc = sess.run([merged, accuracy], feed_dict=feed_dict(False))
            test_writer.add_summary(summary, i)
            print('Accuracy at step %s: %s' % (i, acc))
        else:  # Record train set summaries, and train
            if i % 100 == 99:  # Record execution stats
                run_options = tf.compat.v1.RunOptions(
                    trace_level=tf.compat.v1.RunOptions.FULL_TRACE)
                run_metadata = tf.compat.v1.RunMetadata()
                summary, _ = sess.run([merged, train_step],
                                      feed_dict=feed_dict(True),
                                      options=run_options,
                                      run_metadata=run_metadata)
                train_writer.add_run_metadata(run_metadata, 'step%03d' % i)
                train_writer.add_summary(summary, i)
                print('Adding run metadata for', i)
            else:  # Record a summary
                summary, _ = sess.run([merged, train_step], feed_dict=feed_dict(True))
                train_writer.add_summary(summary, i)
    train_writer.close()
    test_writer.close()


def main(_):
    if tf.io.gfile.exists(FLAGS.log_dir):
        tf.io.gfile.rmtree(FLAGS.log_dir)
    tf.io.gfile.makedirs(FLAGS.log_dir)
    with tf.Graph().as_default():
        train()


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--fake_data', nargs='?', const=True, type=bool,
                        default=False,
                        help='If true, uses fake data for unit testing.')
    parser.add_argument('--max_steps', type=int, default=1000,
                        help='Number of steps to run trainer.')
    parser.add_argument('--learning_rate', type=float, default=0.001,
                        help='Initial learning rate')
    parser.add_argument('--dropout', type=float, default=0.9,
                        help='Keep probability for training dropout.')
    parser.add_argument(
        '--data_dir',
        type=str,
        default=os.path.join(os.getenv('TEST_TMPDIR', '/tmp'),
                             'tensorflow/mnist/input_data'),
        help='Directory for storing input data')
    parser.add_argument(
        '--log_dir',
        type=str,
        default=os.path.join(os.getenv('TEST_TMPDIR', '/tmp'),
                             'tensorflow/mnist/logs/mnist_with_summaries'),
        help='Summaries log directory')
    FLAGS, unparsed = parser.parse_known_args()
    tf.compat.v1.app.run(main=main, argv=[sys.argv[0]] + unparsed)
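
A note on the accuracy_1 metric name used by the Experiment: the script opens a name scope called 'accuracy' before calling tf.compat.v1.summary.scalar('accuracy', accuracy), so TensorFlow uniquifies the summary op's name and the scalar lands in the event files under the tag accuracy_1, which is what the objectiveMetricName matches. To eyeball the same summaries from a local run, the script's default log directory can be pointed at TensorBoard:

tensorboard --logdir /tmp/tensorflow/mnist/logs/mnist_with_summaries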