Skip to content

Commit

Permalink
modify script to build container image
Browse files Browse the repository at this point in the history
  • Loading branch information
tenzen-y committed Nov 12, 2021
1 parent 2392b88 commit 504d3c4
Show file tree
Hide file tree
Showing 13 changed files with 359 additions and 48 deletions.
7 changes: 4 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Output of `command -v golangci-lint`, captured once at parse time.
HAS_LINT := $(shell command -v golangci-lint;)
# "v1beta1-" followed by the 7-character short hash of the current HEAD.
COMMIT := v1beta1-$(shell git rev-parse --short=7 HEAD)
# Registry path string; presumably the default location of the Katib images.
KATIB_REGISTRY := docker.io/kubeflowkatib
# CPU architecture passed to scripts/v1beta1/build.sh by the build target.
# `?=` only assigns when the caller has not already provided a value.
CPU_ARCH ?= amd64

# Run tests
.PHONY: test
Expand Down Expand Up @@ -49,10 +50,10 @@ endif

# Build images for the Katib v1beta1 components.
# REGISTRY, TAG and CPU_ARCH are all required: $(and ...) expands to the empty
# string as soon as any one of them is empty, which selects the error branch.
# The $(error ...) line is tab-indented so that it belongs to the recipe and is
# only expanded when `build` is actually run, not whenever the file is parsed.
.PHONY: build
build: generate
ifeq ($(and $(REGISTRY),$(TAG),$(CPU_ARCH)),)
	$(error REGISTRY, TAG and CPU_ARCH must be set. Usage: make build REGISTRY=<registry> TAG=<tag> CPU_ARCH=<cpu-architecture>)
endif
	bash scripts/v1beta1/build.sh $(REGISTRY) $(TAG) $(CPU_ARCH)

# Build and push Katib images from the latest master commit.
push-latest: generate
Expand Down
3 changes: 3 additions & 0 deletions cmd/suggestion/chocolate/v1beta1/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ RUN if [ "$(uname -m)" = "ppc64le" ]; then \
ADD ./pkg/ ${TARGET_DIR}/pkg/
ADD ./${SUGGESTION_DIR}/ ${TARGET_DIR}/${SUGGESTION_DIR}/
WORKDIR ${TARGET_DIR}/${SUGGESTION_DIR}
RUN if [ "$(uname -m)" = "aarch64" ]; then \
sed -i -e '$a git+https://github.com/fmder/ghalton@master' -e '/^ghalton/d' requirements.txt; \
fi;
RUN pip install --no-cache-dir -r requirements.txt

RUN chgrp -R 0 ${TARGET_DIR} \
Expand Down
2 changes: 1 addition & 1 deletion docs/developer-guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ see the following user guides:
## Requirements

- [Go](https://golang.org/) (1.17 or later)
- [Docker](https://docs.docker.com/) (17.05 or later)
- [Docker](https://docs.docker.com/) (20.10 or later)
- [Java](https://docs.oracle.com/javase/8/docs/technotes/guides/install/install_overview.html) (8 or later)
- [Python](https://www.python.org/) (3.9 or later)
- [kustomize](https://kustomize.io/) (4.0.5 or later)
Expand Down
49 changes: 49 additions & 0 deletions examples/v1beta1/metrics-collector/tfevent-metrics-collector.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
apiVersion: kubeflow.org/v1beta1
kind: Experiment
metadata:
  namespace: kubeflow
  name: tfevent-metrics-collector
spec:
  parallelTrialCount: 3
  maxTrialCount: 12
  maxFailedTrialCount: 3
  objective:
    type: maximize
    goal: 0.99
    objectiveMetricName: accuracy_1  # NOTE(review): must match the scalar tag the trial writes - confirm
  algorithm:
    algorithmName: random
  metricsCollectorSpec:
    source:
      fileSystemPath:
        path: /train  # parent of the --log_dir passed to the trial below
        kind: Directory
    collector:
      kind: TensorFlowEvent
  parameters:
    - name: learning_rate
      parameterType: double
      feasibleSpace:
        min: "0.01"
        max: "0.05"
  trialTemplate:
    primaryContainerName: training-container
    trialParameters:
      - name: learningRate
        description: Learning rate for the training model
        reference: learning_rate
    trialSpec:
      apiVersion: batch/v1
      kind: Job
      spec:
        template:
          spec:
            containers:
              - name: training-container
                image: docker.io/kubeflowkatib/tensorflow-mnist:latest
                command:
                  - "python3"
                  - "/opt/tensorflow-mnist/mnist_with_summaries.py"
                  - "--log_dir=/train/metrics"
                  - "--learning_rate=${trialParameters.learningRate}"
            restartPolicy: Never
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
FROM tensorflow/tensorflow:1.15.4-py3
FROM tensorflow/tensorflow:2.7.0

ENV TARGET_DIR /opt/enas-cnn-cifar10

ADD examples/v1beta1/trial-images/enas-cnn-cifar10 ${TARGET_DIR}
WORKDIR ${TARGET_DIR}

RUN pip3 install --upgrade pip
RUN pip3 install --upgrade -r requirements.txt
ENV PYTHONPATH ${TARGET_DIR}

RUN chgrp -R 0 ${TARGET_DIR} \
Expand Down
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
FROM tensorflow/tensorflow:1.15.4-gpu-py3
FROM tensorflow/tensorflow:2.7.0-gpu

ENV TARGET_DIR /opt/enas-cnn-cifar10

ADD examples/v1beta1/trial-images/enas-cnn-cifar10 ${TARGET_DIR}
WORKDIR ${TARGET_DIR}

RUN pip3 install --upgrade pip
RUN pip3 install --upgrade -r requirements.txt
ENV PYTHONPATH ${TARGET_DIR}

RUN chgrp -R 0 ${TARGET_DIR} \
Expand Down
4 changes: 2 additions & 2 deletions examples/v1beta1/trial-images/enas-cnn-cifar10/RunTrial.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
import numpy as np
from keras.datasets import cifar10
from ModelConstructor import ModelConstructor
from keras.utils import to_categorical
from keras.utils import multi_gpu_model
from tensorflow.keras.utils import to_categorical
from tensorflow.python.keras.utils.multi_gpu_utils import multi_gpu_model
from keras.preprocessing.image import ImageDataGenerator
import argparse
import time
Expand Down

This file was deleted.

14 changes: 14 additions & 0 deletions examples/v1beta1/trial-images/tensorflow-mnist/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
FROM tensorflow/tensorflow:2.7.0

# Ship the example sources and run from their directory.
ADD examples/v1beta1/trial-images/tensorflow-mnist /opt/tensorflow-mnist
WORKDIR /opt/tensorflow-mnist

# Create the folder for the logs, then give group 0 ownership and group
# read/write (plus execute where applicable) on both the sources and the logs.
RUN mkdir /katib \
    && chgrp -R 0 /opt/tensorflow-mnist /katib \
    && chmod -R g+rwX /opt/tensorflow-mnist /katib

ENTRYPOINT ["python3", "/opt/tensorflow-mnist/mnist_with_summaries.py"]
11 changes: 11 additions & 0 deletions examples/v1beta1/trial-images/tensorflow-mnist/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Tensorflow MNIST Classification With Summaries Example

This is a TensorFlow MNIST image classification training container that outputs TF summaries.
It trains a small fully connected neural network with one hidden layer.

If you want to read more about this example, visit the official
[tensorflow](https://github.com/tensorflow/tensorflow/blob/7462dcaae1e8cfe1dfd0c62dd6083f9749a9d827/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py)
GitHub repository.

Katib uses this training container in some Experiments, for instance in the
[TF Event Metrics Collector](../../metrics-collector/tfevent-metrics-collector.yaml#L42-L48).
217 changes: 217 additions & 0 deletions examples/v1beta1/trial-images/tensorflow-mnist/mnist_with_summaries.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,217 @@
# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an 'AS IS' BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A simple MNIST classifier which displays summaries in TensorBoard.
This is an unimpressive MNIST model, but it is a good example of using
tf.name_scope to make a graph legible in the TensorBoard graph explorer, and of
naming summary tags so that they are grouped meaningfully in TensorBoard.
It demonstrates the functionality of every TensorBoard dashboard.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import os
import sys

import tensorflow as tf

from tensorflow.examples.tutorials.mnist import input_data

FLAGS = None


def train():
    """Build the MNIST graph, train it, and write TensorBoard summaries.

    Configuration is read from the module-level ``FLAGS``: ``data_dir``,
    ``fake_data``, ``learning_rate``, ``dropout``, ``max_steps`` and
    ``log_dir``. Training summaries are written under ``<log_dir>/train`` and
    test summaries under ``<log_dir>/test``.

    Every 10th step evaluates accuracy on the test set; every other step runs
    one optimizer step on a batch of 100 training examples. Once the summary
    writers exist, they and the session are closed when training finishes,
    whether it finishes normally or with an exception.
    """
    # Import data
    # NOTE(review): `input_data` is imported from tensorflow.examples.tutorials;
    # confirm that package is importable in the TensorFlow version this runs on.
    mnist = input_data.read_data_sets(FLAGS.data_dir,
                                      fake_data=FLAGS.fake_data)

    sess = tf.compat.v1.InteractiveSession()
    # Create a multilayer model.

    # Input placeholders
    with tf.compat.v1.name_scope('input'):
        x = tf.compat.v1.placeholder(tf.float32, [None, 784], name='x-input')
        y_ = tf.compat.v1.placeholder(tf.int64, [None], name='y-input')

    with tf.compat.v1.name_scope('input_reshape'):
        image_shaped_input = tf.reshape(x, [-1, 28, 28, 1])
        tf.compat.v1.summary.image('input', image_shaped_input, 10)

    # We can't initialize these variables to 0 - the network will get stuck.
    def weight_variable(shape):
        """Create a weight variable with appropriate initialization."""
        initial = tf.random.truncated_normal(shape, stddev=0.1)
        return tf.Variable(initial)

    def bias_variable(shape):
        """Create a bias variable with appropriate initialization."""
        initial = tf.constant(0.1, shape=shape)
        return tf.Variable(initial)

    def variable_summaries(var):
        """Attach a lot of summaries to a Tensor (for TensorBoard visualization)."""
        with tf.compat.v1.name_scope('summaries'):
            mean = tf.reduce_mean(input_tensor=var)
            tf.compat.v1.summary.scalar('mean', mean)
            with tf.compat.v1.name_scope('stddev'):
                stddev = tf.sqrt(
                    tf.reduce_mean(input_tensor=tf.square(var - mean)))
            tf.compat.v1.summary.scalar('stddev', stddev)
            tf.compat.v1.summary.scalar('max', tf.reduce_max(input_tensor=var))
            tf.compat.v1.summary.scalar('min', tf.reduce_min(input_tensor=var))
            tf.compat.v1.summary.histogram('histogram', var)

    def nn_layer(input_tensor, input_dim, output_dim, layer_name,
                 act=tf.nn.relu):
        """Reusable code for making a simple neural net layer.

        It does a matrix multiply, bias add, and then applies ``act`` (ReLU by
        default) to nonlinearize. It also sets up name scoping so that the
        resultant graph is easy to read, and adds a number of summary ops.
        """
        # Adding a name scope ensures logical grouping of the layers in the graph.
        with tf.compat.v1.name_scope(layer_name):
            # This Variable will hold the state of the weights for the layer
            with tf.compat.v1.name_scope('weights'):
                weights = weight_variable([input_dim, output_dim])
                variable_summaries(weights)
            with tf.compat.v1.name_scope('biases'):
                biases = bias_variable([output_dim])
                variable_summaries(biases)
            with tf.compat.v1.name_scope('Wx_plus_b'):
                preactivate = tf.matmul(input_tensor, weights) + biases
                tf.compat.v1.summary.histogram('pre_activations', preactivate)
            activations = act(preactivate, name='activation')
            tf.compat.v1.summary.histogram('activations', activations)
            return activations

    hidden1 = nn_layer(x, 784, 500, 'layer1')

    with tf.compat.v1.name_scope('dropout'):
        keep_prob = tf.compat.v1.placeholder(tf.float32)
        tf.compat.v1.summary.scalar('dropout_keep_probability', keep_prob)
        # tf.nn.dropout takes a drop rate, so convert from keep probability.
        dropped = tf.nn.dropout(hidden1, rate=(1 - keep_prob))

    # Do not apply softmax activation yet, see below.
    y = nn_layer(dropped, 500, 10, 'layer2', act=tf.identity)

    with tf.compat.v1.name_scope('cross_entropy'):
        # The raw formulation of cross-entropy,
        #
        # tf.reduce_mean(-tf.reduce_sum(y_ * tf.math.log(tf.softmax(y)),
        #                               reduction_indices=[1]))
        #
        # can be numerically unstable.
        #
        # So here we use tf.compat.v1.losses.sparse_softmax_cross_entropy on the
        # raw logit outputs of the nn_layer above, and then average across
        # the batch.
        with tf.compat.v1.name_scope('total'):
            cross_entropy = tf.compat.v1.losses.sparse_softmax_cross_entropy(
                labels=y_, logits=y)
    tf.compat.v1.summary.scalar('cross_entropy', cross_entropy)

    with tf.compat.v1.name_scope('train'):
        train_step = tf.compat.v1.train.AdamOptimizer(
            FLAGS.learning_rate).minimize(cross_entropy)

    with tf.compat.v1.name_scope('accuracy'):
        with tf.compat.v1.name_scope('correct_prediction'):
            correct_prediction = tf.equal(tf.argmax(input=y, axis=1), y_)
        with tf.compat.v1.name_scope('accuracy'):
            accuracy = tf.reduce_mean(
                input_tensor=tf.cast(correct_prediction, tf.float32))
    # Emitted outside the 'accuracy' name scope.
    # NOTE(review): the tag consumers see for this scalar presumably depends on
    # this placement - confirm before moving it into the scope above.
    tf.compat.v1.summary.scalar('accuracy', accuracy)

    # Merge all the summaries and write them out to
    # /tmp/tensorflow/mnist/logs/mnist_with_summaries (by default)
    merged = tf.compat.v1.summary.merge_all()

    # Train the model, and also write summaries.
    # Every 10th step, measure test-set accuracy, and write test summaries
    # All other steps, run train_step on training data, & add training summaries

    def feed_dict(is_train):
        """Make a TensorFlow feed_dict: maps data onto Tensor placeholders."""
        if is_train or FLAGS.fake_data:
            xs, ys = mnist.train.next_batch(100, fake_data=FLAGS.fake_data)
            k = FLAGS.dropout
        else:
            # Evaluate on the whole test set with dropout disabled.
            xs, ys = mnist.test.images, mnist.test.labels
            k = 1.0
        return {x: xs, y_: ys, keep_prob: k}

    train_writer = tf.compat.v1.summary.FileWriter(FLAGS.log_dir + '/train',
                                                   sess.graph)
    test_writer = tf.compat.v1.summary.FileWriter(FLAGS.log_dir + '/test')
    try:
        tf.compat.v1.global_variables_initializer().run()

        for i in range(FLAGS.max_steps):
            if i % 10 == 0:  # Record summaries and test-set accuracy
                summary, acc = sess.run([merged, accuracy],
                                        feed_dict=feed_dict(False))
                test_writer.add_summary(summary, i)
                print('Accuracy at step %s: %s' % (i, acc))
            else:  # Record train set summaries, and train
                if i % 100 == 99:  # Record execution stats
                    run_options = tf.compat.v1.RunOptions(
                        trace_level=tf.compat.v1.RunOptions.FULL_TRACE)
                    run_metadata = tf.compat.v1.RunMetadata()
                    summary, _ = sess.run([merged, train_step],
                                          feed_dict=feed_dict(True),
                                          options=run_options,
                                          run_metadata=run_metadata)
                    train_writer.add_run_metadata(run_metadata,
                                                  'step%03d' % i)
                    train_writer.add_summary(summary, i)
                    print('Adding run metadata for', i)
                else:  # Record a summary
                    summary, _ = sess.run([merged, train_step],
                                          feed_dict=feed_dict(True))
                    train_writer.add_summary(summary, i)
    finally:
        # Release the writers and the session even when a step fails, so that
        # buffered summaries are not left unwritten on an error path.
        train_writer.close()
        test_writer.close()
        sess.close()


def main(_):
    """Reset the summaries directory, then run training in a fresh graph.

    The positional argument supplied by the app runner is ignored.
    """
    log_dir = FLAGS.log_dir
    # Start from an empty log directory so earlier runs do not mix in.
    if tf.io.gfile.exists(log_dir):
        tf.io.gfile.rmtree(log_dir)
    tf.io.gfile.makedirs(log_dir)

    graph = tf.Graph()
    with graph.as_default():
        train()


def str_to_bool(value):
    """Parse a command-line boolean written as text.

    ``type=bool`` in argparse calls ``bool()`` on the raw string, so every
    non-empty value, including "False", comes out as True. This parser maps
    the usual spellings explicitly instead.

    Args:
        value: The raw string taken from the command line.

    Returns:
        True or False according to the spelling; the empty string is False.

    Raises:
        argparse.ArgumentTypeError: If the text is not a recognized spelling.
    """
    text = value.strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    if text in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError(
        'expected a boolean value, got %r' % value)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # A bare `--fake_data` still means True (const); an explicit value is
    # parsed by str_to_bool so that `--fake_data False` really disables it.
    parser.add_argument('--fake_data', nargs='?', const=True, type=str_to_bool,
                        default=False,
                        help='If true, uses fake data for unit testing.')
    parser.add_argument('--max_steps', type=int, default=1000,
                        help='Number of steps to run trainer.')
    parser.add_argument('--learning_rate', type=float, default=0.001,
                        help='Initial learning rate')
    parser.add_argument('--dropout', type=float, default=0.9,
                        help='Keep probability for training dropout.')
    parser.add_argument(
        '--data_dir',
        type=str,
        default=os.path.join(os.getenv('TEST_TMPDIR', '/tmp'),
                             'tensorflow/mnist/input_data'),
        help='Directory for storing input data')
    parser.add_argument(
        '--log_dir',
        type=str,
        default=os.path.join(os.getenv('TEST_TMPDIR', '/tmp'),
                             'tensorflow/mnist/logs/mnist_with_summaries'),
        help='Summaries log directory')
    # Flags this parser does not know are forwarded to the app runner.
    FLAGS, unparsed = parser.parse_known_args()
    tf.compat.v1.app.run(main=main, argv=[sys.argv[0]] + unparsed)
Loading

0 comments on commit 504d3c4

Please sign in to comment.