Fix our continuous release process (#271)
* Fix our continuous release process, which is broken (#270).
   * Our releaser has been crash looping and has not released any new images since 12/23.

* The Docker image used to build releases needs Yarn and Node.js to build the frontend.

* We refactored release.py in #189 and split cloning and building the code into two steps, so we had to rework how we build from the latest green commit to account for those changes.
   * release.py no longer has a function to periodically check the latest postsubmit and rebuild if necessary.
   * Created a simple shell script, launch_build.sh, to check out the latest green commit and then build the code; the build function is invoked from the checked-out code (a rough sketch of the rebuild check follows).
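For reference, a minimal sketch of the rebuild check described above, using hypothetical stand-in names rather than the real helpers in py/release.py (the actual implementation is in the diff below):

import logging

def build_if_newer(src_dir, last_release_sha, get_git_hash, build):
    """Build a release only if the checked-out commit differs from the last release.

    get_git_hash and build are hypothetical stand-ins for the helpers
    release.py uses (e.g. build_and_push_image.GetGitHash and its build step).
    """
    sha = get_git_hash(src_dir)  # commit checked out by launch_build.sh
    if sha == last_release_sha:
        logging.info("Already cut release for %s; nothing to do.", sha)
        return False
    build(src_dir)  # build and push artifacts for the new commit
    return True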
jlewi authored Jan 8, 2018
1 parent da1a861 commit 9c93d5f
Showing 6 changed files with 90 additions and 25 deletions.
40 changes: 22 additions & 18 deletions py/release.py
@@ -442,11 +442,13 @@ def clone_lastgreen(args):
util.clone_repo(args.src_dir, util.MASTER_REPO_OWNER, util.MASTER_REPO_NAME,
sha)

# TODO(jlewi): Delete this function once
# https://github.com/tensorflow/k8s/issues/189 is fixed.
def build_lastgreen(args): # pylint: disable=too-many-locals
"""Find the latest green postsubmit and build the artifacts.
def build_new_release(args): # pylint: disable=too-many-locals
"""Find the latest release and build the artifacts if they are newer then
the current release.
"""
if not args.src_dir:
raise ValueError("src_dir must be provided when building last green.")

gcs_client = storage.Client()
sha = get_latest_green_presubmit(gcs_client)

@@ -458,18 +460,13 @@ def build_lastgreen(args): # pylint: disable=too-many-locals
last_release_sha = get_last_release(bucket)
logging.info("Most recent release was for %s", last_release_sha)

sha = build_and_push_image.GetGitHash(args.src_dir)

if sha == last_release_sha:
logging.info("Already cut release for %s", sha)
return

go_dir = tempfile.mkdtemp(prefix="tmpTfJobSrc")
logging.info("Temporary go_dir: %s", go_dir)

src_dir = os.path.join(go_dir, "src", "github.com", REPO_ORG, REPO_NAME)

_, sha = util.clone_repo(src_dir, util.MASTER_REPO_OWNER,
util.MASTER_REPO_NAME, sha)
build_and_push(go_dir, src_dir, args)
build(args)

def add_common_args(parser):
"""Add a set of common parser arguments."""
@@ -622,13 +619,20 @@ def build_parser():
help="(Optional) Directory to checkout the source to.")

############################################################################
# Last Green
parser_lastgreen = subparsers.add_parser(
"lastgreen",
help=("Build the artifacts from the latst green postsubmit. "
"Will not rebuild the artifacts if they have already been built."))
# Build new release
build_new = subparsers.add_parser(
"build_new_release",
help=("Build a new release. Only builds it if its newer than current "
"release."))

build_new.add_argument(
"--src_dir",
default=None,
type=str,
help=("Directory containing the source. "))

add_common_args(parser_lastgreen)
add_common_args(build_new)
build_new.set_defaults(func=build_new_release)

############################################################################
# Pull Request
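As background, the new build_new_release subcommand hooks into argparse's set_defaults(func=...) dispatch. A minimal, self-contained sketch of that pattern, assuming release.py's entry point calls args.func(args) as is conventional (the entry point is not shown in this diff):

import argparse

def build_new_release(args):
    # Stand-in for py/release.py's build_new_release.
    print("would build a release from", args.src_dir)

parser = argparse.ArgumentParser()
subparsers = parser.add_subparsers(dest="command")
build_new = subparsers.add_parser("build_new_release")
build_new.add_argument("--src_dir", default=None, type=str)
build_new.set_defaults(func=build_new_release)

args = parser.parse_args(["build_new_release", "--src_dir=/tmp/src"])
args.func(args)  # dispatches to build_new_release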
6 changes: 4 additions & 2 deletions py/util.py
@@ -428,13 +428,15 @@ def setup_cluster(api_client):
class TimeoutError(Exception):
"""An error indicating an operation timed out."""

GCS_REGEX = re.compile("gs://([^/]*)/(.*)")
GCS_REGEX = re.compile("gs://([^/]*)(/.*)?")

def split_gcs_uri(gcs_uri):
"""Split a GCS URI into bucket and path."""
m = GCS_REGEX.match(gcs_uri)
bucket = m.group(1)
path = m.group(2)
path = ""
if m.group(2):
path = m.group(2).lstrip("/")
return bucket, path

def _refresh_credentials():
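The regex change matters for URIs that have no object path: the old pattern required a "/" after the bucket name, so split_gcs_uri failed on bare bucket URIs like gs://some-bucket. A standalone sketch of the difference (not the code in py/util.py itself):

import re

OLD_GCS_REGEX = re.compile("gs://([^/]*)/(.*)")
NEW_GCS_REGEX = re.compile("gs://([^/]*)(/.*)?")

print(OLD_GCS_REGEX.match("gs://some-bucket"))  # None; the old split_gcs_uri would crash here
m = NEW_GCS_REGEX.match("gs://some-bucket")
print(m.group(1), m.group(2))                   # some-bucket None -> path becomes ""
m = NEW_GCS_REGEX.match("gs://some-bucket/some/path")
print(m.group(1), m.group(2).lstrip("/"))       # some-bucket some/path

The new unit tests in py/util_test.py below exercise both cases.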
9 changes: 9 additions & 0 deletions py/util_test.py
@@ -27,5 +27,14 @@ def test_wait_for_statefulset(self):
result = util.wait_for_statefulset(api_client, "some-namespace", "some-set")
self.assertIsNotNone(result)

def testSplitGcsUri(self):
bucket, path = util.split_gcs_uri("gs://some-bucket/some/path")
self.assertEquals("some-bucket", bucket)
self.assertEquals("some/path", path)

bucket, path = util.split_gcs_uri("gs://some-bucket")
self.assertEquals("some-bucket", bucket)
self.assertEquals("", path)

if __name__ == "__main__":
unittest.main()
26 changes: 25 additions & 1 deletion release/Dockerfile.release
@@ -12,6 +12,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# TODO(jlewi): We should consider reusing the same DockerFiles as used to create
# the containers used by our test infrastructure. We should probably wait though
# until we get rid of Airflow in our tests.
# However, we don't actually want to use the same Docker images because there
# are broader permissions on our images used in testing and we want to have much
# tighter controls on the code used to build our releases.

# This Dockerfile is used to create a docker image suitable for building
# and releasing the TfJob operator.
FROM golang:1.8.2
@@ -44,8 +51,12 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
&& apt-get clean

RUN easy_install pip
# TODO(jlewi): Use pipfile to pull in requirements as specified in the build. See the developer guide
# for instructions on using pipenv. When I tried adding the instructions I got errors trying to
# activate the virtual env by running pipenv shell.
RUN pip install --upgrade six pyyaml google-api-python-client \
google-cloud-storage pylint
google-cloud-storage pylint jinja2 \
google-auth-httplib2 kubernetes==4.0.0 mock retrying

# Install gcloud

@@ -87,7 +98,20 @@ RUN wget -O /tmp/get_helm.sh \
# Initialize helm
RUN helm init --client-only

# Install Node.js
RUN curl -sL https://deb.nodesource.com/setup_8.x | bash - \
&& apt-get install -y nodejs

# Install yarn
RUN curl -sS http://dl.yarnpkg.com/debian/pubkey.gpg | apt-key add - \
&& echo "deb http://dl.yarnpkg.com/debian/ stable main" | tee /etc/apt/sources.list.d/yarn.list \
&& apt-get update -yqq \
&& apt-get install -yqq --no-install-recommends yarn

RUN mkdir -p /opt/tf_k8s_releaser/py
COPY py /opt/tf_k8s_releaser/py

COPY launch_build.sh /opt/tf_k8s_releaser/
RUN chmod a+x /opt/tf_k8s_releaser/launch_build.sh

ADD ["version.json", "/opt/tf_k8s_releaser/py"]
29 changes: 29 additions & 0 deletions release/launch_build.sh
@@ -0,0 +1,29 @@
#!/bin/bash
set -ex

while :; do
echo starting build cycle
SRC_DIR=`mktemp -d /tmp/tfk8s.src.tmp.XXXXXX`

# We should be in the working directory where the copy of py is baked into
# the container
cd /opt/tf_k8s_releaser/
python -m py.release clone --src_dir=${SRC_DIR} lastgreen

GOPATH=${SRC_DIR}/go
mkdir -p ${GOPATH}

# Change to the directory we just cloned so that we run py.release from
# the code we just checked out.
# TODO(jlewi): Uncomment before submitting
cd ${SRC_DIR}
python -m py.release build_new_release \
--src_dir=${SRC_DIR} \
--registry=gcr.io/tf-on-k8s-dogfood \
--project=tf-on-k8s-releasing \
--releases_path=gs://tf-on-k8s-dogfood-releases

rm -rf ${SRC_DIR}

sleep 300
done
5 changes: 1 addition & 4 deletions release/releaser.yaml
@@ -19,7 +19,4 @@ spec:
image: gcr.io/tf-on-k8s-releasing/releaser:latest
workingDir: /opt/tf_k8s_releaser
command:
- python
- -m
- py.release
- --check_interval_secs=3600
- /opt/tf_k8s_releaser/launch_build.sh
