From 167b9b9889ac5481b21cb35c6cdef5869b8ab713 Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Fri, 13 Nov 2020 22:21:39 +0100 Subject: [PATCH] Simplifies check whether the CI image should be rebuilt (#12181) Rather than counting changed layers in the image (which was enigmatic, difficult and prone to some magic number) we rely now on a random file generated while building the image. We are using the docker image caching mechanism here. The random file will be regenerated only when the previous layer (which is about installing Airflow dependencies for the first time) gets rebuilt. And for us this is the indication that building the image will take quite some time. This layer should be relatively static - even if setup.py changes the CI image is designed in the way that the first time installation of Airflow dependencies is not invalidated. This should lead to faster and less frequent rebuilds for people using Breeze and static checks. --- Dockerfile.ci | 9 +- breeze | 6 +- manifests/.gitignore | 2 +- scripts/ci/libraries/_build_images.sh | 163 ++++++++++++------------ scripts/ci/libraries/_initialization.sh | 13 +- 5 files changed, 103 insertions(+), 90 deletions(-) diff --git a/Dockerfile.ci b/Dockerfile.ci index b61afd47dd77e..e69e36ef8d872 100644 --- a/Dockerfile.ci +++ b/Dockerfile.ci @@ -132,7 +132,9 @@ ARG RUNTIME_APT_DEPS="\ sqlite3 \ tmux \ unzip \ - vim" + vim \ + xxd" +ENV RUNTIME_APT_DEP=${RUNTIME_APT_DEPS} ARG ADDITIONAL_RUNTIME_APT_DEPS="" ENV ADDITIONAL_RUNTIME_APT_DEPS=${ADDITIONAL_RUNTIME_APT_DEPS} @@ -278,6 +280,11 @@ RUN if [[ ${AIRFLOW_PRE_CACHED_PIP_PACKAGES} == "true" ]]; then \ --constraint "${AIRFLOW_CONSTRAINTS_LOCATION}" && pip uninstall --yes apache-airflow; \ fi +# Generate random hex dump file so that we can determine whether it's faster to rebuild the image +# using current cache (when our dump is the same as the remote one) or better to pull +# the new image (when it is different) +RUN head -c 30 /dev/urandom | xxd -ps 
>/build-cache-hash + # Link dumb-init for backwards compatibility (so that older images also work) RUN ln -sf /usr/bin/dumb-init /usr/local/bin/dumb-init diff --git a/breeze b/breeze index 31e02b69c246a..723fa920d5aa3 100755 --- a/breeze +++ b/breeze @@ -3040,11 +3040,9 @@ function breeze::run_build_command() { build_images::prepare_prod_build build_images::build_prod_images else + build_images::prepare_ci_build - md5sum::calculate_md5sum_for_all_files - build_images::build_ci_image - md5sum::update_all_md5 - build_images::build_ci_image_manifest + build_images::rebuild_ci_image_if_needed fi ;; cleanup_image | run_exec) diff --git a/manifests/.gitignore b/manifests/.gitignore index a6c57f5fb2ffb..72e8ffc0db8aa 100644 --- a/manifests/.gitignore +++ b/manifests/.gitignore @@ -1 +1 @@ -*.json +* diff --git a/scripts/ci/libraries/_build_images.sh b/scripts/ci/libraries/_build_images.sh index aeda4722a633b..f89b1e3bb2260 100644 --- a/scripts/ci/libraries/_build_images.sh +++ b/scripts/ci/libraries/_build_images.sh @@ -136,10 +136,18 @@ function build_images::confirm_image_rebuild() { ;; esac elif [[ -t 0 ]]; then + echo + echo + echo "Make sure that you rebased to latest master before rebuilding!" + echo # Check if this script is run interactively with stdin open and terminal attached "${AIRFLOW_SOURCES}/confirm" "${ACTION} image ${THE_IMAGE_TYPE}-python${PYTHON_MAJOR_MINOR_VERSION}" RES=$? elif [[ ${DETECTED_TERMINAL:=$(tty)} != "not a tty" ]]; then + echo > "${DETECTED_TERMINAL}" + echo > "${DETECTED_TERMINAL}" + echo "Make sure that you rebased to latest master before rebuilding!" 
> "${DETECTED_TERMINAL}" + echo > "${DETECTED_TERMINAL}" # Make sure to use output of tty rather than stdin/stdout when available - this way confirm # will works also in case of pre-commits (git does not pass stdin/stdout to pre-commit hooks) # shellcheck disable=SC2094 @@ -151,6 +159,10 @@ function build_images::confirm_image_rebuild() { export DETECTED_TERMINAL=/dev/tty # Make sure to use /dev/tty first rather than stdin/stdout when available - this way confirm # will works also in case of pre-commits (git does not pass stdin/stdout to pre-commit hooks) + echo > "${DETECTED_TERMINAL}" + echo > "${DETECTED_TERMINAL}" + echo "Make sure that you rebased to latest master before rebuilding!" > "${DETECTED_TERMINAL}" + echo > "${DETECTED_TERMINAL}" # shellcheck disable=SC2094 "${AIRFLOW_SOURCES}/confirm" "${ACTION} image ${THE_IMAGE_TYPE}-python${PYTHON_MAJOR_MINOR_VERSION}" \ <"${DETECTED_TERMINAL}" >"${DETECTED_TERMINAL}" @@ -193,113 +205,95 @@ function build_images::confirm_image_rebuild() { # We cannot use docker registry APIs as they are available only with authorisation # But this image can be pulled without authentication function build_images::build_ci_image_manifest() { - docker inspect "${AIRFLOW_CI_IMAGE}" >"manifests/${AIRFLOW_CI_BASE_TAG}.json" docker build \ - --build-arg AIRFLOW_CI_BASE_TAG="${AIRFLOW_CI_BASE_TAG}" \ --tag="${AIRFLOW_CI_LOCAL_MANIFEST_IMAGE}" \ -f- . </dev/null >/dev/null - # Create manifest from the local manifest image - if ! docker create --name "local-airflow-manifest" "${AIRFLOW_CI_LOCAL_MANIFEST_IMAGE}" 2>/dev/null; then - echo - echo "Local manifest image not available" - echo + docker rm --force "local-airflow-ci-container" 2>/dev/null >/dev/null + if ! 
docker create --name "local-airflow-ci-container" "${AIRFLOW_CI_IMAGE}" 2>/dev/null; then + >&2 echo + >&2 echo "Local airflow CI image not available" + >&2 echo LOCAL_MANIFEST_IMAGE_UNAVAILABLE="true" + export LOCAL_MANIFEST_IMAGE_UNAVAILABLE + touch "${LOCAL_IMAGE_BUILD_CACHE_HASH_FILE}" return fi - # Create manifest from the local manifest image - docker cp "local-airflow-manifest:${AIRFLOW_CI_BASE_TAG}.json" "${TMP_MANIFEST_LOCAL_JSON}" - sed 's/ *//g' "${TMP_MANIFEST_LOCAL_JSON}" | grep '^"sha256:' >"${TMP_MANIFEST_LOCAL_SHA}" - docker rm --force "local-airflow-manifest" 2>/dev/null >/dev/null + docker cp "local-airflow-ci-container:/build-cache-hash" \ + "${LOCAL_IMAGE_BUILD_CACHE_HASH_FILE}" 2> /dev/null \ + || touch "${LOCAL_IMAGE_BUILD_CACHE_HASH_FILE}" set -e + echo + echo "Local build cache hash: '$(cat "${LOCAL_IMAGE_BUILD_CACHE_HASH_FILE}")'" + echo } -# -# Retrieves information about layers in the remote IMAGE -# it stores list of SHAs of image layers in the file pointed at by TMP_MANIFEST_REMOTE_SHA -# This cannot be done easily with existing APIs of Dockerhub because they require additional authentication -# even for public images. Therefore instead we are downloading a specially prepared manifest image -# which is built together with the main image. This special manifest image is prepared during -# building of the main image and contains single JSON file being result of docker inspect on that image -# This image is from scratch so it is very tiny -function build_images::get_remote_image_info() { +# Retrieves information about the build cache hash random file from the remote image. +# We actually use manifest image for that, which is a really, really small image to pull! +# The problem is that inspecting information about remote image cannot be done easily with existing APIs +# of Dockerhub because they require additional authentication even for public images. 
+Therefore instead we are downloading a specially prepared manifest image +which is built together with the main image and pushed with it. This special manifest image is prepared +during building of the main image and contains a single file which is randomly built during the docker +build in the right place in the image (right after installing all dependencies of Apache Airflow +for the first time). When this random file gets regenerated it means that either base image has +changed or some of the earlier layers were modified - which means that it is usually faster to pull +that image first and then rebuild it - because this will likely be faster +function build_images::get_remote_image_build_cache_hash() { set +e # Pull remote manifest image if ! docker pull "${AIRFLOW_CI_REMOTE_MANIFEST_IMAGE}" 2>/dev/null >/dev/null; then - echo >&2 - echo >&2 "Remote docker registry unreachable" - echo >&2 + >&2 echo + >&2 echo "Remote docker registry unreachable" + >&2 echo REMOTE_DOCKER_REGISTRY_UNREACHABLE="true" + export REMOTE_DOCKER_REGISTRY_UNREACHABLE + touch "${REMOTE_IMAGE_BUILD_CACHE_HASH_FILE}" return fi set -e - - # Docker needs the file passed to --cidfile to not exist, so we can't use mktemp - TMP_CONTAINER_DIR="$(mktemp -d)" - TMP_CONTAINER_ID="${TMP_CONTAINER_DIR}/remote-airflow-manifest-$$.container_id" - FILES_TO_CLEANUP_ON_EXIT+=("$TMP_CONTAINER_ID") - - TMP_MANIFEST_REMOTE_JSON=$(mktemp) - TMP_MANIFEST_REMOTE_SHA=$(mktemp) - # Create container out of the manifest image without running it - docker create --cidfile "${TMP_CONTAINER_ID}" "${AIRFLOW_CI_REMOTE_MANIFEST_IMAGE}" 2>/dev/null >/dev/null + # Create container dump out of the manifest image without actually running it + docker create --cidfile "${REMOTE_IMAGE_CONTAINER_ID_FILE}" "${AIRFLOW_CI_REMOTE_MANIFEST_IMAGE}" \ + 2>/dev/null >/dev/null || true # Extract manifest and store it in local file - docker cp "$(cat "${TMP_CONTAINER_ID}"):${AIRFLOW_CI_BASE_TAG}.json" 
"${TMP_MANIFEST_REMOTE_JSON}" - # Filter everything except SHAs of image layers - sed 's/ *//g' "${TMP_MANIFEST_REMOTE_JSON}" | grep '^"sha256:' >"${TMP_MANIFEST_REMOTE_SHA}" - docker rm --force "$(cat "${TMP_CONTAINER_ID}")" 2>/dev/null >/dev/null + docker cp "$(cat "${REMOTE_IMAGE_CONTAINER_ID_FILE}"):/build-cache-hash" \ + "${REMOTE_IMAGE_BUILD_CACHE_HASH_FILE}" 2> /dev/null \ + || touch "${REMOTE_IMAGE_BUILD_CACHE_HASH_FILE}" + docker rm --force "$(cat "${REMOTE_IMAGE_CONTAINER_ID_FILE}")" 2>/dev/null || true + echo + echo "Remote build cache hash: '$(cat "${REMOTE_IMAGE_BUILD_CACHE_HASH_FILE}")'" + echo } -# The Number determines the cut-off between local building time and pull + build time. -# It is a bit experimental and it will have to be kept -# updated as we keep on changing layers. The cut-off point is at the moment when we do first -# pip install "https://github.com/apache/airflow/archive/${AIRFLOW_BRANCH}.tar... -# you can get it via this command: -# docker history --no-trunc apache/airflow:master-python3.6-ci | \ -# grep ^sha256 | grep -n "pip uninstall" | awk 'BEGIN {FS=":"} {print $1 }' -# -# This command returns the number of layer in docker history where pip uninstall is called. This is the -# line that will take a lot of time to run and at this point it's worth to pull the image from repo -# if there are at least NN changed layers in your docker file, you should pull the image. -# -# Note that this only matters if you have any of the important files changed since the last build -# of your image such as Dockerfile.ci, setup.py etc. -# -MAGIC_CUT_OFF_NUMBER_OF_LAYERS=36 - # Compares layers from both remote and local image and set FORCE_PULL_IMAGES to true in case # More than the last NN layers are different. 
-function build_images::compare_layers() { - NUM_DIFF=$(diff -y --suppress-common-lines "${TMP_MANIFEST_REMOTE_SHA}" "${TMP_MANIFEST_LOCAL_SHA}" | - wc -l || true) - rm -f "${TMP_MANIFEST_REMOTE_JSON}" "${TMP_MANIFEST_REMOTE_SHA}" "${TMP_MANIFEST_LOCAL_JSON}" "${TMP_MANIFEST_LOCAL_SHA}" - echo - echo "Number of layers different between the local and remote image: ${NUM_DIFF}" - echo - # This is where setup py is rebuilt - it will usually take a looooot of time to build it, so it is - # Better to pull here - if ((NUM_DIFF >= MAGIC_CUT_OFF_NUMBER_OF_LAYERS)); then +function build_images::compare_local_and_remote_build_cache_hash() { + set +e + local remote_hash + remote_hash=$(cat "${REMOTE_IMAGE_BUILD_CACHE_HASH_FILE}") + local local_hash + local_hash=$(cat "${LOCAL_IMAGE_BUILD_CACHE_HASH_FILE}") + + if [[ ${remote_hash} != "${local_hash}" || + ${local_hash} == "" ]]; then echo echo - echo "WARNING! Your image and the dockerhub image differ significantly" + echo "Your image and the dockerhub have different or missing build cache hashes." + echo "Local hash: '${local_hash}'. Remote hash: '${remote_hash}'." echo echo "Forcing pulling the images. It will be faster than rebuilding usually." echo "You can avoid it by setting SKIP_CHECK_REMOTE_IMAGE to true" @@ -307,9 +301,10 @@ function build_images::compare_layers() { export FORCE_PULL_IMAGES="true" else echo - echo "No need to pull the image. Local rebuild will be faster" + echo "No need to pull the image. Yours and remote cache hashes are the same!" 
echo fi + set -e } # Prints summary of the build parameters @@ -335,7 +330,6 @@ function build_images::get_docker_image_names() { # CI image to build export AIRFLOW_CI_IMAGE="${DOCKERHUB_USER}/${DOCKERHUB_REPO}:${AIRFLOW_CI_BASE_TAG}" - # Base production image tag - used to build kubernetes tag as well if [[ ${FORCE_AIRFLOW_PROD_BASE_TAG=} == "" ]]; then export AIRFLOW_PROD_BASE_TAG="${BRANCH_NAME}-python${PYTHON_MAJOR_MINOR_VERSION}" @@ -396,6 +390,9 @@ function build_images::prepare_ci_build() { # In case rebuild is needed, it determines (by comparing layers in local and remote image) # Whether pull is needed before rebuild. function build_images::rebuild_ci_image_if_needed() { + verbosity::print_info + verbosity::print_info "Checking if pull or just build for ${THE_IMAGE_TYPE} is needed." + verbosity::print_info if [[ -f "${BUILT_CI_IMAGE_FLAG_FILE}" ]]; then verbosity::print_info verbosity::print_info "${THE_IMAGE_TYPE} image already built locally." @@ -418,6 +415,7 @@ function build_images::rebuild_ci_image_if_needed() { local needs_docker_build="false" md5sum::check_if_docker_build_is_needed + build_images::get_local_build_cache_hash if [[ ${needs_docker_build} == "true" ]]; then if [[ ${SKIP_CHECK_REMOTE_IMAGE:=} != "true" && ${DOCKER_CACHE} == "pulled" ]]; then # Check if remote image is different enough to force pull @@ -427,14 +425,12 @@ function build_images::rebuild_ci_image_if_needed() { echo echo "Checking if the remote image needs to be pulled" echo - build_images::get_remote_image_info - if [[ ${REMOTE_DOCKER_REGISTRY_UNREACHABLE:=} != "true" ]]; then - build_images::get_local_image_info - if [[ ${LOCAL_MANIFEST_IMAGE_UNAVAILABLE:=} != "true" ]]; then - build_images::compare_layers - else - FORCE_PULL_IMAGES="true" - fi + build_images::get_remote_image_build_cache_hash + if [[ ${REMOTE_DOCKER_REGISTRY_UNREACHABLE:=} != "true" && \ + ${LOCAL_MANIFEST_IMAGE_UNAVAILABLE:=} != "true" ]]; then + build_images::compare_local_and_remote_build_cache_hash + 
else + FORCE_PULL_IMAGES="true" fi fi SKIP_REBUILD="false" @@ -453,6 +449,7 @@ function build_images::rebuild_ci_image_if_needed() { verbosity::print_info "Build start: ${THE_IMAGE_TYPE} image." verbosity::print_info build_images::build_ci_image + build_images::get_local_build_cache_hash md5sum::update_all_md5 build_images::build_ci_image_manifest verbosity::print_info diff --git a/scripts/ci/libraries/_initialization.sh b/scripts/ci/libraries/_initialization.sh index 9bd8e34cc9bb6..f47a818832728 100644 --- a/scripts/ci/libraries/_initialization.sh +++ b/scripts/ci/libraries/_initialization.sh @@ -458,6 +458,12 @@ function initialization::initialize_test_variables() { export TEST_TYPE=${TEST_TYPE:=""} } +function initialization::initialize_build_image_variables() { + REMOTE_IMAGE_CONTAINER_ID_FILE="${AIRFLOW_SOURCES}/manifests/remote-airflow-manifest-image" + LOCAL_IMAGE_BUILD_CACHE_HASH_FILE="${AIRFLOW_SOURCES}/manifests/local-build-cache-hash" + REMOTE_IMAGE_BUILD_CACHE_HASH_FILE="${AIRFLOW_SOURCES}/manifests/remote-build-cache-hash" +} + # Common environment that is initialized by both Breeze and CI scripts function initialization::initialize_common_environment() { initialization::create_directories @@ -475,6 +481,7 @@ function initialization::initialize_common_environment() { initialization::initialize_git_variables initialization::initialize_github_variables initialization::initialize_test_variables + initialization::initialize_build_image_variables } function initialization::set_default_python_version_if_empty() { @@ -727,6 +734,10 @@ function initialization::make_constants_read_only() { readonly BUILT_CI_IMAGE_FLAG_FILE readonly INIT_SCRIPT_FILE + readonly REMOTE_IMAGE_CONTAINER_ID_FILE + readonly LOCAL_IMAGE_BUILD_CACHE_HASH_FILE + readonly REMOTE_IMAGE_BUILD_CACHE_HASH_FILE + } # converts parameters to json array @@ -749,6 +760,6 @@ function initialization::ga_output() { function initialization::ga_env() { if [[ ${GITHUB_ENV=} != "" ]]; then - echo 
"${1}=${2}" >> "${GITHUB_ENV}" + echo "${1}=${2}" >>"${GITHUB_ENV}" fi }