From 1c5570ca5b0a864cb3ec99be19616d88c61b588d Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Tue, 6 Jun 2023 11:47:48 -0700 Subject: [PATCH 1/2] reduce container image size --- CHANGES.md | 1 + .../runners/portability/sdk_container_builder.py | 8 +++++++- sdks/python/container/piputil.go | 10 +++++----- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 6ab0852f2b21..cd930f2aad55 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -62,6 +62,7 @@ ## New Features / Improvements +* Reduce the prebuilt Python container image size by not caching dependencies ([#27035](https://github.com/apache/beam/pull/27035)) * X feature added (Java/Python) ([#X](https://github.com/apache/beam/issues/X)). ## Breaking Changes diff --git a/sdks/python/apache_beam/runners/portability/sdk_container_builder.py b/sdks/python/apache_beam/runners/portability/sdk_container_builder.py index f81e015ea591..19becd3e123f 100644 --- a/sdks/python/apache_beam/runners/portability/sdk_container_builder.py +++ b/sdks/python/apache_beam/runners/portability/sdk_container_builder.py @@ -252,7 +252,13 @@ def _invoke_docker_build_and_push(self, container_image_name): build.steps = [] step = cloudbuild.BuildStep() step.name = 'gcr.io/kaniko-project/executor:latest' - step.args = ['--destination=' + container_image_name, '--cache=true'] + # Disable compression caching to allow for large images to be cached. + # See: https://github.com/GoogleContainerTools/kaniko/issues/1669 + step.args = [ + '--destination=' + container_image_name, + '--cache=true', + '--compressed-caching=false', + ] step.dir = SOURCE_FOLDER build.steps.append(step) diff --git a/sdks/python/container/piputil.go b/sdks/python/container/piputil.go index 03ac8325d6d0..a00e017445e3 100644 --- a/sdks/python/container/piputil.go +++ b/sdks/python/container/piputil.go @@ -37,14 +37,14 @@ func pipInstallRequirements(files []string, dir, name string) error { // as possible PyPI downloads. In the first round the --find-links // option will make sure that only things staged in the worker will be // used without following their dependencies. - args := []string{"-m", "pip", "install", "-r", filepath.Join(dir, name), "--disable-pip-version-check", "--no-index", "--no-deps", "--find-links", dir} + args := []string{"-m", "pip", "install", "-r", filepath.Join(dir, name), "--no-cache-dir", "--disable-pip-version-check", "--no-index", "--no-deps", "--find-links", dir} if err := execx.Execute("python", args...); err != nil { fmt.Println("Some packages could not be installed solely from the requirements cache. Installing packages from PyPI.") } // The second install round opens up the search for packages on PyPI and // also installs dependencies. The key is that if all the packages have // been installed in the first round then this command will be a no-op. - args = []string{"-m", "pip", "install", "-r", filepath.Join(dir, name), "--disable-pip-version-check", "--find-links", dir} + args = []string{"-m", "pip", "install", "-r", filepath.Join(dir, name), "--no-cache-dir", "--disable-pip-version-check", "--find-links", dir} return execx.Execute("python", args...) } } @@ -76,18 +76,18 @@ func pipInstallPackage(files []string, dir, name string, force, optional bool, e // installed version will match the package specified, the package itself // will not be reinstalled, but its dependencies will now be resolved and // installed if necessary. This achieves our goal outlined above. - args := []string{"-m", "pip", "install", "--disable-pip-version-check", "--upgrade", "--force-reinstall", "--no-deps", + args := []string{"-m", "pip", "install", "--no-cache-dir", "--disable-pip-version-check", "--upgrade", "--force-reinstall", "--no-deps", filepath.Join(dir, packageSpec)} err := execx.Execute("python", args...) if err != nil { return err } - args = []string{"-m", "pip", "install", "--disable-pip-version-check", filepath.Join(dir, packageSpec)} + args = []string{"-m", "pip", "install", "--no-cache-dir", "--disable-pip-version-check", filepath.Join(dir, packageSpec)} return execx.Execute("python", args...) } // Case when we do not perform a forced reinstall. - args := []string{"-m", "pip", "install", "--disable-pip-version-check", filepath.Join(dir, packageSpec)} + args := []string{"-m", "pip", "install", "--no-cache-dir", "--disable-pip-version-check", filepath.Join(dir, packageSpec)} return execx.Execute("python", args...) } } From f35a4aeb951e4aeb768a3f851a8e7ff6491e1c3d Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Tue, 6 Jun 2023 11:48:33 -0700 Subject: [PATCH 2/2] revert file --- .../runners/portability/sdk_container_builder.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/sdks/python/apache_beam/runners/portability/sdk_container_builder.py b/sdks/python/apache_beam/runners/portability/sdk_container_builder.py index 19becd3e123f..f81e015ea591 100644 --- a/sdks/python/apache_beam/runners/portability/sdk_container_builder.py +++ b/sdks/python/apache_beam/runners/portability/sdk_container_builder.py @@ -252,13 +252,7 @@ def _invoke_docker_build_and_push(self, container_image_name): build.steps = [] step = cloudbuild.BuildStep() step.name = 'gcr.io/kaniko-project/executor:latest' - # Disable compression caching to allow for large images to be cached. - # See: https://github.com/GoogleContainerTools/kaniko/issues/1669 - step.args = [ - '--destination=' + container_image_name, - '--cache=true', - '--compressed-caching=false', - ] + step.args = ['--destination=' + container_image_name, '--cache=true'] step.dir = SOURCE_FOLDER build.steps.append(step)