From 6cf2725006cfaebff7c6dc77a7cce9f8b5322ecd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jiri=20Dan=C4=9Bk?= Date: Thu, 13 Jun 2024 09:57:39 +0200 Subject: [PATCH] Limit PR checks to build only the modified images --- .github/workflows/build-notebooks-pr.yaml | 20 +++- Makefile | 1 + ci/cached-builds/gen_gha_matrix_jobs.py | 43 +++++++- ci/cached-builds/gha_pr_changed_files.py | 126 ++++++++++++++++++++++ 4 files changed, 185 insertions(+), 5 deletions(-) create mode 100644 ci/cached-builds/gha_pr_changed_files.py diff --git a/.github/workflows/build-notebooks-pr.yaml b/.github/workflows/build-notebooks-pr.yaml index d04d90a6c..fd93b545d 100644 --- a/.github/workflows/build-notebooks-pr.yaml +++ b/.github/workflows/build-notebooks-pr.yaml @@ -1,28 +1,40 @@ --- "name": "Build Notebooks" -"permissions": - "packages": "read" "on": "pull_request": +permissions: + contents: read + packages: read + pull-requests: read + jobs: gen: name: Generate job matrix runs-on: ubuntu-latest outputs: matrix: ${{ steps.gen.outputs.matrix }} + has_jobs: ${{ steps.gen.outputs.has_jobs }} steps: - uses: actions/checkout@v4 - - run: python3 ci/cached-builds/gen_gha_matrix_jobs.py + + - run: | + python3 ci/cached-builds/gen_gha_matrix_jobs.py \ + --owner=${{ github.repository_owner }} \ + --repo=${{ github.event.pull_request.base.repo.name }} \ + --pr-number=${{ github.event.pull_request.number }} \ + --skip-unchanged id: gen + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # base images build: needs: ["gen"] strategy: fail-fast: false matrix: "${{ fromJson(needs.gen.outputs.matrix) }}" uses: ./.github/workflows/build-notebooks-TEMPLATE.yaml + if: ${{ fromJson(needs.gen.outputs.has_jobs) }} with: target: "${{ matrix.target }}" github: "${{ toJSON(github) }}" diff --git a/Makefile b/Makefile index 83e9fcb50..ac28bfe8c 100644 --- a/Makefile +++ b/Makefile @@ -59,6 +59,7 @@ endef # ARG 2: Path of image context we want to build. # ARG 3: Base image tag name (optional). 
def compute_leafs_in_dependency_tree(tree: dict[str, list[str]]) -> list[str]:
    """Return the build targets that no other target lists as a dependency.

    A "leaf" is a key of the dependency tree that never appears among the
    values (i.e. nothing depends on it), so it is a final image target.
    """
    all_targets = set(tree.keys())
    all_dependencies = set(itertools.chain.from_iterable(tree.values()))
    return [target for target in all_targets if target not in all_dependencies]
class SelfTests(unittest.TestCase):
    def test_select_changed_targets(self):
        """End-to-end check: a single changed Dockerfile selects exactly the leaf images built from it."""
        with open(project_dir / "Makefile", "rt") as makefile:
            makefile_lines = read_makefile_lines(makefile)
        dependency_tree = extract_target_dependencies(makefile_lines)
        all_leafs = compute_leafs_in_dependency_tree(dependency_tree)

        modified_files = ["jupyter/datascience/ubi9-python-3.9/Dockerfile"]
        selected = gha_pr_changed_files.filter_out_unchanged(all_leafs, modified_files)

        assert set(selected) == {'cuda-jupyter-tensorflow-ubi9-python-3.9',
                                 'jupyter-trustyai-ubi9-python-3.9',
                                 'jupyter-pytorch-ubi9-python-3.9'}
def list_changed_files(owner: str, repo: str, pr_number: int, per_page=100) -> list[str]:
    """Return the paths of all files changed in the given pull request.

    Pages through the GitHub GraphQL API, fetching ``per_page`` files per
    request, until a page yields no further cursor.

    :param owner: GitHub repository owner/organization.
    :param repo: GitHub repository name.
    :param pr_number: pull request number under owner/repo.
    :param per_page: page size for the GraphQL ``files(first: ...)`` query.
    """
    files = []

    logging.debug("Getting list of changed files from GitHub API")

    # empty cursor means "first page"; None means "no more pages"
    cursor = ""
    while cursor is not None:
        request = compose_gh_api_request(pull_number=pr_number, owner=owner, repo=repo, per_page=per_page,
                                         cursor=cursor)
        # context manager guarantees the response is closed even when
        # read()/json parsing raises (the previous explicit close() leaked
        # the connection on the error path)
        with urllib.request.urlopen(request) as response:
            data = json.loads(response.read().decode("utf-8"))
        edges = data["data"]["repository"]["pullRequest"]["files"]["edges"]

        # when the final page is exhausted there are no edges, so cursor
        # stays None and the loop terminates
        cursor = None
        for edge in edges:
            files.append(edge["node"]["path"])
            cursor = edge["cursor"]

    logging.debug(f"Determined {len(files)} changed files: {files[:5]} (..., printing up to 5)")
    return files
def should_build_target(changed_files: list[str], target_directories: list[str]) -> str:
    """Returns truthy if there is at least one changed file necessitating a build.
    Falsy (empty) string is returned otherwise.

    The truthy value is the first changed file found under one of the target
    directories, so callers can log the reason a build was triggered.

    :param changed_files: repo-relative paths of files changed in the PR.
    :param target_directories: repo-relative build directories of the target.
    """
    for directory in target_directories:
        # Match on a path-component boundary: a bare startswith(directory)
        # would also match files in a sibling directory that merely shares
        # the prefix (e.g. "base/ubi9-python-3.9" vs "base/ubi9-python-3.9x/").
        prefix = directory if directory.endswith("/") else directory + "/"
        for changed_file in changed_files:
            if changed_file.startswith(prefix):
                return changed_file
    return ""