diff --git a/.github/repos-config.json b/.github/repos-config.json new file mode 100644 index 0000000000000..9b021d91546d5 --- /dev/null +++ b/.github/repos-config.json @@ -0,0 +1,229 @@ +{ + "repositories": [ + { + "name": "miopen", + "url": "ROCm/MIOpen", + "branch": "develop", + "category": "projects", + "auto_subtree_pull": false, + "auto_subtree_push": true, + "monorepo_source_of_truth": true + }, + { + "name": "tensile", + "url": "ROCm/Tensile", + "branch": "develop", + "category": "shared", + "auto_subtree_pull": false, + "auto_subtree_push": true, + "monorepo_source_of_truth": true + }, + { + "name": "origami", + "url": "ROCm/origami", + "branch": "develop", + "category": "shared", + "auto_subtree_pull": false, + "auto_subtree_push": false, + "monorepo_source_of_truth": true + }, + { + "name": "mxdatagenerator", + "url": "ROCm/mxDataGenerator", + "branch": "develop", + "category": "shared", + "auto_subtree_pull": false, + "auto_subtree_push": true, + "monorepo_source_of_truth": true + }, + { + "name": "composablekernel", + "url": "ROCm/composable_kernel", + "branch": "develop", + "category": "projects", + "auto_subtree_pull": true, + "auto_subtree_push": false, + "monorepo_source_of_truth": false + }, + { + "name": "hipblas", + "url": "ROCm/hipBLAS", + "branch": "develop", + "category": "projects", + "auto_subtree_pull": false, + "auto_subtree_push": true, + "monorepo_source_of_truth": true + }, + { + "name": "hipblas-common", + "url": "ROCm/hipBLAS-common", + "branch": "develop", + "category": "projects", + "auto_subtree_pull": false, + "auto_subtree_push": true, + "monorepo_source_of_truth": true + }, + { + "name": "hipblaslt", + "url": "ROCm/hipBLASLt", + "branch": "develop", + "category": "projects", + "auto_subtree_pull": false, + "auto_subtree_push": true, + "monorepo_source_of_truth": true + }, + { + "name": "hipcub", + "url": "ROCm/hipCUB", + "branch": "develop", + "category": "projects", + "auto_subtree_pull": false, + "auto_subtree_push": true, + "monorepo_source_of_truth": true + }, + { + "name": "hipfft", + "url": "ROCm/hipFFT", + "branch": "develop", + "category": "projects", + "auto_subtree_pull": false, + "auto_subtree_push": true, + "monorepo_source_of_truth": true + }, + { + "name": "hiprand", + "url": "ROCm/hipRAND", + "branch": "develop", + "category": "projects", + "auto_subtree_pull": false, + "auto_subtree_push": true, + "monorepo_source_of_truth": true + }, + { + "name": "hipsolver", + "url": "ROCm/hipSOLVER", + "branch": "develop", + "category": "projects", + "auto_subtree_pull": false, + "auto_subtree_push": true, + "monorepo_source_of_truth": true + }, + { + "name": "hipsparse", + "url": "ROCm/hipSPARSE", + "branch": "develop", + "category": "projects", + "auto_subtree_pull": false, + "auto_subtree_push": true, + "monorepo_source_of_truth": true + }, + { + "name": "hipsparselt", + "url": "ROCm/hipSPARSELt", + "branch": "develop", + "category": "projects", + "auto_subtree_pull": false, + "auto_subtree_push": true, + "monorepo_source_of_truth": true + }, + { + "name": "rocblas", + "url": "ROCm/rocBLAS", + "branch": "develop", + "category": "projects", + "auto_subtree_pull": false, + "auto_subtree_push": true, + "monorepo_source_of_truth": true + }, + { + "name": "rocfft", + "url": "ROCm/rocFFT", + "branch": "develop", + "category": "projects", + "auto_subtree_pull": false, + "auto_subtree_push": true, + "monorepo_source_of_truth": true + }, + { + "name": "rocprim", + "url": "ROCm/rocPRIM", + "branch": "develop", + "category": "projects", + 
"auto_subtree_pull": false, + "auto_subtree_push": true, + "monorepo_source_of_truth": true + }, + { + "name": "rocrand", + "url": "ROCm/rocRAND", + "branch": "develop", + "category": "projects", + "auto_subtree_pull": false, + "auto_subtree_push": true, + "monorepo_source_of_truth": true + }, + { + "name": "rocroller", + "url": "ROCm/rocRoller", + "branch": "develop", + "category": "shared", + "auto_subtree_pull": false, + "auto_subtree_push": true, + "monorepo_source_of_truth": true + }, + { + "name": "rocsolver", + "url": "ROCm/rocSOLVER", + "branch": "develop", + "category": "projects", + "auto_subtree_pull": false, + "auto_subtree_push": true, + "monorepo_source_of_truth": true + }, + { + "name": "rocsparse", + "url": "ROCm/rocSPARSE", + "branch": "develop", + "category": "projects", + "auto_subtree_pull": false, + "auto_subtree_push": true, + "monorepo_source_of_truth": true + }, + { + "name": "rocthrust", + "url": "ROCm/rocThrust", + "branch": "develop", + "category": "projects", + "auto_subtree_pull": false, + "auto_subtree_push": true, + "monorepo_source_of_truth": true + }, + { + "name": "hipdnn", + "url": "ROCm/hipDNN", + "branch": "develop", + "category": "projects", + "auto_subtree_pull": false, + "auto_subtree_push": false, + "monorepo_source_of_truth": true + }, + { + "name": "rocwmma", + "url": "ROCm/rocWMMA", + "branch": "develop", + "category": "projects", + "auto_subtree_pull": false, + "auto_subtree_push": true, + "monorepo_source_of_truth": true + }, + { + "name": "hiptensor", + "url": "ROCm/hipTensor", + "branch": "develop", + "category": "projects", + "auto_subtree_pull": false, + "auto_subtree_push": true, + "monorepo_source_of_truth": true + } + ] +} diff --git a/.github/scripts/apply-labels.py b/.github/scripts/apply-labels.py new file mode 100644 index 0000000000000..980b76009b0ba --- /dev/null +++ b/.github/scripts/apply-labels.py @@ -0,0 +1,69 @@ +import os +import sys +import yaml +import requests + + +def get_existing_labels(repo, token): + headers = {"Authorization": f"token {token}"} + labels = {} + page = 1 + while True: + url = f"https://api.github.com/repos/{repo}/labels?page={page}&per_page=100" + resp = requests.get(url, headers=headers) + if resp.status_code != 200: + raise Exception(f"Failed to fetch existing labels: {resp.text}") + data = resp.json() + if not data: + break + for label in data: + labels[label["name"]] = { + "color": label["color"], + "description": label.get("description", ""), + } + page += 1 + return labels + + +def create_or_update_label(repo, token, label, existing): + headers = { + "Authorization": f"token {token}", + "Accept": "application/vnd.github+json", + } + + if label["name"] not in existing: + # Create label + print(f"Creating label: {label['name']}") + url = f"https://api.github.com/repos/{repo}/labels" + resp = requests.post(url, json=label, headers=headers) + else: + # Update if different + current = existing[label["name"]] + if label["color"].lower() != current["color"].lower() or label.get( + "description", "" + ) != current.get("description", ""): + print(f"Updating label: {label['name']}") + url = f"https://api.github.com/repos/{repo}/labels/{label['name']}" + resp = requests.patch(url, json=label, headers=headers) + else: + print(f"Label '{label['name']}' already up to date. 
Skipping.") + return + + if not resp.ok: + print(f"Failed to apply label {label['name']}: {resp.status_code} {resp.text}") + + +def main(label_file): + token = os.environ["GH_TOKEN"] + repo = os.environ["GITHUB_REPO"] + existing = get_existing_labels(repo, token) + + with open(label_file, "r") as f: + labels = yaml.safe_load(f) + + for label in labels: + create_or_update_label(repo, token, label, existing) + + +if __name__ == "__main__": + main(sys.argv[1]) diff --git a/.github/scripts/azure_resolve_subtree_deps.py b/.github/scripts/azure_resolve_subtree_deps.py new file mode 100644 index 0000000000000..5510c7afa9a61 --- /dev/null +++ b/.github/scripts/azure_resolve_subtree_deps.py @@ -0,0 +1,132 @@ +#!/usr/bin/env python3 + +""" +Azure Pipeline Resolver Script +------------------------------ +This script determines which Azure pipelines to run based on changed subtrees. +Using a predefined dependency map, the script resolves which projects need to be processed, +skipping those that will be covered by their dependencies. + +Steps: + 1. Load a list of changed projects from a file. + 2. Consult a dependency map to determine transitive and direct dependencies. + 3. Identify projects that should be processed, excluding those handled by dependencies. + 4. Output the list of projects to be run, along with their Azure pipeline IDs. + +Arguments: + --subtree-file : Path to the file containing a newline-separated list of changed subtrees. + +Outputs: + Prints a newline-separated list of "project_name=definition_id" for the projects that need + to be processed, where `definition_id` is the Azure pipeline ID associated with the project. + +Example Usage: + To determine which pipelines to run given the changed subtrees listed in a file: + python azure_pipeline_resolver.py --subtree-file changed_subtrees.txt +""" + +import argparse +from typing import List, Optional + + +def parse_arguments(argv: Optional[List[str]] = None) -> argparse.Namespace: + """Parse command-line arguments.""" + parser = argparse.ArgumentParser( + description="Given a list of changed subtrees, determine which Azure pipelines to run." 
+ ) + parser.add_argument( + "--subtree-file", + required=True, + help="Path to the file containing changed subtrees", + ) + return parser.parse_args(argv) + + +def read_file_into_set(file_path): + """Reads the project names from the file into a set.""" + with open(file_path, "r") as file: + return {line.strip() for line in file} + + +def resolve_dependencies(projects, dependencies): + """Resolves projects to be run by checking all levels of dependencies.""" + + def has_dependency(project, projects_set): + """Recursively checks if a project has any dependencies in the projects_set.""" + if project not in dependencies: + return False + for dependency in dependencies[project]: + if dependency in projects_set or has_dependency(dependency, projects_set): + return True + return False + + projects_to_run = set(projects) + + for project in projects: + if has_dependency(project, projects_to_run): + projects_to_run.discard(project) + + return projects_to_run + + +def main(argv=None) -> None: + """Main function to process the projects and output those to be run.""" + # Mathlib build+test dependency tree as defined in Azure CI and TheRock + math_dependencies = { + "shared/tensile": set(), + "shared/origami": set(), + "projects/rocrand": set(), + "projects/hiprand": {"projects/rocrand"}, + "projects/rocfft": {"projects/hiprand"}, + "projects/hipfft": {"projects/rocfft"}, + "projects/rocprim": set(), + "projects/hipcub": {"projects/rocprim"}, + "projects/rocthrust": {"projects/rocprim"}, + "projects/hipblas-common": set(), + "projects/hipblaslt": {"projects/hipblas-common"}, + "projects/rocblas": {"projects/hipblaslt"}, + "projects/rocsolver": {"projects/rocprim", "projects/rocblas"}, + "projects/rocsparse": {"projects/rocprim", "projects/rocblas"}, + "projects/hipblas": {"projects/rocsolver"}, + "projects/hipsolver": {"projects/rocsolver", "projects/rocsparse"}, + "projects/hipsparse": {"projects/rocsparse"}, + "projects/hipsparselt": {"projects/hipsparse"}, + "projects/miopen": {"projects/rocrand", "projects/hipblas"}, + "projects/hiptensor": set(), + } + # Azure pipeline IDs for each project, to be populated as projects are enabled + definition_ids = { + "shared/tensile": 305, + "projects/rocrand": 274, + "projects/hiprand": 275, + "projects/rocfft": 282, + "projects/hipfft": 283, + "projects/rocprim": 273, + "projects/hipcub": 277, + "projects/rocthrust": 276, + "projects/hipblas-common": 300, + "projects/hipblaslt": 301, + "projects/rocblas": 302, + "projects/rocsolver": 303, + "projects/rocsparse": 314, + "projects/hipblas": 317, + "projects/hipsolver": 322, + "projects/hipsparse": 315, + "projects/hipsparselt": 309, + "projects/miopen": 320, + "shared/origami": 364, + "projects/rocwmma": 370, + "projects/hiptensor": 374, + } + + args = parse_arguments(argv) + projects = read_file_into_set(args.subtree_file) + projects_to_run = resolve_dependencies(projects, math_dependencies) + + for project in projects_to_run: + if project in definition_ids: + print(f"{project}={definition_ids[project]}") + + +if __name__ == "__main__": + main() diff --git a/.github/scripts/collect-labels.py b/.github/scripts/collect-labels.py new file mode 100644 index 0000000000000..b459419c8b0ec --- /dev/null +++ b/.github/scripts/collect-labels.py @@ -0,0 +1,51 @@ +import json +import os +import sys +import requests +import yaml + + +def get_labels(repo, token): + headers = {"Authorization": f"token {token}"} + labels = [] + page = 1 + while True: + url = f"https://api.github.com/repos/{repo}/labels?page={page}&per_page=100" + resp = 
requests.get(url, headers=headers) + if resp.status_code != 200: + raise Exception(f"Failed to fetch labels from {repo}: {resp.text}") + data = resp.json() + if not data: + break + labels.extend(data) + page += 1 + return labels + + +def main(file_path): + with open(file_path, "r") as f: + repos_data = json.load(f)["repositories"] + + token = os.environ["GH_TOKEN"] + all_labels = {} + + for repo_entry in repos_data: + repo_url = repo_entry["url"] + print(f"Collecting labels from {repo_url}") + for label in get_labels(repo_url, token): + name = label["name"] + if name not in all_labels: + all_labels[name] = { + "name": name, + "color": label["color"], + "description": label.get("description", ""), + } + + sorted_labels = sorted(all_labels.values(), key=lambda l: l["name"].lower()) + os.makedirs(".github", exist_ok=True) # Ensure the .github directory exists + with open(".github/labels.yml", "w") as out: + yaml.dump(sorted_labels, out, sort_keys=False) + + +if __name__ == "__main__": + main(sys.argv[1]) diff --git a/.github/scripts/config_loader.py b/.github/scripts/config_loader.py new file mode 100644 index 0000000000000..b2940055d7d2d --- /dev/null +++ b/.github/scripts/config_loader.py @@ -0,0 +1,19 @@ +import json +import sys +import logging +from typing import List +from repo_config_model import RepoConfig, RepoEntry + +logger = logging.getLogger(__name__) + + +def load_repo_config(config_path: str) -> List[RepoEntry]: + """Load and validate repository config from JSON using Pydantic.""" + try: + with open(config_path, "r", encoding="utf-8") as f: + data = json.load(f) + config = RepoConfig(**data) + return config.repositories + except Exception as e: + logger.error(f"Failed to load or validate config file '{config_path}': {e}") + sys.exit(1) diff --git a/.github/scripts/github_cli_client.py b/.github/scripts/github_cli_client.py new file mode 100644 index 0000000000000..04cfe6c5c77e2 --- /dev/null +++ b/.github/scripts/github_cli_client.py @@ -0,0 +1,363 @@ +#!/usr/bin/env python3 + +""" +GitHub Client Utility +--------------------- +This utility provides a GitHubCLIClient class that wraps GitHub REST API operations +used across automation scripts, such as retrieving pull request file changes and labels. + +When doing manual testing, you can run the same REST API calls through curl in the terminal. +These REST API URLs, without the authentication header, will be output by the debug logging. + +This includes: +- Fetching PR details +- Creating PRs +- Closing PRs + +Requirements: + - NOTE: the GH_TOKEN environment variable supplies the authentication token to this script in a runner. + - The token is created by the GitHub App and is passed to the script via the environment variable. 
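+ - Assumes the requests package is installed in the runner's Python environment.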
+ +Manual curl testing: + +To fetch PR details: + curl -H "Authorization: Bearer $GH_TOKEN" -H "Accept: application/vnd.github+json" \ + https://api.github.com/repos/OWNER/REPO/pulls/NUMBER + +To list PRs by head branch: + curl -H "Authorization: Bearer $GH_TOKEN" -H "Accept: application/vnd.github+json" \ + "https://api.github.com/repos/OWNER/REPO/pulls?head=OWNER:branch-name&state=open" + +To fetch changed files in a PR: + curl -H "Authorization: Bearer $GH_TOKEN" -H "Accept: application/vnd.github+json" \ + https://api.github.com/repos/OWNER/REPO/pulls/NUMBER/files + +To create a PR: + curl -X POST -H "Authorization: Bearer $GH_TOKEN" -H "Accept: application/vnd.github+json" \ + https://api.github.com/repos/OWNER/REPO/pulls \ + -d '{"title":"Title","body":"Description","head":"branch-name","base":"main"}' + +To apply labels: + curl -X POST -H "Authorization: Bearer $GH_TOKEN" -H "Accept: application/vnd.github+json" \ + https://api.github.com/repos/OWNER/REPO/issues/NUMBER/labels \ + -d '{"labels": ["bug", "needs-review"]}' +""" + +import os +import requests +import time +import logging +from typing import List, Optional + +logger = logging.getLogger(__name__) + + +class GitHubCLIClient: + def __init__(self) -> None: + """Initialize the GitHub API client using GitHub App authentication.""" + self.api_url = "https://api.github.com" + self.session = requests.Session() + self.session.headers.update( + { + "Authorization": f"Bearer {self._get_token()}", + "Accept": "application/vnd.github+json", + } + ) + + def _get_token(self) -> str: + """Helper method to retrieve the GitHub token from environment variable.""" + token = os.getenv("GH_TOKEN") + if not token: + raise EnvironmentError("GH_TOKEN environment variable is not set") + return token + + def _get_with_retries( + self, + url: str, + error_msg: str, + retries: int = 3, + backoff: int = 2, + timeout: int = 10, + ) -> Optional[requests.Response]: + """Internal helper to retry a GET request with exponential backoff.""" + # no logging the actual request to avoid leaking sensitive information + for attempt in range(retries): + try: + response = self.session.get(url, timeout=timeout) + if response.status_code == 200: + return response + # for api rate limiting, we check the headers for remaining requests and reset time + elif ( + response.status_code == 403 + and response.headers.get("X-RateLimit-Remaining") == "0" + ): + reset_time = int(response.headers.get("X-RateLimit-Reset", 0)) + sleep_seconds = max(1, reset_time - int(time.time()) + 1) + logger.warning( + f"Rate limited. Sleeping for {sleep_seconds} seconds..." + ) + time.sleep(sleep_seconds) + continue + # other errors will use exponential backoff timeout + elif response.status_code in {403, 429, 500, 502, 503, 504}: + logger.warning( + f"Retryable error {response.status_code} on attempt {attempt}." + ) + else: + response.raise_for_status() + except requests.RequestException as e: + logger.warning(f"Request failed on attempt {attempt}: {e}") + logger.error(f"{error_msg} for {url} (Attempt {attempt + 1}/{retries})") + if attempt < retries - 1: + time.sleep(backoff**attempt) # Exponential backoff + else: + logger.error(f"Max retries reached for GET at {url}. 
Giving up.") + return None + + def _get_json(self, url: str, error_msg: str) -> dict: + """Helper method to perform a simple GET request and return a single JSON object.""" + response = self._get_with_retries(url, error_msg) + return response.json() if response else {} + + def _get_paginated_json(self, url: str, error_msg: str) -> List[dict]: + """Helper method to perform a sequence of GET requests with pagination.""" + results = [] + while url: + response = self._get_with_retries(url, error_msg) + if not response: + return results + results.extend(response.json()) + url = response.links.get("next", {}).get("url") + return results + + def _request_json( + self, + method: str, + url: str, + json: Optional[dict] = None, + error_msg: str = "", + retries: int = 3, + backoff: int = 2, + ) -> dict: + """Helper method to perform a request with retries and return JSON response.""" + # no logging the actual request to avoid leaking sensitive information + for attempt in range(retries): + response = self.session.request(method, url, json=json) + if response.ok: + if response.status_code == 204 or not response.text.strip(): + return {} # DELETE requests have no json content + else: + return response.json() + else: + # for api rate limiting, we check the headers for remaining requests and reset time + if ( + response.status_code == 403 + and response.headers.get("X-RateLimit-Remaining") == "0" + ): + reset_time = int(response.headers.get("X-RateLimit-Reset", 0)) + sleep_seconds = max(1, reset_time - int(time.time()) + 1) + logger.warning( + f"Rate limited. Sleeping for {sleep_seconds} seconds..." + ) + time.sleep(sleep_seconds) + # other errors will use exponential backoff timeout + else: + logger.error( + f"{error_msg} for method {method} at {url} (Attempt {attempt + 1}/{retries})" + ) + if attempt < retries - 1: + time.sleep(backoff**attempt) # Exponential backoff + else: + logger.error( + f"Max retries reached for method {method} at {url}. Giving up." 
+ ) + return {} + + def get_changed_files(self, repo: str, pr: int) -> List[str]: + """Fetch the changed files in a pull request using GitHub API.""" + url = f"{self.api_url}/repos/{repo}/pulls/{pr}/files?per_page=50" + logger.debug(f"Request URL: {url}") + files_data = self._get_paginated_json( + url, f"Failed to fetch files for PR #{pr} in {repo}" + ) + files = [file["filename"] for file in files_data] + logger.debug(f"Changed files in PR #{pr}: {files}") + return files + + def get_defined_labels(self, repo: str) -> List[str]: + """Get all labels defined in the given repository.""" + url = f"{self.api_url}/repos/{repo}/labels?per_page=100" + logger.debug(f"Request URL: {url}") + labels_data = self._get_paginated_json( + url, f"Failed to fetch labels from {repo}" + ) + labels = [label["name"] for label in labels_data] + logger.debug(f"Defined labels in {repo}: {labels}") + return labels + + def get_existing_labels_on_pr(self, repo: str, pr: int) -> List[str]: + """Fetch current labels on a PR.""" + url = f"{self.api_url}/repos/{repo}/issues/{pr}/labels?per_page=100" + logger.debug(f"Request URL: {url}") + labels_data = self._get_paginated_json( + url, f"Failed to fetch labels for PR #{pr} in {repo}" + ) + labels = [label["name"] for label in labels_data] + logger.debug(f"Existing labels on PR #{pr}: {labels}") + return labels + + def pr_view(self, repo: str, head: str) -> Optional[int]: + """Check if a PR exists for the given repo and branch.""" + # This is similar to get_pr_by_head_branch but returns only the PR number directly + url = f"{self.api_url}/repos/{repo}/pulls?head={repo.split('/')[0]}:{head}&per_page=100" + logger.debug(f"Request URL: {url}") + result = self._get_paginated_json( + url, f"Failed to retrieve PR for head branch {head} in repo {repo}" + ) + return result[0]["number"] if result else None + + def get_pr_by_head_branch(self, repo: str, head: str) -> Optional[dict]: + """Fetch the PR object for a given head branch in a repository, if it exists.""" + # This is similar to pr_view but returns the full PR object + url = f"{self.api_url}/repos/{repo}/pulls?head={repo.split('/')[0]}:{head}&state=open&per_page=100" + logger.debug(f"Request URL: {url}") + data = self._get_paginated_json( + url, f"Failed to get PRs for {repo} with head {head}" + ) + return data[0] if data else None + + def get_pr_by_number(self, repo: str, pr_number: int) -> Optional[dict]: + """Fetch the PR object for a given PR number in a repository.""" + url = f"{self.api_url}/repos/{repo}/pulls/{pr_number}" + logger.debug(f"Fetching PR #{pr_number} from {repo}") + response = self._get_json(url, f"Failed to get PR #{pr_number} from {repo}") + return response + + def pr_create( + self, + repo: str, + base: str, + head: str, + title: str, + body: str, + dry_run: bool = False, + ) -> None: + """Create a new pull request.""" + url = f"{self.api_url}/repos/{repo}/pulls" + payload = {"title": title, "body": body, "head": head, "base": base} + logger.debug(f"Request URL: {url}") + logger.debug(f"Request Payload: {payload}") + if dry_run: + logger.info( + f"Dry run: The pull request would be created from {head} to {base} in {repo}" + ) + return + self._request_json( + "POST", url, payload, f"Failed to create PR from {head} to {base} in {repo}" + ) + logger.info(f"Created PR from {head} to {base} in {repo}.") + + def close_pr_and_delete_branch( + self, repo: str, pr_number: int, dry_run: bool = False + ) -> None: + """Close a pull request and delete the associated branch using the GitHub API.""" + pr_url = 
f"{self.api_url}/repos/{repo}/pulls/{pr_number}" + logger.debug(f"Request URL: {pr_url}") + pr_data = self._get_json(pr_url, f"Failed to fetch PR #{pr_number} in {repo}") + head_ref = pr_data.get("head", {}).get("ref") + if not head_ref: + logger.error( + f"Could not determine head branch for PR #{pr_number} in {repo}" + ) + return + logger.debug(f"PR #{pr_number} head branch: {head_ref}") + close_payload = {"state": "closed"} + logger.debug(f"Request Payload: {close_payload}") + if dry_run: + logger.info( + f"Dry run: The pull request #{pr_number} would be closed and the branch '{head_ref}' would be deleted in repo '{repo}'" + ) + return + self._request_json( + "PATCH", pr_url, close_payload, f"Failed to close PR #{pr_number} in {repo}" + ) + branch_url = f"{self.api_url}/repos/{repo}/git/refs/heads/{head_ref}" + logger.debug(f"Branch DELETE URL: {branch_url}") + self._request_json( + "DELETE", + branch_url, + None, + f"Failed to delete branch '{head_ref}' for PR #{pr_number}", + ) + logger.info( + f"Closed pull request #{pr_number} and deleted the branch '{head_ref}' in {repo}." + ) + + def sync_labels( + self, target_repo: str, pr_number: int, labels: List[str], dry_run: bool = False + ) -> None: + """Sync labels from the source repo to the target repo (only apply existing labels).""" + url = f"{self.api_url}/repos/{target_repo}/labels?per_page=100" + logger.debug(f"Request URL: {url}") + target_repo_labels = { + label["name"] + for label in self._get_paginated_json( + url, f"Failed to fetch labels for {target_repo}" + ) + } + labels_set = set(labels) + labels_to_apply = labels_set & target_repo_labels + labels_for_logging = ",".join(labels_to_apply) + if labels_to_apply: + # note: using issues endpoint for labels as PRs are a subset of issues + url = f"{self.api_url}/repos/{target_repo}/issues/{pr_number}/labels" + payload = {"labels": list(labels_to_apply)} + logger.debug(f"Request URL: {url}") + logger.debug(f"Request Payload: {payload}") + if not dry_run: + self._request_json( + "POST", + url, + payload, + f"Failed to apply labels to PR #{pr_number} in {target_repo}", + ) + logger.info( + f"Applied labels '{labels_for_logging}' to PR #{pr_number} in {target_repo}." + ) + else: + logger.info( + f"Dry run: Labels '{labels_for_logging}' would be applied to PR #{pr_number} in {target_repo}." + ) + else: + logger.info( + f"No valid labels to apply to PR #{pr_number} in {target_repo}." + ) + + def get_squash_merge_commit(self, repo: str, pr_number: int) -> Optional[str]: + """Get the squash merge commit SHA of a merged pull request.""" + url = f"{self.api_url}/repos/{repo}/pulls/{pr_number}" + logger.debug(f"Request URL: {url}") + data = self._get_json(url, f"Failed to fetch PR #{pr_number} from {repo}") + if not data: + logger.error(f"No data returned for PR #{pr_number}") + return None + if data.get("merged") and data.get("merge_commit_sha"): + logger.debug(f"PR #{pr_number} merged commit: {data['merge_commit_sha']}") + return data["merge_commit_sha"] + logger.warning(f"PR #{pr_number} is not merged or missing merge commit SHA.") + return None + + def get_user(self, username: str) -> tuple[str, str]: + """Fetch the name and email of a GitHub user. 
Falls back to login and no-reply email.""" + url = f"{self.api_url}/users/{username}" + logger.debug(f"Fetching user profile for @{username}") + data = self._get_json(url, f"Failed to fetch user profile for @{username}") + name = data.get("name") or username + email = data.get("email") + if not email: + user_id = data.get("id") + if user_id: + email = f"{user_id}+{username}@users.noreply.github.com" + else: + email = f"{username}@users.noreply.github.com" + return name, email diff --git a/.github/scripts/merge-codeowners.py b/.github/scripts/merge-codeowners.py new file mode 100644 index 0000000000000..4b64ea61c4266 --- /dev/null +++ b/.github/scripts/merge-codeowners.py @@ -0,0 +1,54 @@ +import os +from pathlib import Path + +# Determine monorepo root and output CODEOWNERS path +monorepo_root = Path(__file__).resolve().parents[2] +output_path = monorepo_root / ".github" / "CODEOWNERS" + +merged_entries = [] + +# Walk top-level directories (excluding .github/.git/etc.) +for subdir in monorepo_root.iterdir(): + if subdir.name.startswith(".") or not subdir.is_dir(): + continue + + # Look for CODEOWNERS in root or .github directory of the submodule + candidates = [subdir / "CODEOWNERS", subdir / ".github" / "CODEOWNERS"] + + for codeowners_file in candidates: + if codeowners_file.is_file(): + with codeowners_file.open("r") as f: + for line in f: + stripped = line.strip() + + # Skip empty lines or comments + if not stripped or stripped.startswith("#"): + continue + + parts = stripped.split() + if not parts: + continue + + original_path = parts[0] + owners = " ".join(parts[1:]) + + # Ensure prefixed path starts with a single slash + prefixed_path = ( + f"/{subdir.name.rstrip('/')}{original_path}" + if original_path.startswith("/") + else f"/{subdir.name}/{original_path}" + ) + + merged_entries.append(f"{prefixed_path} {owners}") + +# Sort for consistency +merged_entries.sort() + +# Write merged CODEOWNERS file +output_path.parent.mkdir(parents=True, exist_ok=True) + +with output_path.open("w") as out: + out.write("# Auto-generated CODEOWNERS file\n\n") + out.write("\n".join(merged_entries)) + +print(f"✅ Merged CODEOWNERS written to {output_path}") diff --git a/.github/scripts/merge-submodules.py b/.github/scripts/merge-submodules.py new file mode 100644 index 0000000000000..43b525af1bd35 --- /dev/null +++ b/.github/scripts/merge-submodules.py @@ -0,0 +1,38 @@ +import os +import configparser +from pathlib import Path + +ROOT_DIR = Path(__file__).resolve().parents[2] # Assuming script is in .github/scripts/ +OUTPUT_FILE = ROOT_DIR / ".gitmodules" +MODULE_FILES = list(ROOT_DIR.glob("*/.gitmodules")) + list( + ROOT_DIR.glob("*/.github/.gitmodules") +) + +combined = configparser.ConfigParser() +combined.optionxform = str # Preserve case sensitivity + +for module_file in MODULE_FILES: + subdir = module_file.parent.name + local_config = configparser.ConfigParser() + local_config.optionxform = str + local_config.read(module_file) + + for section in local_config.sections(): + if section.startswith("submodule "): + name = section.split('"')[1] + new_name = f"{subdir}/{name}" + new_section = f'submodule "{new_name}"' + + combined[new_section] = {} + for key, value in local_config[section].items(): + if key == "path": + value = f"{subdir}/{value}" + combined[new_section][key] = value + +# Write combined .gitmodules +with OUTPUT_FILE.open("w") as f: + for section in combined.sections(): + f.write(f"[{section}]\n") + for key, value in combined[section].items(): + f.write(f"\t{key} = {value}\n") + 
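# blank line separates submodule sections in the combined file + 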
f.write("\n") diff --git a/.github/scripts/pr_category_label.py b/.github/scripts/pr_category_label.py new file mode 100644 index 0000000000000..fa0cfefb8711c --- /dev/null +++ b/.github/scripts/pr_category_label.py @@ -0,0 +1,126 @@ +#!/usr/bin/env python3 + +""" +PR Category Label Script +-------------------- +This script analyzes the file paths changed in a pull request and determines which +category labels should be added or removed based on the modified files. + +It uses GitHub's cli to fetch the changed files and the existing labels on the pull request. +Then, it computes the desired labels based on file paths, compares them to the existing labels, +and applies the necessary additions and removals unless in dry-run mode. + +Arguments: + --repo : Full repository name (e.g., org/repo) + --pr : Pull request number + --dry-run : If set, will only log actions without making changes. + --debug : If set, enables detailed debug logging. + +Outputs: + Writes 'add' and 'remove' keys to the GitHub Actions $GITHUB_OUTPUT file, which + the workflow reads to apply label changes using the GitHub CLI. + +Example Usage: + To run in debug mode and perform a dry-run (no changes made): + python pr_auto_label.py --repo ROCm/rocm-libraries --pr --dry-run --debug + To run in debug mode and apply label changes: + python pr_auto_label.py --repo ROCm/rocm-libraries --pr --debug +""" + +import argparse +import json +import logging +import os +import sys +from pathlib import Path +from typing import List, Optional +from github_cli_client import GitHubCLIClient + +logger = logging.getLogger(__name__) + + +def parse_arguments(argv: Optional[List[str]] = None) -> argparse.Namespace: + """Parse command-line arguments.""" + parser = argparse.ArgumentParser( + description="Apply labels based on PR's changed files." + ) + parser.add_argument( + "--repo", required=True, help="Full repository name (e.g., org/repo)" + ) + parser.add_argument("--pr", required=True, type=int, help="Pull request number") + parser.add_argument( + "--dry-run", + action="store_true", + help="Print results without writing to GITHUB_OUTPUT.", + ) + parser.add_argument("--debug", action="store_true", help="Enable debug logging") + return parser.parse_args(argv) + + +def compute_desired_labels(file_paths: list) -> set: + """Determine the desired labels based on the changed files.""" + desired_labels = set() + for path in file_paths: + parts = Path(path).parts + if len(parts) >= 2: + if parts[0] == "projects": + desired_labels.add(f"project: {parts[1]}") + elif parts[0] == "shared": + desired_labels.add(f"shared: {parts[1]}") + logger.debug(f"Desired labels based on changes: {desired_labels}") + return desired_labels + + +def output_labels( + existing_labels: List[str], desired_labels: List[str], dry_run: bool +) -> None: + """Output the labels to add/remove to GITHUB_OUTPUT or log them in dry-run mode.""" + to_add = sorted(desired_labels - set(existing_labels)) + logger.debug(f"Labels to add: {to_add}") + if dry_run: + logger.info("Dry run enabled. Labels will not be applied.") + else: + output_file = os.environ.get("GITHUB_OUTPUT") + if output_file: + with open(output_file, "a") as f: + print(f"label_add={','.join(to_add)}", file=f) + logger.info(f"Wrote to GITHUB_OUTPUT: add={','.join(to_add)}") + else: + print( + "GITHUB_OUTPUT environment variable not set. Outputs cannot be written." 
+ ) + sys.exit(1) + + +def main(argv=None) -> None: + """Main function to execute the PR auto label logic.""" + args = parse_arguments(argv) + logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO) + client = GitHubCLIClient() + changed_files = [file for file in client.get_changed_files(args.repo, int(args.pr))] + + if not changed_files: + logger.warning( + "REST API failed or returned no changed files. Falling back to SHA-based Git diff..." + ) + try: + pr_data = os.popen(f"gh api repos/{args.repo}/pulls/{args.pr}").read() + pr = json.loads(pr_data) + base_sha = pr["base"]["sha"] + head_sha = pr["head"]["sha"] + logger.debug(f"Base SHA: {base_sha}, Head SHA: {head_sha}") + os.system(f"git fetch origin {base_sha} {head_sha}") + result = os.popen(f"git diff --name-only {base_sha} {head_sha}").read() + changed_files = result.strip().splitlines() + logger.info(f"Fallback changed files (SHA-based): {changed_files}") + except Exception as e: + logger.error(f"SHA-based Git CLI fallback failed: {e}") + sys.exit(1) + + existing_labels = client.get_existing_labels_on_pr(args.repo, int(args.pr)) + desired_labels = compute_desired_labels(changed_files) + output_labels(existing_labels, desired_labels, args.dry_run) + + +if __name__ == "__main__": + main() diff --git a/.github/scripts/pr_detect_changed_subtrees.py b/.github/scripts/pr_detect_changed_subtrees.py new file mode 100644 index 0000000000000..5030e9cc45d6f --- /dev/null +++ b/.github/scripts/pr_detect_changed_subtrees.py @@ -0,0 +1,179 @@ +#!/usr/bin/env python3 + +""" +PR Detect Changed Subtrees Script +--------------------------------- +This script analyzes a pull request's changed files and determines which subtrees +(defined in .github/repos-config.json by category/name) were affected. + +Steps: + 1. Fetch the changed files in the PR using the GitHub API. + 2. Load the subtree mapping from repos-config.json. + 3. Match changed paths against known category/name prefixes. + 4. Emit a new-line separated list of changed subtrees to GITHUB_OUTPUT as 'subtrees'. + +Arguments: + --repo : Full repository name (e.g., org/repo) + --pr : Pull request number + --config : OPTIONAL, path to the repos-config.json file. + --require-auto-pull : If set, only include entries with auto_subtree_pull=true. + --require-auto-push : If set, only include entries with auto_subtree_push=true. + --require-monorepo-source : If set, only include entries with monorepo_source_of_truth=true. + --dry-run : If set, will only log actions without making changes. + --debug : If set, enables detailed debug logging. + +Outputs: + Writes 'subtrees' key to the GitHub Actions $GITHUB_OUTPUT file, which + the workflow reads to pass paths to the checkout stages. + The output is a new-line separated list of subtrees in `category/name` format. 
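+ For example, a change to projects/rocblas/library/CMakeLists.txt maps to the subtree projects/rocblas.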
+ + Example Usage: + To run in auto-push situations in debug mode and perform a dry-run (no changes made): + python pr_detect_changed_subtrees.py --repo ROCm/rocm-libraries --pr 123 --require-auto-push --debug --dry-run +""" + +import argparse +import json +import logging +import os +import sys +from typing import List, Optional, Set +from github_cli_client import GitHubCLIClient +from repo_config_model import RepoEntry +from config_loader import load_repo_config + +logger = logging.getLogger(__name__) + + +def parse_arguments(argv: Optional[List[str]] = None) -> argparse.Namespace: + """Parse command-line arguments.""" + parser = argparse.ArgumentParser(description="Detect changed subtrees in a PR.") + parser.add_argument( + "--repo", required=True, help="Full repository name (e.g., org/repo)" + ) + parser.add_argument("--pr", required=True, type=int, help="Pull request number") + parser.add_argument( + "--config", + required=False, + default=".github/repos-config.json", + help="Path to the repos-config.json file", + ) + parser.add_argument( + "--require-auto-pull", + action="store_true", + help="Only include entries with auto_subtree_pull=true", + ) + parser.add_argument( + "--require-auto-push", + action="store_true", + help="Only include entries with auto_subtree_push=true", + ) + parser.add_argument( + "--require-monorepo-source", + action="store_true", + help="Only include entries with monorepo_source_of_truth=true", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Print results without writing to GITHUB_OUTPUT.", + ) + parser.add_argument("--debug", action="store_true", help="Enable debug logging") + return parser.parse_args(argv) + + +def get_valid_prefixes( + config: List[RepoEntry], + require_auto_pull: bool = False, + require_auto_push: bool = False, + require_monorepo_source: bool = False, +) -> Set[str]: + """Extract valid subtree prefixes from the configuration based on filters.""" + valid_prefixes = set() + for entry in config: + if require_auto_pull and not getattr(entry, "auto_subtree_pull", False): + continue + if require_auto_push and not getattr(entry, "auto_subtree_push", False): + continue + if require_monorepo_source and not getattr( + entry, "monorepo_source_of_truth", False + ): + continue + valid_prefixes.add(f"{entry.category}/{entry.name}") + logger.debug("Valid subtrees:\n" + "\n".join(sorted(valid_prefixes))) + return valid_prefixes + + +def find_matched_subtrees( + changed_files: List[str], valid_prefixes: Set[str] +) -> List[str]: + """Find subtrees that match the changed files.""" + changed_subtrees = { + "/".join(path.split("/", 2)[:2]) + for path in changed_files + if len(path.split("/")) >= 2 + } + matched = sorted(changed_subtrees & valid_prefixes) + skipped = sorted(changed_subtrees - valid_prefixes) + if skipped: + logger.debug(f"Skipped subtrees: {skipped}") + logger.debug(f"Matched subtrees: {matched}") + return matched + + +def output_subtrees(matched_subtrees: List[str], dry_run: bool) -> None: + """Output the matched subtrees to GITHUB_OUTPUT or log them in dry-run mode.""" + newline_separated = "\n".join(matched_subtrees) + if dry_run: + logger.info(f"[Dry-run] Would output:\n{newline_separated}") + else: + output_file = os.environ.get("GITHUB_OUTPUT") + if output_file: + with open(output_file, "a") as f: + print(f"subtrees<<EOF\n{newline_separated}\nEOF", file=f) + logger.info(f"Wrote to GITHUB_OUTPUT: subtrees={matched_subtrees}") + else: + print( + "GITHUB_OUTPUT environment variable not set. Outputs cannot be written." + ) + sys.exit(1) + + +def main(argv: Optional[List[str]] = None) -> None: + """Main function to determine changed subtrees in PR.""" + args = parse_arguments(argv) + logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO) + client = GitHubCLIClient() + 
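# repos-config.json defines which category/name prefixes count as subtrees + 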
config = load_repo_config(args.config) + changed_files = client.get_changed_files(args.repo, int(args.pr)) + + if not changed_files: + logger.warning( + "REST API failed or returned no changed files. Falling back to SHA-based Git diff..." + ) + try: + pr_data = os.popen(f"gh api repos/{args.repo}/pulls/{args.pr}").read() + pr = json.loads(pr_data) + base_sha = pr["base"]["sha"] + head_sha = pr["head"]["sha"] + logger.debug(f"Base SHA: {base_sha}, Head SHA: {head_sha}") + os.system(f"git fetch origin {base_sha} {head_sha}") + result = os.popen(f"git diff --name-only {base_sha} {head_sha}").read() + changed_files = result.strip().splitlines() + logger.info(f"Fallback changed files (SHA-based): {changed_files}") + except Exception as e: + logger.error(f"SHA-based Git CLI fallback failed: {e}") + sys.exit(1) + + valid_prefixes = get_valid_prefixes( + config, + args.require_auto_pull, + args.require_auto_push, + args.require_monorepo_source, + ) + matched_subtrees = find_matched_subtrees(changed_files, valid_prefixes) + output_subtrees(matched_subtrees, args.dry_run) + + +if __name__ == "__main__": + main() diff --git a/.github/scripts/pr_merge_sync_patches.py b/.github/scripts/pr_merge_sync_patches.py new file mode 100644 index 0000000000000..c197e881ec6c2 --- /dev/null +++ b/.github/scripts/pr_merge_sync_patches.py @@ -0,0 +1,297 @@ +#!/usr/bin/env python3 + +""" +Sync Patches to Subrepositories +------------------------------- + +This script is part of the monorepo synchronization system. It runs after a monorepo pull request +is merged and applies relevant changes to the corresponding sub-repositories using Git patches. + +- Uses the merge commit of the monorepo PR to extract subtree changes. +- Generates patch files per changed subtree. +- Applies each patch to its respective sub-repository, adjusting for subtree prefix. +- Uses the repos-config.json file to map subtrees to sub-repos. +- Assumes this script is run from the root of the monorepo. + +Arguments: + --repo : Full repository name (e.g., org/repo) + --pr : Pull request number + --subtrees : A newline-separated list of subtree paths in category/name format (e.g., projects/rocblas) + --config : OPTIONAL, path to the repos-config.json file + --dry-run : If set, will only log actions without making changes. + --debug : If set, enables detailed debug logging. + +Example Usage: + python pr_merge_sync_patches.py --repo ROCm/rocm-libraries --pr 123 --subtrees "$(printf 'projects/rocblas\nprojects/hipblaslt\nprojects/rocsparse')" --dry-run --debug +""" + +import argparse +import logging +import os +import re +import subprocess +import tempfile +from typing import Optional, List +from pathlib import Path +from github_cli_client import GitHubCLIClient +from config_loader import load_repo_config +from repo_config_model import RepoEntry + +logger = logging.getLogger(__name__) + + +def parse_arguments(argv: Optional[List[str]] = None) -> argparse.Namespace: + """Parse command-line arguments.""" + parser = argparse.ArgumentParser( + description="Apply subtree patches to sub-repositories."
+ ) + parser.add_argument( + "--repo", required=True, help="Full repository name (e.g., org/repo)" + ) + parser.add_argument("--pr", required=True, type=int, help="Pull request number") + parser.add_argument( + "--subtrees", + required=True, + help="Newline-separated list of changed subtrees (category/name)", + ) + parser.add_argument( + "--config", + required=False, + default=".github/repos-config.json", + help="Path to the repos-config.json file", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="If set, only logs actions without making changes.", + ) + parser.add_argument( + "--debug", action="store_true", help="If set, enables detailed debug logging." + ) + return parser.parse_args(argv) + + +def get_subtree_info(config: List[RepoEntry], subtrees: List[str]) -> List[RepoEntry]: + """Return config entries matching the given subtrees in category/name format.""" + requested = set(subtrees) + matched = [ + entry for entry in config if f"{entry.category}/{entry.name}" in requested + ] + missing = requested - {f"{e.category}/{e.name}" for e in matched} + if missing: + logger.warning( + f"Some subtrees not found in config: {', '.join(sorted(missing))}" + ) + return matched + + +def _run_git(args: List[str], cwd: Optional[Path] = None) -> str: + """Run a git command and return stdout.""" + cmd = ["git"] + args + logger.debug(f"Running git command: {' '.join(cmd)} (cwd={cwd})") + result = subprocess.run( + cmd, + cwd=cwd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) + if result.returncode != 0: + logger.error(f"Git command failed: {' '.join(cmd)}\n{result.stderr}") + raise RuntimeError(f"Git command failed: {' '.join(cmd)}\n{result.stderr}") + return result.stdout.strip() + + +def _clone_subrepo(repo_url: str, branch: str, destination: Path) -> None: + """Clone a specific branch from the given GitHub repository into the destination path.""" + _run_git( + [ + "clone", + "--branch", + branch, + "--single-branch", + f"https://github.com/{repo_url}", + str(destination), + ] + ) + logger.debug(f"Cloned {repo_url} into {destination}") + + +def _configure_git_user(repo_path: Path) -> None: + """Configure git user.name and user.email for the given repository directory.""" + _run_git(["config", "user.name", "assistant-librarian[bot]"], cwd=repo_path) + _run_git( + ["config", "user.email", "assistant-librarian[bot]@users.noreply.github.com"], + cwd=repo_path, + ) + + +def _apply_patch(repo_path: Path, patch_path: Path) -> None: + """Apply a patch file to the working tree.""" + _run_git(["apply", str(patch_path)], cwd=repo_path) + logger.info(f"Applied patch to working tree at {repo_path}") + + +def _stage_changes(repo_path: Path) -> None: + """Stage all changes in the repository.""" + _run_git(["add", "."], cwd=repo_path) + logger.debug(f"Staged all changes in {repo_path}") + + +def _extract_commit_message_from_patch(patch_path: Path) -> str: + """Extract and clean the original commit message from the patch file, + removing '[PATCH]' and trailing PR references like (#NN) from the title.""" + with open(patch_path, "r", encoding="utf-8") as f: + lines = f.readlines() + commit_msg_lines = [] + in_msg = False + for line in lines: + if line.startswith("Subject: "): + subject = line[len("Subject: ") :].strip() + # Remove leading "[PATCH]" if present + if subject.startswith("[PATCH]"): + subject = subject[len("[PATCH]") :].strip() + # Remove trailing PR refs like (#NN) + subject = re.sub(r"\s*\(#\d+\)$", "", subject) + commit_msg_lines.append(subject + "\n") + 
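# after the subject, keep collecting body lines until the '---' separator + 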
in_msg = True + elif in_msg: + if line.startswith("---"): + break + commit_msg_lines.append(line) + return "".join(commit_msg_lines).strip() + + +def _format_commit_message( + monorepo_url: str, pr_number: int, merge_sha: str, original_msg: str +) -> str: + """Prepend a sync annotation to the original commit message.""" + annotation = ( + f"[rocm-libraries] {monorepo_url}#{pr_number} (commit {merge_sha[:7]})\n\n" + ) + return annotation + original_msg + + +def _commit_changes( + repo_path: Path, message: str, author_name: str, author_email: str +) -> None: + """Commit staged changes with the specified author and message.""" + _run_git( + ["commit", "--author", f"{author_name} <{author_email}>", "-m", message], + cwd=repo_path, + ) + logger.debug(f"Committed changes with author {author_name} <{author_email}>") + + +def _set_authenticated_remote(repo_path: Path, repo_url: str) -> None: + """Set the push URL to use the GitHub App token from GH_TOKEN env.""" + token = os.environ.get("GH_TOKEN") + if not token: + raise RuntimeError("GH_TOKEN environment variable is not set") + remote_url = f"https://x-access-token:{token}@github.com/{repo_url}.git" + _run_git(["remote", "set-url", "origin", remote_url], cwd=repo_path) + + +def _push_changes(repo_path: Path, branch: str) -> None: + """Push the commit to the origin of the given branch.""" + _run_git(["push", "origin", branch], cwd=repo_path) + logger.debug(f"Pushed changes from {repo_path} to origin") + + +def generate_patch(prefix: str, merge_sha: str, patch_path: Path) -> None: + """Generate a patch file for a given subtree prefix from a merge commit.""" + args = [ + "format-patch", + "-1", + merge_sha, + f"--relative={prefix}", + "--output", + str(patch_path), + ] + _run_git(args) + logger.debug(f"Generated patch for prefix '{prefix}' at {patch_path}") + + +def resolve_patch_author( + client: GitHubCLIClient, repo: str, pr: int +) -> tuple[str, str]: + """Determine the appropriate author for the patch. + Returns: (author_name, author_email)""" + pr_data = client.get_pr_by_number(repo, pr) + body = pr_data.get("body", "") or "" + match = re.search(r"Originally authored by @([A-Za-z0-9_-]+)", body) + if match: + username = match.group(1) + logger.debug(f"Found originally authored username in PR body: @{username}") + else: + username = pr_data["user"]["login"] + logger.debug(f"No explicit original author, using PR author: @{username}") + name, email = client.get_user(username) + return name or username, email + + +def apply_patch_to_subrepo( + entry: RepoEntry, + monorepo_url: str, + monorepo_pr: int, + patch_path: Path, + author_name: str, + author_email: str, + merge_sha: str, + dry_run: bool = False, +) -> None: + """Clone the subrepo, apply the patch, and attribute to the original author with commit message annotations.""" + with tempfile.TemporaryDirectory() as tmpdir: + subrepo_path = Path(tmpdir) / entry.name + _clone_subrepo(entry.url, entry.branch, subrepo_path) + if dry_run: + logger.info( + f"[Dry-run] Would apply patch to {entry.url} as {author_name} <{author_email}>" + ) + return + _configure_git_user(subrepo_path) + _apply_patch(subrepo_path, patch_path) + _stage_changes(subrepo_path) + original_commit_msg = _extract_commit_message_from_patch(patch_path) + commit_msg = _format_commit_message( + monorepo_url, monorepo_pr, merge_sha, original_commit_msg + ) + _commit_changes(subrepo_path, commit_msg, author_name, author_email) + _set_authenticated_remote(subrepo_path, entry.url) + _push_changes(subrepo_path, entry.branch) + logger.info( + f"Patch 
applied, committed, and pushed to {entry.url} as {author_name} <{author_email}>" + ) + + +def main(argv: Optional[List[str]] = None) -> None: + """Main function to apply patches to sub-repositories.""" + args = parse_arguments(argv) + logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO) + client = GitHubCLIClient() + config = load_repo_config(args.config) + subtrees = [line.strip() for line in args.subtrees.splitlines() if line.strip()] + relevant_subtrees = get_subtree_info(config, subtrees) + merge_sha = client.get_squash_merge_commit(args.repo, args.pr) + logger.debug(f"Merge commit for PR #{args.pr} in {args.repo}: {merge_sha}") + for entry in relevant_subtrees: + prefix = f"{entry.category}/{entry.name}" + logger.debug(f"Processing subtree {prefix}") + with tempfile.TemporaryDirectory() as tmpdir: + patch_file = Path(tmpdir) / f"{entry.name}.patch" + generate_patch(prefix, merge_sha, patch_file) + author_name, author_email = resolve_patch_author(client, args.repo, args.pr) + apply_patch_to_subrepo( + entry, + args.repo, + args.pr, + patch_file, + author_name, + author_email, + merge_sha, + args.dry_run, + ) + + +if __name__ == "__main__": + main() diff --git a/.github/scripts/repo_config_model.py b/.github/scripts/repo_config_model.py new file mode 100644 index 0000000000000..34892c25554f6 --- /dev/null +++ b/.github/scripts/repo_config_model.py @@ -0,0 +1,63 @@ +#!/usr/bin/env python3 + +""" +Repository Config Model +------------------------ + +This module defines Pydantic data models for validating and parsing the repos-config.json file. + +Structure of the expected JSON: + +{ + "repositories": [ + { + "name": "rocblas", + "url": "ROCm/rocBLAS", + "branch": "develop", + "category": "projects", + "auto_subtree_pull": false, + "auto_subtree_push": true, + "monorepo_source_of_truth": true + }, + ... + ] +} +""" + +from typing import List + +from pydantic import BaseModel + + +class RepoEntry(BaseModel): + """ + Represents a single repository entry in the repos-config.json file. + + Fields: + name : Name of the project matching packaging file names. Lower-cased and no underscores. (e.g., "rocblas") + url : Individual GitHub org plus repo names in matching case and punctuation. (e.g., "ROCm/rocBLAS") + branch : The base branch of the sub-repo to target (e.g., "develop"). + category : Directory category in the monorepo (e.g., "projects" or "shared"). + auto_subtree_pull : Whether to automatically pull updates from the individual repo to the monorepo. + auto_subtree_push : Whether to automatically push changes from the monorepo to the individual repo. + monorepo_source_of_truth : Whether the monorepo is the source of truth for this project. + """ + + name: str + url: str + branch: str + category: str + auto_subtree_pull: bool + auto_subtree_push: bool + monorepo_source_of_truth: bool + + +class RepoConfig(BaseModel): + """ + Represents the full config file structure. + + Fields: + repositories : List of RepoEntry items. 
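+ + Instantiating RepoConfig(**data) raises a pydantic ValidationError if any required field is missing or has the wrong type.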
+ """ + + repositories: List[RepoEntry] diff --git a/.github/scripts/tests/therock_configure_ci_test.py b/.github/scripts/tests/therock_configure_ci_test.py new file mode 100644 index 0000000000000..50ef66cc00d17 --- /dev/null +++ b/.github/scripts/tests/therock_configure_ci_test.py @@ -0,0 +1,219 @@ +from pathlib import Path +import os +import sys +import unittest +from unittest.mock import patch, MagicMock + +sys.path.insert(0, os.fspath(Path(__file__).parent.parent)) +import therock_configure_ci + + +class ConfigureCITest(unittest.TestCase): + @patch("subprocess.run") + def test_pull_request(self, mock_run): + args = { + "is_pull_request": True, + } + + mock_process = MagicMock() + mock_process.stdout = "projects/rocprim/src/main.cpp\nprojects/hipcub/src/main.cpp\nprojects/rocwmma/src/main.cpp" + mock_run.return_value = mock_process + + project_to_run, test_type = therock_configure_ci.retrieve_projects(args) + self.assertIn("rocprim", str(project_to_run)) + self.assertIn("hipcub", str(project_to_run)) + self.assertIn("rocwmma", str(project_to_run)) + self.assertEqual(test_type, "full") + + @patch("subprocess.run") + def test_pull_request_empty(self, mock_run): + args = {"is_pull_request": True, "input_subtrees": ""} + + mock_process = MagicMock() + mock_process.stdout = "" + mock_run.return_value = mock_process + + project_to_run, test_type = therock_configure_ci.retrieve_projects(args) + self.assertEqual(len(project_to_run), 0) + + @patch("subprocess.run") + def test_workflow_dispatch(self, mock_run): + args = { + "is_workflow_dispatch": True, + "input_projects": "projects/rocprim projects/hipcub", + } + + mock_process = MagicMock() + mock_process.stdout = "" + mock_run.return_value = mock_process + + project_to_run, test_type = therock_configure_ci.retrieve_projects(args) + self.assertIn("rocprim", str(project_to_run)) + self.assertIn("hipcub", str(project_to_run)) + self.assertEqual(test_type, "full") + + @patch("subprocess.run") + def test_workflow_dispatch_bad_input(self, mock_run): + args = { + "is_workflow_dispatch": True, + "input_projects": "projects/rocprim$$projects/hipcub", + } + + mock_process = MagicMock() + mock_process.stdout = "" + mock_run.return_value = mock_process + + project_to_run, test_type = therock_configure_ci.retrieve_projects(args) + self.assertEqual(len(project_to_run), 0) + + @patch("subprocess.run") + def test_workflow_dispatch_all(self, mock_run): + args = {"is_workflow_dispatch": True, "input_projects": "all"} + + mock_process = MagicMock() + mock_process.stdout = "" + mock_run.return_value = mock_process + + project_to_run, test_type = therock_configure_ci.retrieve_projects(args) + self.assertGreaterEqual(len(project_to_run), 5) + self.assertEqual(test_type, "full") + + @patch("subprocess.run") + def test_workflow_dispatch_empty(self, mock_run): + args = {"is_workflow_dispatch": True, "input_projects": ""} + + mock_process = MagicMock() + mock_process.stdout = "" + mock_run.return_value = mock_process + + project_to_run, test_type = therock_configure_ci.retrieve_projects(args) + self.assertEqual(len(project_to_run), 0) + + @patch("subprocess.run") + def test_is_push(self, mock_run): + args = { + "is_push": True, + } + + mock_process = MagicMock() + mock_process.stdout = "projects/rocprim/src/main.cpp" + mock_run.return_value = mock_process + + project_to_run, test_type = therock_configure_ci.retrieve_projects(args) + self.assertIn("rocprim", str(project_to_run)) + self.assertEqual(test_type, "full") + + def 
test_is_path_workflow_file_related_to_ci(self): + workflow_path = ".github/workflows/therocktest.yml" + self.assertTrue( + therock_configure_ci.is_path_workflow_file_related_to_ci(workflow_path) + ) + script_path = ".github/scripts/therocktest.py" + self.assertTrue( + therock_configure_ci.is_path_workflow_file_related_to_ci(script_path) + ) + bad_path = ".github/workflows/test.yml" + self.assertFalse( + therock_configure_ci.is_path_workflow_file_related_to_ci(bad_path) + ) + + def test_is_path_skippable(self): + # Skippable paths + self.assertTrue(therock_configure_ci.is_path_skippable("README.md")) + self.assertTrue(therock_configure_ci.is_path_skippable("docs/guide.rst")) + self.assertTrue( + therock_configure_ci.is_path_skippable("projects/rocprim/.gitignore") + ) + self.assertTrue( + therock_configure_ci.is_path_skippable("projects/hipcub/CHANGELOG.md") + ) + self.assertTrue( + therock_configure_ci.is_path_skippable( + "projects/rocwmma/docs/sphinx/requirements.in" + ) + ) + self.assertTrue( + therock_configure_ci.is_path_skippable( + "shared/tensile/docs/sphinx/requirements.in" + ) + ) + # Non-skippable paths + self.assertFalse( + therock_configure_ci.is_path_skippable("projects/rocprim/src/main.cpp") + ) + self.assertFalse(therock_configure_ci.is_path_skippable("CMakeLists.txt")) + + def test_check_for_non_skippable_path(self): + # All skippable + self.assertFalse( + therock_configure_ci.check_for_non_skippable_path( + ["README.md", "docs/guide.rst", ".gitignore"] + ) + ) + # Contains non-skippable + self.assertTrue( + therock_configure_ci.check_for_non_skippable_path( + ["README.md", "projects/rocprim/src/main.cpp"] + ) + ) + # None and empty + self.assertFalse(therock_configure_ci.check_for_non_skippable_path(None)) + self.assertFalse(therock_configure_ci.check_for_non_skippable_path([])) + + @patch("therock_configure_ci.get_modified_paths") + def test_retrieve_projects_skips_ci_for_skippable_paths(self, mock_get_modified): + mock_get_modified.return_value = [ + "README.md", + "docs/guide.rst", + "projects/rocprim/.gitignore", + ] + + projects, test_type = therock_configure_ci.retrieve_projects( + {"is_pull_request": True, "base_ref": "HEAD^"} + ) + + self.assertEqual(projects, []) + self.assertEqual(test_type, "full") + + @patch("therock_configure_ci.get_modified_paths") + def test_retrieve_projects_runs_ci_for_non_skippable_paths(self, mock_get_modified): + mock_get_modified.return_value = ["README.md", "projects/rocprim/src/main.cpp"] + + projects, test_type = therock_configure_ci.retrieve_projects( + {"is_pull_request": True, "base_ref": "HEAD^"} + ) + + self.assertIn("rocprim", str(projects)) + self.assertEqual(test_type, "full") + + @patch("therock_configure_ci.get_modified_paths") + def test_retrieve_projects_runs_ci_for_two_projects(self, mock_get_modified): + mock_get_modified.return_value = [ + "README.md", + "projects/rocprim/src/main.cpp", + "projects/hipcub/src/main.cpp", + ] + + projects, test_type = therock_configure_ci.retrieve_projects( + {"is_pull_request": True, "base_ref": "HEAD^"} + ) + + self.assertIn("rocprim", str(projects)) + self.assertIn("hipcub", str(projects)) + self.assertEqual(test_type, "full") + + @patch("therock_configure_ci.get_modified_paths") + def test_retrieve_projects_runs_ci_for_workflow_paths(self, mock_get_modified): + mock_get_modified.return_value = [".github/workflows/therock-ci.yml"] + + projects, test_type = therock_configure_ci.retrieve_projects( + {"is_pull_request": True, "base_ref": "HEAD^"} + ) + + # All projects should be tested 
with smoke tests; make sure we get at least 5 projects
+        self.assertGreaterEqual(len(projects), 5)
+        self.assertEqual(test_type, "smoke")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/.github/scripts/tests/therock_matrix_test.py b/.github/scripts/tests/therock_matrix_test.py
new file mode 100644
index 0000000000000..223c3ebd9701e
--- /dev/null
+++ b/.github/scripts/tests/therock_matrix_test.py
@@ -0,0 +1,43 @@
+from pathlib import Path
+import os
+import sys
+import unittest
+
+sys.path.insert(0, os.fspath(Path(__file__).parent.parent))
+import therock_matrix
+
+
+class TheRockMatrixTest(unittest.TestCase):
+    def test_collect_projects_to_run_without_additional_option(self):
+        subtrees = ["projects/hipblaslt"]
+
+        project_to_run = therock_matrix.collect_projects_to_run(subtrees)
+        self.assertEqual(len(project_to_run), 1)
+
+    def test_collect_projects_to_run(self):
+        subtrees = ["projects/rocsparse", "projects/hipblaslt"]
+
+        project_to_run = therock_matrix.collect_projects_to_run(subtrees)
+        self.assertEqual(len(project_to_run), 1)
+
+    def test_collect_projects_to_run_additional_option(self):
+        subtrees = ["projects/rocsparse"]
+
+        project_to_run = therock_matrix.collect_projects_to_run(subtrees)
+        self.assertEqual(len(project_to_run), 1)
+
+    def test_collect_projects_to_run_dependency_graph(self):
+        subtrees = ["projects/miopen", "projects/hipblaslt"]
+
+        project_to_run = therock_matrix.collect_projects_to_run(subtrees)
+        self.assertEqual(len(project_to_run), 1)
+
+    def test_collect_projects_to_run_dependency_graph_diff_projects(self):
+        subtrees = ["projects/miopen", "projects/rocwmma"]
+
+        project_to_run = therock_matrix.collect_projects_to_run(subtrees)
+        self.assertEqual(len(project_to_run), 2)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/.github/scripts/therock_configure_ci.py b/.github/scripts/therock_configure_ci.py
new file mode 100644
index 0000000000000..c79d75508a0c1
--- /dev/null
+++ b/.github/scripts/therock_configure_ci.py
@@ -0,0 +1,204 @@
+"""
+This script determines which build flags and tests to run based on which subtrees changed.
+
+Required environment variables:
+ - GITHUB_EVENT_NAME
+ - PLATFORM
+Optional environment variables:
+ - PROJECTS (project list for workflow_dispatch runs)
+ - BASE_REF (diff base reference, defaults to "HEAD^")
+"""
+
+import fnmatch
+import json
+import logging
+import subprocess
+from pathlib import Path
+import sys
+from therock_matrix import subtree_to_project_map, collect_projects_to_run
+import time
+from typing import Mapping, Optional, Iterable
+import os
+from pr_detect_changed_subtrees import get_valid_prefixes, find_matched_subtrees
+from config_loader import load_repo_config
+
+logging.basicConfig(level=logging.INFO)
+SCRIPT_DIR = Path(__file__).resolve().parent
+
+# Paths matching any of these patterns are considered to have no influence over
+# build or test workflows, so any related jobs can be skipped if all paths
+# modified by a commit/PR match a pattern in this list.
+SKIPPABLE_PATH_PATTERNS = [
+    "docs/*",
+    ".gitignore",
+    "*.md",
+    "*.rst",
+    "projects/*/docs/*",
+    "projects/*/.gitignore",
+    "projects/*/*.md",
+    "projects/*/*.rst",
+    "shared/*/docs/*",
+    "shared/*/.gitignore",
+    "shared/*/*.md",
+    "shared/*/*.rst",
+]
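One subtlety worth noting here (not part of the diff itself): `fnmatch` does not treat `/` as a special character, so a pattern like `"docs/*"` also matches nested paths such as `docs/sphinx/conf.py`. A minimal standalone sketch of the matching semantics these patterns rely on, with hypothetical example paths:

```python
# Illustrative only: how fnmatch treats the patterns above.
# Unlike shell globs, fnmatch's "*" also matches "/" characters.
import fnmatch

SKIPPABLE = ["docs/*", "*.md", "projects/*/docs/*"]

for path in [
    "docs/guide.rst",                  # matches "docs/*"
    "docs/sphinx/conf.py",             # also matches "docs/*" ("*" crosses "/")
    "projects/rocprim/README.md",      # matches "*.md"
    "projects/rocprim/src/main.cpp",   # matches nothing -> not skippable
]:
    hits = [p for p in SKIPPABLE if fnmatch.fnmatch(path, p)]
    print(path, "->", hits or "not skippable")
```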
+
+
+def is_path_skippable(path: str) -> bool:
+    """Determines if a given relative path to a file matches any skippable patterns."""
+    return any(fnmatch.fnmatch(path, pattern) for pattern in SKIPPABLE_PATH_PATTERNS)
+
+
+def check_for_non_skippable_path(paths: Optional[Iterable[str]]) -> bool:
+    """Returns true if at least one path is not in the skippable set."""
+    if paths is None:
+        return False
+    return any(not is_path_skippable(p) for p in paths)
+
+
+def set_github_output(d: Mapping[str, str]):
+    """Sets GITHUB_OUTPUT values.
+    See https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/passing-information-between-jobs
+    """
+    logging.info(f"Setting github output:\n{d}")
+    step_output_file = os.environ.get("GITHUB_OUTPUT", "")
+    if not step_output_file:
+        logging.warning(
+            "GITHUB_OUTPUT env var not set, can't set github outputs"
+        )
+        return
+    with open(step_output_file, "a") as f:
+        f.writelines(f"{k}={v}" + "\n" for k, v in d.items())
+
+
+def retry(max_attempts, delay_seconds, exceptions):
+    def decorator(func):
+        def newfn(*args, **kwargs):
+            attempt = 0
+            while attempt < max_attempts:
+                try:
+                    return func(*args, **kwargs)
+                except exceptions as e:
+                    print(
+                        f"Exception {e} thrown when attempting to run {func.__name__}, attempt {attempt + 1} of {max_attempts}"
+                    )
+                    attempt += 1
+                    if attempt < max_attempts:
+                        backoff = delay_seconds * (2 ** (attempt - 1))
+                        time.sleep(backoff)
+            return func(*args, **kwargs)
+
+        return newfn
+
+    return decorator
+
+
+@retry(max_attempts=3, delay_seconds=2, exceptions=(TimeoutError,))
+def get_modified_paths(base_ref: str) -> Optional[Iterable[str]]:
+    """Returns the paths of modified files relative to the base reference."""
+    return subprocess.run(
+        ["git", "diff", "--name-only", base_ref],
+        stdout=subprocess.PIPE,
+        check=True,
+        text=True,
+        timeout=60,
+    ).stdout.splitlines()
+
+
+GITHUB_WORKFLOWS_CI_PATTERNS = [
+    "therock*",
+]
+
+
+def is_path_workflow_file_related_to_ci(path: str) -> bool:
+    return any(
+        fnmatch.fnmatch(path, ".github/workflows/" + pattern)
+        for pattern in GITHUB_WORKFLOWS_CI_PATTERNS
+    ) or any(
+        fnmatch.fnmatch(path, ".github/scripts/" + pattern)
+        for pattern in GITHUB_WORKFLOWS_CI_PATTERNS
+    )
+
+
+def check_for_workflow_file_related_to_ci(paths: Optional[Iterable[str]]) -> bool:
+    if paths is None:
+        return False
+    return any(is_path_workflow_file_related_to_ci(p) for p in paths)
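For context on the helper below: `load_repo_config` and `get_valid_prefixes` come from `config_loader.py` and `pr_detect_changed_subtrees.py`, which are not part of this diff. A minimal sketch of what the prefix derivation plausibly amounts to, using the `RepoConfig` model added in `repo_config_model.py`; the helper internals shown here are assumptions, not their exact implementations:

```python
import json
from pathlib import Path

from repo_config_model import RepoConfig

# Parse and validate repos-config.json with the Pydantic model from this PR.
raw = json.loads(Path(".github/repos-config.json").read_text())
config = RepoConfig(**raw)

# A changed path like "projects/rocblas/library/src/handle.cpp" is matched
# against "<category>/<name>" prefixes derived from each config entry.
valid_prefixes = {f"{entry.category}/{entry.name}" for entry in config.repositories}
print(sorted(valid_prefixes)[:3])
```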
/ "repos-config.json") + config = load_repo_config(str(repo_config_path)) + valid_prefixes = get_valid_prefixes(config) + matched_subtrees = find_matched_subtrees(paths, valid_prefixes) + return matched_subtrees + + +def retrieve_projects(args): + # For pushes and pull_requests, we only want to test changed projects + base_ref = args.get("base_ref") + modified_paths = get_modified_paths(base_ref) + + # by default, we select full tests + test_type = "full" + + # Check if CI should be skipped based on modified paths + # (only for push and pull_request events, not workflow_dispatch or nightly) + if args.get("is_push") or args.get("is_pull_request"): + paths_set = set(modified_paths) + contains_non_skippable_files = check_for_non_skippable_path(paths_set) + + # If only skippable paths were modified, skip CI + if not contains_non_skippable_files: + logging.info("Only skippable paths were modified, skipping CI") + return [], test_type + + subtrees = get_changed_path_projects(modified_paths) + + if args.get("is_workflow_dispatch"): + if args.get("input_projects") == "all": + subtrees = list(subtree_to_project_map.keys()) + else: + subtrees = args.get("input_projects").split() + + # If .github/*/therock* were changed for a push or pull request, run all subtrees + if args.get("is_push") or args.get("is_pull_request"): + related_to_therock_ci = check_for_workflow_file_related_to_ci(modified_paths) + if related_to_therock_ci: + logging.info( + "Enabling all projects since a related workflow file was modified" + ) + subtrees = list(subtree_to_project_map.keys()) + test_type = "smoke" + + # for nightly runs, run everything with full tests + if args.get("is_nightly"): + subtrees = list(subtree_to_project_map.keys()) + + project_to_run = collect_projects_to_run(subtrees) + + return project_to_run, test_type + + +def run(args): + platform = args.get("platform") + project_to_run, test_type = retrieve_projects(args) + set_github_output( + {f"{platform}_projects": json.dumps(project_to_run), "test_type": test_type} + ) + + +if __name__ == "__main__": + args = {} + github_event_name = os.getenv("GITHUB_EVENT_NAME") + platform = os.getenv("PLATFORM") + args["platform"] = platform + args["is_pull_request"] = github_event_name == "pull_request" + args["is_push"] = github_event_name == "push" + args["is_workflow_dispatch"] = github_event_name == "workflow_dispatch" + args["is_nightly"] = github_event_name == "schedule" + + input_projects = os.getenv("PROJECTS", "") + args["input_projects"] = input_projects + + args["base_ref"] = os.environ.get("BASE_REF", "HEAD^") + + logging.info(f"Retrieved arguments {args}") + + run(args) diff --git a/.github/scripts/therock_matrix.py b/.github/scripts/therock_matrix.py new file mode 100644 index 0000000000000..92fa780b03e37 --- /dev/null +++ b/.github/scripts/therock_matrix.py @@ -0,0 +1,184 @@ +""" +This dictionary is used to map specific file directory changes to the corresponding build flag and tests +""" + +import os + +subtree_to_project_map = { + "projects/hipblas": "blas", + "projects/hipblas-common": "blas", + "projects/hipblaslt": "blas", + "projects/hipcub": "prim", + "projects/hipdnn": "hipdnn", + "projects/hipfft": "fft", + "projects/hiprand": "rand", + "projects/hipsolver": "solver", + "projects/hipsparse": "sparse", + "projects/hipsparselt": "sparse", + "projects/miopen": "miopen", + "projects/rocblas": "blas", + "project/rocfft": "fft", + "projects/rocprim": "prim", + "projects/rocrand": "rand", + "projects/rocsolver": "solver", + "projects/rocsparse": 
"sparse", + "projects/rocthrust": "prim", + "projects/rocwmma": "rocwmma", + "shared/mxdatagenerator": "blas", + "shared/origami": "blas", + "shared/rocroller": "blas", + "shared/tensile": "blas", +} + +project_map = { + "prim": { + "cmake_options": ["-DTHEROCK_ENABLE_PRIM=ON"], + "project_to_test": ["rocprim", "rocthrust", "hipcub"], + }, + "rand": { + "cmake_options": ["-DTHEROCK_ENABLE_RAND=ON"], + "project_to_test": ["rocrand", "hiprand"], + }, + "blas": { + "cmake_options": ["-DTHEROCK_ENABLE_BLAS=ON"], + "project_to_test": ["hipblaslt", "rocblas", "hipblas", "rocroller"], + }, + "miopen": { + "cmake_options": [ + "-DTHEROCK_ENABLE_MIOPEN=ON", + "-DTHEROCK_ENABLE_MIOPEN_PLUGIN=ON", + ], + "additional_flags": { + # As composable_kernel is not enabled for Windows, we only enable these flags during Linux builds + "linux": [ + "-DTHEROCK_ENABLE_COMPOSABLE_KERNEL=ON", + "-DTHEROCK_USE_EXTERNAL_COMPOSABLE_KERNEL=ON", + "-DTHEROCK_COMPOSABLE_KERNEL_SOURCE_DIR=../composable_kernel", + ] + }, + "project_to_test": ["miopen", "miopen_plugin"], + }, + "fft": { + "cmake_options": ["-DTHEROCK_ENABLE_FFT=ON", "-DTHEROCK_ENABLE_RAND=ON"], + "project_to_test": ["hipfft", "rocfft"], + }, + "hipdnn": { # due to MIOpen plugin project being inside the hipDNN directory, we cannot have the MIOpen plugin project as a separate project for now https://github.com/ROCm/rocm-libraries/issues/2316 + "cmake_options": ["-DTHEROCK_ENABLE_MIOPEN_PLUGIN=ON"], + "additional_flags": { + # As composable_kernel is not enabled for Windows, we only enable these flags during Linux builds + "linux": [ + "-DTHEROCK_ENABLE_COMPOSABLE_KERNEL=ON", + "-DTHEROCK_USE_EXTERNAL_COMPOSABLE_KERNEL=ON", + "-DTHEROCK_COMPOSABLE_KERNEL_SOURCE_DIR=../composable_kernel", + ] + }, + "project_to_test": ["hipdnn", "miopen_plugin"], + }, + "rocwmma": { + "cmake_options": ["-DTHEROCK_ENABLE_ROCWMMA=ON"], + "project_to_test": ["rocwmma"], + }, +} + +# For certain math components, they are optional during building and testing. +# As they are optional, we do not want to include them as default as this takes more time in the CI. +# However, if we run a separate build for optional components, those files will be overriden as these components share the same umbrella as other projects +# Example: SPARSE is included in BLAS, but a separate build would cause overwriting of the blas_lib.tar.xz and blas_test.tar.xz and be missing libraries and tests +additional_options = { + "sparse": { + "cmake_options": ["-DTHEROCK_ENABLE_SPARSE=ON"], + "project_to_test": ["rocsparse", "hipsparse", "hipsparselt"], + "project_to_add": "blas", + }, + "solver": { + "cmake_options": ["-DTHEROCK_ENABLE_SOLVER=ON"], + "project_to_test": ["rocsolver", "hipsolver"], + "project_to_add": "blas", + }, +} + +# If a project has dependencies that are also being built, we combine build options and test options +# This way, there will be no S3 upload overlap and we save redundant builds +dependency_graph = { + "miopen": ["blas", "rand"], +} + + +def collect_projects_to_run(subtrees): + platform = os.getenv("PLATFORM") + projects = set() + # collect the associated subtree to project + for subtree in subtrees: + if subtree in subtree_to_project_map: + projects.add(subtree_to_project_map.get(subtree)) + + for project in list(projects): + # Check if an optional math component was included. 
+
+
+def collect_projects_to_run(subtrees):
+    platform = os.getenv("PLATFORM")
+    projects = set()
+    # Collect the project associated with each subtree
+    for subtree in subtrees:
+        if subtree in subtree_to_project_map:
+            projects.add(subtree_to_project_map.get(subtree))
+
+    for project in list(projects):
+        # Check if an optional math component was included.
+        if project in additional_options:
+            project_options_to_add = additional_options[project]
+
+            project_to_add = project_options_to_add["project_to_add"]
+            # If `project_to_add` is included, add options to the existing `project_map` entry
+            if project_to_add in projects:
+                project_map[project_to_add]["cmake_options"].extend(
+                    project_options_to_add["cmake_options"]
+                )
+                project_map[project_to_add]["project_to_test"].extend(
+                    project_options_to_add["project_to_test"]
+                )
+            # If `project_to_add` is not included, only run build and tests for the optional project
+            else:
+                projects.add(project_to_add)
+                project_map[project_to_add]["cmake_options"] = project_options_to_add[
+                    "cmake_options"
+                ]
+                project_map[project_to_add]["project_to_test"] = project_options_to_add[
+                    "project_to_test"
+                ]
+
+    # Check for potential dependencies
+    to_remove_from_project_map = []
+    for project in list(projects):
+        # Check if the project has dependencies to combine
+        if project in dependency_graph:
+            for dependency in dependency_graph[project]:
+                # If the dependency is also included, combine the two to avoid overlap
+                if dependency in projects:
+                    project_map[project]["cmake_options"].extend(
+                        project_map[dependency]["cmake_options"]
+                    )
+                    project_map[project]["project_to_test"].extend(
+                        project_map[dependency]["project_to_test"]
+                    )
+                    to_remove_from_project_map.append(dependency)
+
+    # If a dependency is included in projects and its parent is present, delete the
+    # dependency, as the parent will build and test it.
+    for to_remove_item in to_remove_from_project_map:
+        projects.remove(to_remove_item)
+        del project_map[to_remove_item]
+
+    # Retrieve the subtrees to checkout, cmake options to build, and projects to test
+    project_to_run = []
+    for project in projects:
+        if project in project_map:
+            project_map_data = project_map.get(project)
+
+            # Check if platform-based additional flags are needed
+            if (
+                "additional_flags" in project_map_data
+                and platform in project_map_data["additional_flags"]
+            ):
+                project_map_data["cmake_options"].extend(
+                    project_map_data["additional_flags"][platform]
+                )
+
+            # To save time, only build what is needed
+            project_map_data["cmake_options"].extend(["-DTHEROCK_ENABLE_ALL=OFF"])
+
+            cmake_flag_options = " ".join(project_map_data["cmake_options"])
+            project_to_test_options = ",".join(project_map_data["project_to_test"])
+            project_map_data["cmake_options"] = cmake_flag_options
+            project_map_data["project_to_test"] = project_to_test_options
+            project_to_run.append(project_map_data)
+
+    return project_to_run
diff --git a/.github/workflows/build_linux_jax_wheels.yml b/.github/workflows/build_linux_jax_wheels.yml
new file mode 100644
index 0000000000000..8b4f18ae5d9a7
--- /dev/null
+++ b/.github/workflows/build_linux_jax_wheels.yml
@@ -0,0 +1,290 @@
+name: Build Portable Linux JAX Wheels
+
+on:
+  workflow_call:
+    inputs:
+      amdgpu_family:
+        required: true
+        type: string
+      python_version:
+        required: true
+        type: string
+      release_type:
+        description: The type of release to build ("dev", "nightly", or "prerelease"). All developer-triggered jobs should use "dev"!
+ required: true + type: string + s3_subdir: + description: S3 subdirectory, not including the GPU-family + required: true + type: string + s3_staging_subdir: + description: S3 staging subdirectory, not including the GPU-family + required: true + type: string + rocm_version: + description: ROCm version to install + type: string + tar_url: + description: URL to TheRock tarball to build against + type: string + cloudfront_url: + description: CloudFront URL pointing to Python index + required: true + type: string + cloudfront_staging_url: + description: CloudFront base URL pointing to staging Python index + required: true + type: string + repository: + description: "Repository to checkout. Defaults to `ROCm/TheRock`." + type: string + default: "ROCm/TheRock" + ref: + description: "Branch, tag or SHA to checkout. Defaults to the reference or SHA that triggered the workflow." + type: string + workflow_dispatch: + inputs: + amdgpu_family: + type: choice + options: + - gfx101X-dgpu + - gfx103X-dgpu + - gfx110X-all + - gfx1150 + - gfx1151 + - gfx120X-all + - gfx90X-dcgpu + - gfx94X-dcgpu + - gfx950-dcgpu + default: gfx94X-dcgpu + python_version: + required: true + type: string + default: "3.12" + release_type: + type: choice + description: Type of release to create. All developer-triggered jobs should use "dev"! + options: + - dev + - nightly + - prerelease + default: dev + s3_subdir: + description: S3 subdirectory, not including the GPU-family + type: string + default: "v2" + s3_staging_subdir: + description: S3 staging subdirectory, not including the GPU-family + type: string + default: "v2-staging" + rocm_version: + description: ROCm version to install + type: string + tar_url: + description: URL to TheRock tarball to build against + type: string + cloudfront_url: + description: CloudFront base URL pointing to Python index + type: string + default: "https://rocm.devreleases.amd.com/v2" + cloudfront_staging_url: + description: CloudFront base URL pointing to staging Python index + type: string + default: "https://rocm.devreleases.amd.com/v2-staging" + jax_ref: + description: rocm-jax repository ref/branch to check out + type: string + default: rocm-jaxlib-v0.8.0 + +permissions: + id-token: write + contents: read + +run-name: Build Linux JAX Wheels (${{ inputs.amdgpu_family }}, ${{ inputs.python_version }}, ${{ inputs.release_type }}) + +jobs: + build_jax_wheels: + strategy: + matrix: + jax_ref: [rocm-jaxlib-v0.8.0] + name: Build Linux JAX Wheels | ${{ inputs.amdgpu_family }} | Python ${{ inputs.python_version }} + runs-on: ${{ github.repository_owner == 'ROCm' && 'azure-linux-scale-rocm' || 'ubuntu-24.04' }} + env: + PACKAGE_DIST_DIR: ${{ github.workspace }}/jax/jax_rocm_plugin/wheelhouse + S3_BUCKET_PY: "therock-${{ inputs.release_type }}-python" + outputs: + cp_version: ${{ env.cp_version }} + jax_version: ${{ steps.extract_jax_version.outputs.jax_version }} + steps: + - name: Checkout TheRock + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + + - name: Checkout JAX + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + path: jax + repository: rocm/rocm-jax + ref: ${{ matrix.jax_ref }} + + - name: Configure Git Identity + run: | + git config --global user.name "therockbot" + git config --global user.email "therockbot@amd.com" + + - name: "Setting up Python" + uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 + with: + python-version: ${{ inputs.python_version }} + + - name: Select Python version + run: | + 
python build_tools/github_actions/python_to_cp_version.py \ + --python-version ${{ inputs.python_version }} + + - name: Build JAX Wheels + env: + ROCM_VERSION: ${{ inputs.rocm_version }} + run: | + ls -lah + pushd jax + python3 build/ci_build \ + --compiler=clang \ + --python-versions="${{ inputs.python_version }}" \ + --rocm-version="${ROCM_VERSION}" \ + --therock-path="${{ inputs.tar_url }}" \ + dist_wheels + + - name: Extract JAX version + id: extract_jax_version + run: | + # Extract JAX version from requirements.txt (e.g., "jax==0.8.0") + # Remove all whitespace from requirements.txt to simplify parsing + # Search for lines starting with "jax==" or "jaxlib==" followed by version (excluding comments) + # Extract the version number by splitting on '=' and taking the 3rd field + # [^#]+ matches one or more characters that are NOT '#', ensuring we stop before any inline comments + JAX_VERSION=$(tr -d ' ' < jax/build/requirements.txt \ + | grep -E '^(jax|jaxlib)==[^#]+' | head -n1 | cut -d'=' -f3) + echo "jax_version=$JAX_VERSION" >> "$GITHUB_OUTPUT" + + - name: Install AWS CLI + if: always() + run: bash ./dockerfiles/install_awscli.sh + + - name: Configure AWS Credentials + if: always() + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-${{ inputs.release_type }}-releases + + - name: Upload wheels to S3 + if: ${{ github.repository_owner == 'ROCm' }} + run: | + aws s3 cp ${{ env.PACKAGE_DIST_DIR }}/ s3://${{ env.S3_BUCKET_PY }}/${{ inputs.s3_staging_subdir }}/${{ inputs.amdgpu_family }}/ \ + --recursive --exclude "*" --include "*.whl" + + - name: (Re-)Generate Python package release index + if: ${{ github.repository_owner == 'ROCm' }} + run: | + python3 -m venv .venv + source .venv/bin/activate + pip3 install boto3 packaging + python3 ./build_tools/third_party/s3_management/manage.py ${{ inputs.s3_staging_subdir }}/${{ inputs.amdgpu_family }} + + generate_target_to_run: + name: Generate target_to_run + runs-on: ubuntu-24.04 + outputs: + test_runs_on: ${{ steps.configure.outputs.test-runs-on }} + bypass_tests_for_releases: ${{ steps.configure.outputs.bypass_tests_for_releases }} + steps: + - name: Checking out repository + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + repository: ${{ inputs.repository || github.repository }} + ref: ${{ inputs.ref || '' }} + + - name: Generating target to run + id: configure + env: + TARGET: ${{ inputs.amdgpu_family }} + PLATFORM: "linux" + # Variable comes from ROCm organization variable 'ROCM_THEROCK_TEST_RUNNERS' + ROCM_THEROCK_TEST_RUNNERS: ${{ vars.ROCM_THEROCK_TEST_RUNNERS }} + LOAD_TEST_RUNNERS_FROM_VAR: false + run: python ./build_tools/github_actions/configure_target_run.py + + test_jax_wheels: + name: Test JAX wheels | ${{ inputs.amdgpu_family }} | ${{ needs.generate_target_to_run.outputs.test_runs_on }} + needs: [build_jax_wheels, generate_target_to_run] + permissions: + contents: read + packages: read + uses: ./.github/workflows/test_linux_jax_wheels.yml + with: + amdgpu_family: ${{ inputs.amdgpu_family }} + release_type: ${{ inputs.release_type }} + s3_subdir: ${{ inputs.s3_subdir }} + package_index_url: ${{ inputs.cloudfront_staging_url }} + rocm_version: ${{ inputs.rocm_version }} + tar_url: ${{ inputs.tar_url }} + python_version: ${{ inputs.python_version }} + repository: ${{ inputs.repository || github.repository }} + ref: ${{ inputs.ref || '' }} + jax_ref: ${{ 
inputs.jax_ref }} + test_runs_on: ${{ needs.generate_target_to_run.outputs.test_runs_on }} + + upload_jax_wheels: + name: Release JAX Wheels to S3 + needs: [build_jax_wheels, generate_target_to_run, test_jax_wheels] + if: ${{ !cancelled() }} + runs-on: ubuntu-24.04 + env: + S3_BUCKET_PY: "therock-${{ inputs.release_type }}-python" + JAX_VERSION: "${{ needs.build_jax_wheels.outputs.jax_version }}" + ROCM_VERSION: "${{ inputs.rocm_version }}" + CP_VERSION: "${{ needs.build_jax_wheels.outputs.cp_version }}" + + steps: + - name: Checkout + uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5.0.1 + with: + repository: ${{ inputs.repository || github.repository }} + ref: ${{ inputs.ref || '' }} + + - name: Configure AWS Credentials + if: always() + uses: aws-actions/configure-aws-credentials@00943011d9042930efac3dcd3a170e4273319bc8 # v5.1.0 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-${{ inputs.release_type }}-releases + + - name: Determine upload flag + env: + BUILD_RESULT: ${{ needs.build_jax_wheels.result }} + TEST_RESULT: ${{ needs.test_jax_wheels.result }} + TEST_RUNS_ON: ${{ needs.generate_target_to_run.outputs.test_runs_on }} + BYPASS_TESTS_FOR_RELEASES: ${{ needs.generate_target_to_run.outputs.bypass_tests_for_releases }} + run: python ./build_tools/github_actions/promote_wheels_based_on_policy.py + + - name: Copy JAX wheels from staging to release S3 + if: ${{ env.upload == 'true' }} + run: | + echo "Copying exact tested wheels to release S3 bucket..." + aws s3 cp \ + s3://${S3_BUCKET_PY}/${{ inputs.s3_staging_subdir }}/${{ inputs.amdgpu_family }}/ \ + s3://${S3_BUCKET_PY}/${{ inputs.s3_subdir }}/${{ inputs.amdgpu_family }}/ \ + --recursive \ + --exclude "*" \ + --include "jaxlib-${JAX_VERSION}+rocm${ROCM_VERSION}-${CP_VERSION}-manylinux_2_27_x86_64.whl" \ + --include "jax_rocm7_plugin-${JAX_VERSION}+rocm${ROCM_VERSION}-${CP_VERSION}-manylinux_2_28_x86_64.whl" \ + --include "jax_rocm7_pjrt-${JAX_VERSION}+rocm${ROCM_VERSION}-py3-none-manylinux_2_28_x86_64.whl" + + - name: (Re-)Generate Python package release index + if: ${{ env.upload == 'true' }} + env: + # Environment variables to be set for `manage.py` + CUSTOM_PREFIX: "${{ inputs.s3_subdir }}/${{ inputs.amdgpu_family }}" + run: | + pip install boto3 packaging + python ./build_tools/third_party/s3_management/manage.py ${{ env.CUSTOM_PREFIX }} diff --git a/.github/workflows/build_native_linux_packages.yml b/.github/workflows/build_native_linux_packages.yml new file mode 100644 index 0000000000000..ead640630e25c --- /dev/null +++ b/.github/workflows/build_native_linux_packages.yml @@ -0,0 +1,135 @@ +name: Build Native Linux Packages + +on: + workflow_call: + inputs: + artifact_group: + description: gfx arch group for the s3 server + type: string + default: gfx94X-dcgpu + artifact_run_id: + description: workflow run id to download the artifacts from. + required: true + type: string + rocm_version: + description: ROCm version to append to the package (8.0.0, 8.0.1rc1, ...). + required: true + type: string + native_package_type: + description: Specify whether debian or rpm packages are needed (deb or rpm). + required: true + type: string + package_suffix: + description: The suffix to be added to package name (asan, static or rpath). + required: false + type: string + release_type: + description: The type of release to build ("dev", "nightly", or "prerelease"). All developer-triggered jobs should use "dev"! 
+ required: false + type: string + workflow_dispatch: + inputs: + artifact_group: + type: string + default: gfx94X-dcgpu + artifact_run_id: + description: workflow run id to download the artifacts from + type: string + rocm_version: + description: ROCm version to append to the package (8.0.0, 8.0.1rc1, ...). + type: string + default: "0.0.1" + native_package_type: + description: Specify whether debian or rpm packages are needed (deb or rpm). + required: true + type: choice + options: + - rpm + - deb + default: "rpm" + package_suffix: + description: The suffix to be added to package name (asan, static or rpath). + type: string + required: false + release_type: + description: The type of release to build ("dev", "nightly", or "prerelease"). All developer-triggered jobs should use "dev"! + type: string + default: "dev" + +permissions: + id-token: write + contents: read + +run-name: Build native Linux packages (${{ inputs.artifact_group }}, ${{ inputs.rocm_version }}, ${{ inputs.native_package_type }}, ${{ inputs.package_suffix }}, ${{ inputs.release_type }}) + +jobs: + build_native_packages: + name: Build Linux native Packages + strategy: + fail-fast: false + runs-on: ${{ github.repository_owner == 'ROCm' && 'azure-linux-scale-rocm' || 'ubuntu-24.04' }} + env: + BUILD_IMAGE: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:583d473f263a289222c48d4b493e2956b2354a45796f09dee6f2c8ecd4504ab6 + ARTIFACT_RUN_ID: ${{ inputs.artifact_run_id || github.run_id }} + PACKAGE_SUFFIX: ${{ inputs.package_suffix != '' && inputs.package_suffix || '' }} + OUTPUT_DIR: ${{ github.workspace }}/output + ARTIFACTS_DIR: ${{ github.workspace }}/output/artifacts + PACKAGE_DIST_DIR: ${{ github.workspace }}/output/packages + RELEASE_TYPE: ${{ inputs.release_type || '' }} + steps: + - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 + with: + python-version: '3.12' + - name: Install Python requirements + run: | + pip install pyelftools boto3 jinja2 + + - name: Install System requirements + run: | + # Install the needed tools for creating rpm / deb packages + # Also install tools for creating repo files + sudo apt update + sudo apt install -y llvm + sudo apt install -y rpm debhelper-compat build-essential + sudo apt install -y dpkg-dev createrepo-c + + - name: Fetch Artifacts + run: | + echo "Fetching artifacts for build ${{ inputs.artifact_run_id }}" + python ./build_tools/fetch_artifacts.py \ + --run-id=${{ env.ARTIFACT_RUN_ID }} \ + --run-github-repo="ROCm/TheRock" \ + --artifact-group=${{ inputs.artifact_group }} \ + --output-dir=${{ env.ARTIFACTS_DIR }} + + - name: Build Packages + id: build-packages + run: | + echo "Building ${{ inputs.native_package_type }} packages for ${{ inputs.artifact_group }} ${{ inputs.artifact_run_id }}" + python ./build_tools/packaging/linux/build_package.py \ + --dest-dir ${{ env.PACKAGE_DIST_DIR }} \ + --rocm-version ${{ inputs.rocm_version }} \ + --target ${{ inputs.artifact_group }} \ + --artifacts-dir ${{ env.ARTIFACTS_DIR }} \ + --pkg-type ${{ inputs.native_package_type }} \ + --version-suffix ${{ env.ARTIFACT_RUN_ID }} + + - name: Install AWS CLI + run: bash ./dockerfiles/install_awscli.sh + + - name: Configure AWS Credentials for non-forked repos + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-artifacts-external + + - name: Upload Package repo 
to S3 + id: upload-packages + run: | + echo "Uploading to s3 bucket" + python ./build_tools/packaging/linux/upload_package_repo.py \ + --pkg-type ${{ inputs.native_package_type }} \ + --s3-bucket therock-deb-rpm-test \ + --amdgpu-family ${{ inputs.artifact_group }} \ + --artifact-id ${{ env.ARTIFACT_RUN_ID }} diff --git a/.github/workflows/build_portable_linux_artifacts.yml b/.github/workflows/build_portable_linux_artifacts.yml new file mode 100644 index 0000000000000..8e170a14f8e83 --- /dev/null +++ b/.github/workflows/build_portable_linux_artifacts.yml @@ -0,0 +1,204 @@ +name: Build Portable Linux Artifacts + +on: + workflow_dispatch: + inputs: + amdgpu_families: + type: string + default: gfx94X-dcgpu + artifact_group: + type: string + default: gfx94X-dcgpu + build_variant_label: + type: string + description: "A label for the build variant (ex: 'release', 'asan')" + default: "release" + build_variant_suffix: + type: string + description: "The build variant suffix (ex: 'asan' suffix -> 'gfx94X-dcgpu-asan')" + default: "" + build_variant_cmake_preset: + type: string + description: "The name of the cmake preset to use for this build variant, matching an entry in CMakePresets.json (ex: 'linux-release-asan')" + default: "" + package_version: + type: string + default: ADHOCBUILD + expect_failure: + type: boolean + default: false + extra_cmake_options: + type: string + + workflow_call: + inputs: + package_version: + type: string + default: ADHOCBUILD + amdgpu_families: + type: string + artifact_group: + type: string + build_variant_label: + type: string + build_variant_suffix: + type: string + build_variant_cmake_preset: + type: string + expect_failure: + type: boolean + extra_cmake_options: + type: string + +# See the details regarding permissions from the link: +# https://github.com/aws-actions/configure-aws-credentials?tab=readme-ov-file#oidc +permissions: + contents: read + +jobs: + build_portable_linux_artifacts: + name: Build (xfail ${{ inputs.expect_failure }}) + # azure-linux-scale-rocm are used for regular CI builds + # azure-linux-scale-rocm-heavy are used for CI builds that require more resources (ex: ASAN builds) + runs-on: ${{ inputs.build_variant_label == 'asan' && 'azure-linux-u2404-hx176-cpu-rocm' || 'azure-linux-scale-rocm' }} + continue-on-error: ${{ inputs.expect_failure }} + timeout-minutes: 720 # 12 hour timeout + permissions: + id-token: write + container: + image: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:583d473f263a289222c48d4b493e2956b2354a45796f09dee6f2c8ecd4504ab6 + options: -v /runner/config:/home/awsconfig/ + env: + AWS_SHARED_CREDENTIALS_FILE: /home/awsconfig/credentials.ini + CACHE_DIR: ${{ github.workspace }}/.container-cache + # The ccache.conf will be written by setup_ccache.py before this gets used. 
+ CCACHE_CONFIGPATH: ${{ github.workspace }}/.ccache/ccache.conf + AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }} + TEATIME_FORCE_INTERACTIVE: 0 + IS_PR_FROM_FORK: ${{ github.event.pull_request.head.repo.fork }} + steps: + - name: Checkout TheRock repository + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + repository: "ROCm/TheRock" + fetch-depth: 10 + + - name: SHA of TheRock + run: | + git config --global --add safe.directory /__w/llvm-project/llvm-project + git log -1 + cd compiler/amd-llvm + git log -3 + cd - + + - name: Install python deps + run: | + pip install -r requirements.txt + + # safe.directory must be set before Runner Health Status + - name: Adjust git config + run: | + git config --global --add safe.directory $PWD + git config fetch.parallel 10 + + # TODO: We shouldn't be using a cache on actual release branches, but it + # really helps for iteration time. + - name: Setup ccache + run: | + ./build_tools/setup_ccache.py \ + --config-preset "github-oss-presubmit" \ + --dir "$(dirname $CCACHE_CONFIGPATH)" \ + --local-path "$CACHE_DIR/ccache" + + - name: Runner health status + run: | + ./build_tools/health_status.py + + - name: Test build_tools + run: | + python -m pytest build_tools/tests build_tools/github_actions/tests + + - name: Fetch sources + timeout-minutes: 30 + run: | + ./build_tools/fetch_sources.py --jobs 12 + + - name: "Checking out repository for llvm-project" + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + path: compiler/amd-llvm + + - name: Apply patches + run: | + cp -v patches/amd-mainline/llvm-project/*.patch compiler/amd-llvm + cd compiler/amd-llvm + git config --global --add safe.directory /__w/llvm-project/llvm-project + find . -type f -name '*.patch' -exec git apply --check {} \; + find . -type f -name '*.patch' -exec git apply {} \; + git log -15 + cd - + + - name: Configure Projects + env: + cmake_preset: ${{ inputs.build_variant_cmake_preset }} + amdgpu_families: ${{ inputs.amdgpu_families }} + package_version: ${{ inputs.package_version }} + extra_cmake_options: ${{ inputs.extra_cmake_options }} + BUILD_DIR: build + run: | + python3 build_tools/github_actions/build_configure.py --manylinux + + - name: Build therock-archives and therock-dist + run: | + cmake --build build --target therock-archives therock-dist -- -k 0 + + - name: Test Packaging + if: ${{ github.event.repository.name == 'TheRock' }} + run: | + ctest --test-dir build --output-on-failure + + - name: Report + if: ${{ !cancelled() }} + shell: bash + run: | + if [ -d "./build" ]; then + echo "Full SDK du:" + echo "------------" + du -h -d 1 build/dist/rocm + echo "Artifact Archives:" + echo "------------------" + ls -lh build/artifacts/*.tar.xz + echo "Artifacts:" + echo "----------" + du -h -d 1 build/artifacts + echo "CCache Stats:" + echo "-------------" + ccache -s -v + tail -v -n +1 .ccache/compiler_check_cache/* > build/logs/ccache_compiler_check_cache.log + else + echo "[ERROR] Build directory ./build does not exist. Skipping report!" + echo " This should only happen if the CI is cancelled before the build step." 
+ exit 1 + fi + + # Analyze ninja build log to generate per-component timing report + - name: Analyze Build Times + if: ${{ !cancelled() }} + run: | + python3 build_tools/analyze_build_times.py --build-dir build + + - name: Configure AWS Credentials for non-forked repos + if: ${{ always() && !github.event.pull_request.head.repo.fork }} + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-ci + + - name: Post Build Upload + if: always() + run: | + python3 build_tools/github_actions/post_build_upload.py \ + --run-id ${{ github.run_id }} \ + --artifact-group "${{ inputs.artifact_group }}" \ + --build-dir build \ + --upload diff --git a/.github/workflows/build_portable_linux_python_packages.yml b/.github/workflows/build_portable_linux_python_packages.yml new file mode 100644 index 0000000000000..07678007ee0a8 --- /dev/null +++ b/.github/workflows/build_portable_linux_python_packages.yml @@ -0,0 +1,94 @@ +name: Build Portable Linux Python Packages + +on: + workflow_dispatch: + inputs: + artifact_github_repo: + description: GitHub repository for artifact_run_id + type: string + default: ROCm/TheRock + artifact_run_id: + description: Workflow run ID to download artifacts from + type: string + default: "17865324892" # TODO: default to the most recent successful run (using a script) + artifact_group: + description: "The artifact group to build (ex: gfx94X-dcgpu, gfx101X-dgpu, gfx1151, gfx120X-all)" + type: string + package_version: + type: string + workflow_call: + inputs: + artifact_github_repo: + type: string + artifact_run_id: + type: string + default: "" + artifact_group: + type: string + package_version: + type: string + +permissions: + contents: read + +run-name: Build portable Linux Python Packages (${{ inputs.artifact_group }}, ${{ inputs.package_version }}) + +jobs: + build: + name: Build Python | ${{ inputs.artifact_group }} + # Note: GitHub-hosted runners run out of disk space for some gpu families + runs-on: ${{ github.repository_owner == 'ROCm' && 'azure-linux-scale-rocm' || 'ubuntu-24.04' }} + env: + BUILD_IMAGE: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:583d473f263a289222c48d4b493e2956b2354a45796f09dee6f2c8ecd4504ab6 + ARTIFACT_RUN_ID: "${{ inputs.artifact_run_id != '' && inputs.artifact_run_id || github.run_id }}" + ARTIFACTS_DIR: "${{ github.workspace }}/artifacts" + PACKAGES_DIR: "${{ github.workspace }}/packages" + MANYLINUX: 1 + + steps: + - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + repository: "ROCm/TheRock" + - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 + with: + python-version: '3.12' + + - name: Install Python requirements + run: pip install boto3 packaging piprepo setuptools + + # Note: we could fetch "all" artifacts if we wanted to include more files + - name: Fetch artifacts + env: + IS_PR_FROM_FORK: ${{ github.event.pull_request.head.repo.fork }} + run: | + python ./build_tools/fetch_artifacts.py \ + --run-github-repo=${{ inputs.artifact_github_repo }} \ + --run-id=${{ env.ARTIFACT_RUN_ID }} \ + --artifact-group=${{ inputs.artifact_group }} \ + --output-dir=${{ env.ARTIFACTS_DIR }} \ + _dev_ _lib_ _run_ + + - name: Build Python packages + run: | + ./build_tools/linux_portable_build.py \ + --image=${{ env.BUILD_IMAGE }} \ + --output-dir=${{ env.PACKAGES_DIR }} \ + --artifact-dir=${{ env.ARTIFACTS_DIR }} \ + --build-python-only \ + -- \ + "--version=${{ 
inputs.package_version }}" + + - name: Inspect Python packages + run: | + ls -la "${{ env.PACKAGES_DIR }}" + + # TODO(#1559): Sanity check (Linux can't find the directories, maybe Docker issues?) + + # - name: Sanity check Python packages + # run: | + # piprepo build "${{ env.PACKAGES_DIR }}/dist" + # pip install rocm[devel]==${{ inputs.package_version }} \ + # --extra-index-url "${{ env.PACKAGES_DIR }}/dist/simple/" + # rocm-sdk test + + # TODO(#1559): upload packages to artifacts S3 bucket and/or a dedicated Python packages bucket diff --git a/.github/workflows/build_portable_linux_pytorch_wheels.yml b/.github/workflows/build_portable_linux_pytorch_wheels.yml new file mode 100644 index 0000000000000..59a811ee6c0f2 --- /dev/null +++ b/.github/workflows/build_portable_linux_pytorch_wheels.yml @@ -0,0 +1,325 @@ +name: Build Portable Linux PyTorch Wheels + +on: + workflow_call: + inputs: + amdgpu_family: + required: true + type: string + python_version: + required: true + type: string + release_type: + description: The type of release to build ("dev", "nightly", or "prerelease"). All developer-triggered jobs should use "dev"! + required: true + type: string + s3_subdir: + description: S3 subdirectory, not including the GPU-family + required: true + type: string + s3_staging_subdir: + description: S3 staging subdirectory, not including the GPU-family + required: true + type: string + cloudfront_url: + description: CloudFront URL pointing to Python index + required: true + type: string + cloudfront_staging_url: + description: CloudFront base URL pointing to staging Python index + required: true + type: string + rocm_version: + description: ROCm version to pip install (e.g. "7.10.0a20251124") + type: string + pytorch_git_ref: + description: PyTorch ref to checkout. (typically "nightly", or "release/X.Y") + required: true + type: string + pytorch_patchset: + description: Patch directory name from where to apply existing patches. + required: true + type: string + repository: + description: "Repository to checkout. Otherwise, defaults to `github.repository`." + type: string + ref: + description: "Branch, tag or SHA to checkout. Defaults to the reference or SHA that triggered the workflow." + type: string + workflow_dispatch: + inputs: + amdgpu_family: + type: choice + options: + - gfx101X-dgpu + - gfx103X-dgpu + - gfx110X-all + - gfx1150 + - gfx1151 + - gfx120X-all + - gfx90X-dcgpu + - gfx94X-dcgpu + - gfx950-dcgpu + default: gfx94X-dcgpu + python_version: + required: true + type: string + default: "3.12" + release_type: + description: The type of release to build ("dev", "nightly", or "prerelease"). All developer-triggered jobs should use "dev"! + type: string + default: "dev" + s3_subdir: + description: S3 subdirectory, not including the GPU-family + type: string + default: "v2" + s3_staging_subdir: + description: S3 staging subdirectory, not including the GPU-family + type: string + default: "v2-staging" + cloudfront_url: + description: CloudFront base URL pointing to Python index + type: string + default: "https://rocm.devreleases.amd.com/v2" + cloudfront_staging_url: + description: CloudFront base URL pointing to staging Python index + type: string + default: "https://rocm.devreleases.amd.com/v2-staging" + rocm_version: + description: ROCm version to pip install (e.g. "7.10.0a20251124") + type: string + pytorch_git_ref: + description: PyTorch ref to checkout. 
(typically "nightly", or "release/X.Y") + required: true + type: string + default: "release/2.7" + pytorch_patchset: + description: Patch directory name from where to apply existing patches. + required: true + type: string + default: "rocm_2.7" + +permissions: + id-token: write + contents: read + +run-name: Build portable Linux PyTorch Wheels (${{ inputs.amdgpu_family }}, ${{ inputs.python_version }}, ${{ inputs.release_type }}) + +jobs: + build_pytorch_wheels: + name: Build | ${{ inputs.amdgpu_family }} | py ${{ inputs.python_version }} | torch ${{ inputs.pytorch_git_ref }} + runs-on: ${{ github.repository_owner == 'ROCm' && 'azure-linux-scale-rocm' || 'ubuntu-24.04' }} + container: + image: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:583d473f263a289222c48d4b493e2956b2354a45796f09dee6f2c8ecd4504ab6 + env: + OUTPUT_DIR: ${{ github.workspace }}/output + PACKAGE_DIST_DIR: ${{ github.workspace }}/output/packages/dist + S3_BUCKET_PY: "therock-${{ inputs.release_type }}-python" + optional_build_prod_arguments: "" + outputs: + cp_version: ${{ env.cp_version }} + # The following are python package versions produced by the build. The + # exact versions will depend on workflow inputs and the underlying code. + # For example: + # Inputs + # rocm_version : 7.10.0a20251120 + # pytorch_git_ref : release/2.9 + # Outputs + # torch_version : 2.9.1+rocm7.10.0a20251120 + # torchaudio_version : 2.9.0+rocm7.10.0a20251120 + # torchvision_version: 0.24.0+rocm7.10.0a20251120 + # triton_version : 3.5.1+rocm7.10.0a20251120 + # Future jobs can use these version outputs to identify newly built + # packages, for example via `pip install torch==${TORCH_VERSION}`. + torch_version: ${{ steps.build-pytorch-wheels.outputs.torch_version }} + torchaudio_version: ${{ steps.build-pytorch-wheels.outputs.torchaudio_version }} + torchvision_version: ${{ steps.build-pytorch-wheels.outputs.torchvision_version }} + triton_version: ${{ steps.build-pytorch-wheels.outputs.triton_version }} + steps: + - name: Checkout + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + repository: ${{ inputs.repository || github.repository }} + ref: ${{ inputs.ref || '' }} + + - name: Configure Git Identity + run: | + git config --global user.name "therockbot" + git config --global user.email "therockbot@amd.com" + + - name: Select Python version + run: | + python build_tools/github_actions/python_to_cp_version.py \ + --python-version ${{ inputs.python_version }} + + - name: Add selected Python version to PATH + run: | + python_dir="/opt/python/${{ env.cp_version }}" + if ! 
[ -x "${python_dir}/bin/python" ]; then + echo "ERROR: Could not find python: ${python_dir}" + exit 1 + fi + echo "${python_dir}/bin" >> "$GITHUB_PATH" + + # Checkout nightly sources from https://github.com/pytorch/pytorch + - name: Checkout PyTorch Source Repos from nightly branch + if: ${{ inputs.pytorch_git_ref == 'nightly' }} + run: | + ./external-builds/pytorch/pytorch_torch_repo.py checkout --repo-hashtag nightly + ./external-builds/pytorch/pytorch_audio_repo.py checkout --repo-hashtag nightly + ./external-builds/pytorch/pytorch_vision_repo.py checkout --repo-hashtag nightly + ./external-builds/pytorch/pytorch_triton_repo.py checkout --patch --patchset nightly + + # Checkout stable sources from https://github.com/ROCm/pytorch + - name: Checkout PyTorch Source Repos from stable branch + if: ${{ inputs.pytorch_git_ref != 'nightly' }} + run: | + ./external-builds/pytorch/pytorch_torch_repo.py checkout --gitrepo-origin https://github.com/ROCm/pytorch.git --repo-hashtag ${{ inputs.pytorch_git_ref }} --patchset ${{ inputs.pytorch_patchset }} + ./external-builds/pytorch/pytorch_audio_repo.py checkout --require-related-commit + ./external-builds/pytorch/pytorch_vision_repo.py checkout --require-related-commit + ./external-builds/pytorch/pytorch_triton_repo.py checkout + + - name: Create pip cache directory + run: mkdir -p /tmp/pipcache + + - name: Determine optional arguments passed to `build_prod_wheels.py` + if: ${{ inputs.rocm_version }} + run: | + pip install packaging + python build_tools/github_actions/determine_version.py \ + --rocm-version ${{ inputs.rocm_version }} + + - name: Build PyTorch Wheels + id: build-pytorch-wheels + run: | + echo "Building PyTorch wheels for ${{ inputs.amdgpu_family }}" + ./external-builds/pytorch/build_prod_wheels.py \ + build \ + --install-rocm \ + --pip-cache-dir /tmp/pipcache \ + --index-url "${{ inputs.cloudfront_url }}/${{ inputs.amdgpu_family }}/" \ + --clean \ + --output-dir ${{ env.PACKAGE_DIST_DIR }} ${{ env.optional_build_prod_arguments }} + python ./build_tools/github_actions/write_torch_versions.py --dist-dir ${{ env.PACKAGE_DIST_DIR }} + + - name: Sanity Check Wheel + run: | + python external-builds/pytorch/sanity_check_wheel.py ${{ env.PACKAGE_DIST_DIR }}/ + + - name: Configure AWS Credentials + if: always() + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-${{ inputs.release_type }}-releases + + - name: Upload wheels to S3 staging + if: ${{ github.repository_owner == 'ROCm' }} + run: | + aws s3 cp ${{ env.PACKAGE_DIST_DIR }}/ s3://${{ env.S3_BUCKET_PY }}/${{ inputs.s3_staging_subdir }}/${{ inputs.amdgpu_family }}/ \ + --recursive --exclude "*" --include "*.whl" + + - name: (Re-)Generate Python package release index for staging + if: ${{ github.repository_owner == 'ROCm' }} + env: + # Environment variables to be set for `manage.py` + CUSTOM_PREFIX: "${{ inputs.s3_staging_subdir }}/${{ inputs.amdgpu_family }}" + run: | + pip install boto3 packaging + python ./build_tools/third_party/s3_management/manage.py ${{ env.CUSTOM_PREFIX }} + + generate_target_to_run: + name: Generate target_to_run + runs-on: ubuntu-24.04 + outputs: + test_runs_on: ${{ steps.configure.outputs.test-runs-on }} + bypass_tests_for_releases: ${{ steps.configure.outputs.bypass_tests_for_releases }} + steps: + - name: Checking out repository + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + repository: ${{ 
inputs.repository || github.repository }} + ref: ${{ inputs.ref || '' }} + + - name: Generating target to run + id: configure + env: + TARGET: ${{ inputs.amdgpu_family }} + PLATFORM: "linux" + # Variable comes from ROCm organization variable 'ROCM_THEROCK_TEST_RUNNERS' + ROCM_THEROCK_TEST_RUNNERS: ${{ vars.ROCM_THEROCK_TEST_RUNNERS }} + LOAD_TEST_RUNNERS_FROM_VAR: false + run: python ./build_tools/github_actions/configure_target_run.py + + test_pytorch_wheels: + name: Test | ${{ inputs.amdgpu_family }} | ${{ needs.generate_target_to_run.outputs.test_runs_on }} + if: ${{ needs.generate_target_to_run.outputs.test_runs_on != '' }} + needs: [build_pytorch_wheels, generate_target_to_run] + uses: ./.github/workflows/test_pytorch_wheels.yml + with: + amdgpu_family: ${{ inputs.amdgpu_family }} + test_runs_on: ${{ needs.generate_target_to_run.outputs.test_runs_on }} + package_index_url: ${{ inputs.cloudfront_staging_url }} + python_version: ${{ inputs.python_version }} + torch_version: ${{ needs.build_pytorch_wheels.outputs.torch_version }} + pytorch_git_ref: ${{ inputs.pytorch_git_ref }} + repository: ${{ inputs.repository || github.repository }} + ref: ${{ inputs.ref || '' }} + + upload_pytorch_wheels: + name: Release PyTorch Wheels to S3 + needs: [build_pytorch_wheels, generate_target_to_run, test_pytorch_wheels] + if: ${{ !cancelled() }} + runs-on: ubuntu-24.04 + env: + S3_BUCKET_PY: "therock-${{ inputs.release_type }}-python" + CP_VERSION: "${{ needs.build_pytorch_wheels.outputs.cp_version }}" + TORCH_VERSION: "${{ needs.build_pytorch_wheels.outputs.torch_version }}" + TORCHAUDIO_VERSION: "${{ needs.build_pytorch_wheels.outputs.torchaudio_version }}" + TORCHVISION_VERSION: "${{ needs.build_pytorch_wheels.outputs.torchvision_version }}" + TRITON_VERSION: "${{ needs.build_pytorch_wheels.outputs.triton_version }}" + + steps: + - name: Checkout + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + repository: ${{ inputs.repository || github.repository }} + ref: ${{ inputs.ref || '' }} + + - name: Configure AWS Credentials + if: always() + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-${{ inputs.release_type }}-releases + + - name: Determine upload flag + env: + BUILD_RESULT: ${{ needs.build_pytorch_wheels.result }} + TEST_RESULT: ${{ needs.test_pytorch_wheels.result }} + TEST_RUNS_ON: ${{ needs.generate_target_to_run.outputs.test_runs_on }} + BYPASS_TESTS_FOR_RELEASES: ${{ needs.generate_target_to_run.outputs.bypass_tests_for_releases }} + run: python ./build_tools/github_actions/promote_wheels_based_on_policy.py + + - name: Copy PyTorch wheels from staging to release S3 + if: ${{ env.upload == 'true' }} + run: | + echo "Copying exact tested wheels to release S3 bucket..." 
+          aws s3 cp \
+            s3://${S3_BUCKET_PY}/${{ inputs.s3_staging_subdir }}/${{ inputs.amdgpu_family }}/ \
+            s3://${S3_BUCKET_PY}/${{ inputs.s3_subdir }}/${{ inputs.amdgpu_family }}/ \
+            --recursive \
+            --exclude "*" \
+            --include "torch-${TORCH_VERSION}-${CP_VERSION}-linux_x86_64.whl" \
+            --include "torchaudio-${TORCHAUDIO_VERSION}-${CP_VERSION}-linux_x86_64.whl" \
+            --include "torchvision-${TORCHVISION_VERSION}-${CP_VERSION}-linux_x86_64.whl" \
+            --include "triton-${TRITON_VERSION}-${CP_VERSION}-linux_x86_64.whl"
+
+      - name: (Re-)Generate Python package release index
+        if: ${{ env.upload == 'true' }}
+        env:
+          # Environment variables to be set for `manage.py`
+          CUSTOM_PREFIX: "${{ inputs.s3_subdir }}/${{ inputs.amdgpu_family }}"
+        run: |
+          pip install boto3 packaging
+          python ./build_tools/third_party/s3_management/manage.py ${{ env.CUSTOM_PREFIX }}
diff --git a/.github/workflows/build_windows_artifacts.yml b/.github/workflows/build_windows_artifacts.yml
new file mode 100644
index 0000000000000..189694257d579
--- /dev/null
+++ b/.github/workflows/build_windows_artifacts.yml
@@ -0,0 +1,229 @@
+name: Build Windows Artifacts
+
+on:
+  workflow_dispatch:
+    inputs:
+      amdgpu_families:
+        type: string
+        default: gfx1151
+      artifact_group:
+        type: string
+        default: gfx1151
+      build_variant_label:
+        type: string
+        description: "A label for the build variant (ex: 'release', 'asan')"
+        default: "release"
+      build_variant_suffix:
+        type: string
+        description: "The build variant suffix (ex: 'asan' suffix -> 'gfx94X-dcgpu-asan')"
+        default: ""
+      build_variant_cmake_preset:
+        type: string
+        description: "The name of the cmake preset to use for this build variant, matching an entry in CMakePresets.json (ex: 'linux-release-asan')"
+        default: ""
+      package_version:
+        type: string
+        default: ADHOCBUILD
+      expect_failure:
+        type: boolean
+      extra_cmake_options:
+        type: string
+
+  workflow_call:
+    inputs:
+      package_version:
+        type: string
+        default: ADHOCBUILD
+      amdgpu_families:
+        type: string
+      artifact_group:
+        type: string
+      build_variant_label:
+        type: string
+      build_variant_suffix:
+        type: string
+      build_variant_cmake_preset:
+        type: string
+      expect_failure:
+        type: boolean
+      extra_cmake_options:
+        type: string
+
+permissions:
+  contents: read
+
+jobs:
+  build_windows_artifacts:
+    name: Build ${{ inputs.build_variant_label }} (xfail ${{ inputs.expect_failure }})
+    runs-on: azure-windows-scale-rocm
+    continue-on-error: ${{ inputs.expect_failure }}
+    timeout-minutes: 720 # 12 hour timeout
+    permissions:
+      id-token: write
+    defaults:
+      run:
+        shell: bash
+    strategy:
+      fail-fast: true
+    env:
+      BUILD_DIR: B:\build
+      CACHE_DIR: "${{github.workspace}}/.cache"
+      CCACHE_DIR: "${{github.workspace}}/.cache/ccache"
+      CCACHE_MAXSIZE: "4000M"
+      TEATIME_FORCE_INTERACTIVE: 0
+      AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }}
+      IS_PR_FROM_FORK: ${{ github.event.pull_request.head.repo.fork }}
+    steps:
+      - name: Checkout TheRock repository
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        with:
+          repository: "ROCm/TheRock"
+          fetch-depth: 10
+
+      - name: SHA of TheRock
+        run: |
+          git rev-parse HEAD
+          git log -1
+
+      - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
+        with:
+          python-version: 3.12
+
+      - name: Install python deps
+        run: |
+          pip install -r requirements.txt
+
+      - name: Install requirements
+        # The first two lines remove the default community feed and use the internal proxy feed
+        run: |
+          choco source disable -n=chocolatey
+          choco source add -n=internal -s
http://10.0.167.96:8081/repository/choco-group/ --priority=1 + choco install --no-progress -y ccache + # ninja pinned due to a bug in the 1.13.0 release: + # https://github.com/ninja-build/ninja/issues/2616 + choco install --no-progress -y ninja --version 1.12.1 + choco install --no-progress -y strawberryperl + echo "$PATH;C:\Strawberry\c\bin" >> $GITHUB_PATH + choco install --no-progress -y awscli + choco install --no-progress -y pkgconfiglite + echo "$PATH;C:\Program Files\Amazon\AWSCLIV2" >> $GITHUB_PATH + + - uses: iterative/setup-dvc@4bdfd2b0f6f1ad7e08afadb03b1a895c352a5239 # v2.0.0 + with: + version: '3.62.0' + + # After other installs, so MSVC gets priority in the PATH. + - name: Configure MSVC + uses: ilammy/msvc-dev-cmd@0b201ec74fa43914dc39ae48a89fd1d8cb592756 # v1.13.0 + + - name: Runner health status + run: | + ccache --zero-stats + python ./build_tools/health_status.py + + - name: Test build_tools + run: | + python -m pytest build_tools/tests build_tools/github_actions/tests + + # TODO: We shouldn't be using a cache on actual release branches, but it + # really helps for iteration time. + - name: Enable cache + uses: actions/cache/restore@0057852bfaa89a56745cba8c7296529d2fc39830 # v4.3.0 + with: + path: ${{ env.CACHE_DIR }} + key: windows-build-packages-v4-${{ inputs.amdgpu_families }}-${{ github.sha }} + restore-keys: | + windows-build-packages-v4-${{ inputs.amdgpu_families }}- + + - name: Fetch sources + timeout-minutes: 30 + run: | + git config fetch.parallel 10 + git config --global core.symlinks true + git config --global core.longpaths true + python ./build_tools/fetch_sources.py --jobs 12 + + - name: "Checking out repository for llvm-project" + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + path: compiler/amd-llvm + + - name: Apply patches + run: | + cp -v patches/amd-mainline/llvm-project/*.patch compiler/amd-llvm + cd compiler/amd-llvm + git config --global --add safe.directory /__w/llvm-project/llvm-project + find . -type f -name '*.patch' -exec git apply --check {} \; + find . -type f -name '*.patch' -exec git apply {} \; + git log -15 + cd - + + - name: Configure Projects + env: + cmake_preset: ${{ inputs.build_variant_cmake_preset }} + amdgpu_families: ${{ inputs.amdgpu_families }} + package_version: ${{ inputs.package_version }} + extra_cmake_options: ${{ inputs.extra_cmake_options }} + run: | + # zero ccache stats before the build (after the cache download) + ccache -z + python3 build_tools/github_actions/build_configure.py + + - name: Build therock-archives and therock-dist + run: cmake --build "${{ env.BUILD_DIR }}" --target therock-archives therock-dist -- -k 0 + + - name: Report + if: ${{ !cancelled() }} + shell: bash + run: | + if [ -d "${{ env.BUILD_DIR }}" ]; then + echo "Build dir:" + echo "------------" + ls -lh "${{ env.BUILD_DIR }}" + echo "Artifact Archives:" + echo "------------------" + ls -lh "${{ env.BUILD_DIR }}"/artifacts/*.tar.xz + echo "Artifacts:" + echo "----------" + du -h -d 1 "${{ env.BUILD_DIR }}"/artifacts + echo "CCache Stats:" + echo "-------------" + ccache -s + else + echo "[ERROR] Build directory ${{ env.BUILD_DIR }} does not exist. Skipping report!" + echo " This should only happen if the CI is cancelled before the build step."
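+ # Fail the step so a missing build directory shows up as a red report step instead of silently passing.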
+ exit 1 + fi + + - name: "Build size report" + if: always() + shell: powershell + run: | + $fs = Get-PSDrive -PSProvider "FileSystem" + $fsout = $fs | Select-Object -Property Name,Used,Free,Root + $fsout | % {$_.Used/=1GB;$_.Free/=1GB;$_} | Write-Host + get-disk | Select-object @{Name="Size(GB)";Expression={$_.Size/1GB}} | Write-Host + + - name: Configure AWS Credentials for non-forked repos + if: ${{ always() && !github.event.pull_request.head.repo.fork }} + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-ci + special-characters-workaround: true + + - name: Post Build Upload + if: always() + run: | + python3 build_tools/github_actions/post_build_upload.py \ + --run-id ${{ github.run_id }} \ + --artifact-group ${{ inputs.artifact_group }} \ + --build-dir ${{ env.BUILD_DIR }} \ + --upload + + - name: Save cache + uses: actions/cache/save@0057852bfaa89a56745cba8c7296529d2fc39830 # v4.3.0 + if: ${{ !cancelled() }} + with: + path: ${{ env.CACHE_DIR }} + key: windows-build-packages-v4-${{ inputs.amdgpu_families }}-${{ github.sha }} diff --git a/.github/workflows/build_windows_python_packages.yml b/.github/workflows/build_windows_python_packages.yml new file mode 100644 index 0000000000000..ea1298411ac1c --- /dev/null +++ b/.github/workflows/build_windows_python_packages.yml @@ -0,0 +1,86 @@ +name: Build Windows Python Packages + +on: + workflow_dispatch: + inputs: + artifact_github_repo: + description: GitHub repository for artifact_run_id + type: string + default: ROCm/TheRock + artifact_run_id: + description: Workflow run ID to download artifacts from + type: string + default: "17865324892" # TODO: default to the most recent successful run (using a script) + artifact_group: + description: "The artifact group to build (ex: gfx94X-dcgpu, gfx101X-dgpu, gfx1151, gfx120X-all)" + type: string + package_version: + type: string + workflow_call: + inputs: + artifact_github_repo: + type: string + artifact_run_id: + type: string + default: "" + artifact_group: + type: string + package_version: + type: string + +permissions: + contents: read + +jobs: + build: + name: Build Python | ${{ inputs.artifact_group }} + runs-on: ${{ github.repository_owner == 'ROCm' && 'azure-windows-scale-rocm' || 'windows-2022' }} + env: + ARTIFACT_RUN_ID: "${{ inputs.artifact_run_id != '' && inputs.artifact_run_id || github.run_id }}" + ARTIFACTS_DIR: "${{ github.workspace }}/artifacts" + PACKAGES_DIR: "${{ github.workspace }}/packages" + defaults: + run: + shell: bash + steps: + - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + repository: "ROCm/TheRock" + - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 + with: + python-version: '3.12' + + - name: Install Python requirements + run: pip install boto3 packaging piprepo setuptools + + # Note: we could fetch "all" artifacts if we wanted to include more files + - name: Fetch artifacts + env: + IS_PR_FROM_FORK: ${{ github.event.pull_request.head.repo.fork }} + run: | + python ./build_tools/fetch_artifacts.py \ + --run-github-repo=${{ inputs.artifact_github_repo }} \ + --run-id=${{ env.ARTIFACT_RUN_ID }} \ + --artifact-group=${{ inputs.artifact_group }} \ + --output-dir="${{ env.ARTIFACTS_DIR }}" \ + _dev_ _lib_ _run_ + + - name: Build Python packages + run: | + python ./build_tools/build_python_packages.py \ + --artifact-dir="${{ env.ARTIFACTS_DIR }}" \ + --dest-dir="${{ 
env.PACKAGES_DIR }}" \ + --version="${{ inputs.package_version }}" + + - name: Inspect Python packages + run: | + ls -la "${{ env.PACKAGES_DIR }}" + + - name: Sanity check Python packages + run: | + piprepo build "${{ env.PACKAGES_DIR }}/dist" + pip install rocm[libraries,devel]==${{ inputs.package_version }} \ + --extra-index-url "${{ env.PACKAGES_DIR }}/dist/simple/" + rocm-sdk test + + # TODO(#1559): upload packages to artifacts S3 bucket and/or a dedicated Python packages bucket diff --git a/.github/workflows/build_windows_pytorch_wheels.yml b/.github/workflows/build_windows_pytorch_wheels.yml new file mode 100644 index 0000000000000..aa1fc5d43a75f --- /dev/null +++ b/.github/workflows/build_windows_pytorch_wheels.yml @@ -0,0 +1,357 @@ +name: Build Windows PyTorch Wheels + +on: + workflow_call: + inputs: + amdgpu_family: + required: true + type: string + python_version: + required: true + type: string + release_type: + description: The type of release to build ("dev", "nightly", or "prerelease"). All developer-triggered jobs should use "dev"! + required: true + type: string + s3_subdir: + description: S3 subdirectory, not including the GPU-family + required: true + type: string + s3_staging_subdir: + description: S3 staging subdirectory, not including the GPU-family + required: true + type: string + cloudfront_url: + description: CloudFront URL pointing to Python index + required: true + type: string + cloudfront_staging_url: + description: CloudFront base URL pointing to staging Python index + required: true + type: string + rocm_version: + description: ROCm version to pip install (e.g. "7.10.0a20251124") + type: string + pytorch_git_ref: + description: PyTorch ref to checkout. (typically "nightly", or "release/X.Y") + required: true + type: string + pytorch_patchset: + description: Patch directory name from where to apply existing patches. + required: true + type: string + repository: + description: "Repository to checkout. Otherwise, defaults to `github.repository`." + type: string + ref: + description: "Branch, tag or SHA to checkout. Defaults to the reference or SHA that triggered the workflow." + type: string + workflow_dispatch: + inputs: + amdgpu_family: + type: choice + options: + - gfx101X-dgpu + - gfx103X-dgpu + - gfx110X-all + - gfx1150 + - gfx1151 + - gfx120X-all + - gfx90X-dcgpu + - gfx94X-dcgpu + - gfx950-dcgpu + default: gfx1151 + python_version: + required: true + type: string + default: "3.12" + release_type: + description: The type of release to build ("dev", "nightly", or "prerelease"). All developer-triggered jobs should use "dev"! + type: string + default: "dev" + s3_subdir: + description: S3 subdirectory, not including the GPU-family + type: string + default: "v2" + s3_staging_subdir: + description: S3 staging subdirectory, not including the GPU-family + type: string + default: "v2-staging" + cloudfront_url: + description: CloudFront base URL pointing to Python index + type: string + default: "https://rocm.devreleases.amd.com/v2" + cloudfront_staging_url: + description: CloudFront base URL pointing to staging Python index + type: string + default: "https://rocm.devreleases.amd.com/v2-staging" + rocm_version: + description: ROCm version to pip install (e.g. "7.10.0a20251124") + type: string + pytorch_git_ref: + description: PyTorch ref to checkout. (typically "nightly", or "release/X.Y") + required: true + type: string + default: "release/2.7" + pytorch_patchset: + description: Patch directory name from where to apply existing patches. 
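+ # (Assumed layout) this name selects a directory of patch files consumed by pytorch_torch_repo.py's --patchset flag during the stable-branch checkout, e.g. "rocm_2.7".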
+ required: true + type: string + default: "rocm_2.7" + +permissions: + id-token: write + contents: read + +jobs: + build_pytorch_wheels: + name: Build | ${{ inputs.amdgpu_family }} | py ${{ inputs.python_version }} | torch ${{ inputs.pytorch_git_ref }} + runs-on: ${{ github.repository_owner == 'ROCm' && 'azure-windows-scale-rocm' || 'windows-2022' }} + env: + CHECKOUT_ROOT: B:/src + OUTPUT_DIR: ${{ github.workspace }}/output + # Note the \ here instead of /. This should be used from 'cmd' not 'bash'! + PACKAGE_DIST_DIR: ${{ github.workspace }}\output\packages\dist + S3_BUCKET_PY: "therock-${{ inputs.release_type }}-python" + optional_build_prod_arguments: "" + outputs: + cp_version: ${{ env.cp_version }} + # The following are python package versions produced by the build. The + # exact versions will depend on workflow inputs and the underlying code. + # For example: + # Inputs + # rocm_version : 7.10.0a20251120 + # pytorch_git_ref : release/2.9 + # Outputs + # torch_version : 2.9.1+rocm7.10.0a20251120 + # torchaudio_version : 2.9.0+rocm7.10.0a20251120 + # torchvision_version: 0.24.0+rocm7.10.0a20251120 + # Future jobs can use these version outputs to identify newly built + # packages, for example via `pip install torch==${TORCH_VERSION}`. + torch_version: ${{ steps.build-pytorch-wheels.outputs.torch_version }} + torchaudio_version: ${{ steps.build-pytorch-wheels.outputs.torchaudio_version }} + torchvision_version: ${{ steps.build-pytorch-wheels.outputs.torchvision_version }} + + defaults: + run: + # Note: there are mixed uses of 'bash' (this default) and 'cmd' below + shell: bash + steps: + - name: Checkout + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + repository: ${{ inputs.repository || github.repository }} + ref: ${{ inputs.ref || '' }} + + - name: Configure Git Identity + run: | + git config --global user.name "therockbot" + git config --global user.email "therockbot@amd.com" + + - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 + with: + python-version: ${{ inputs.python_version }} + + - name: Select Python version + run: | + python build_tools/github_actions/python_to_cp_version.py \ + --python-version ${{ inputs.python_version }} + + # TODO(amd-justchen): share with build_windows_artifacts.yml. Include in VM image? Dockerfile? + - name: Install requirements + run: | + choco install --no-progress -y ninja --version 1.13.1 + choco install --no-progress -y awscli + echo "$PATH;C:\Program Files\Amazon\AWSCLIV2" >> $GITHUB_PATH + + # After other installs, so MSVC gets priority in the PATH. + - name: Configure MSVC + uses: ilammy/msvc-dev-cmd@0b201ec74fa43914dc39ae48a89fd1d8cb592756 # v1.13.0 + + # Checkout nightly sources from https://github.com/pytorch/pytorch + # TODO: switch to 'nightly' to match our Linux workflows?
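+ # For local debugging, the same checkout can be reproduced by hand with the
+ # repo scripts (illustrative invocation; flags mirror the steps below, the
+ # checkout dir is arbitrary):
+ #   python ./external-builds/pytorch/pytorch_torch_repo.py checkout \
+ #     --checkout-dir ~/src/torch --repo-hashtag nightly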
+ - name: Checkout PyTorch source repos (nightly branch) + if: ${{ inputs.pytorch_git_ref == 'nightly' }} + run: | + git config --global core.longpaths true + python ./external-builds/pytorch/pytorch_torch_repo.py checkout \ + --checkout-dir ${{ env.CHECKOUT_ROOT }}/torch \ + --repo-hashtag nightly + python ./external-builds/pytorch/pytorch_audio_repo.py checkout \ + --checkout-dir ${{ env.CHECKOUT_ROOT }}/audio \ + --repo-hashtag nightly + python ./external-builds/pytorch/pytorch_vision_repo.py checkout \ + --checkout-dir ${{ env.CHECKOUT_ROOT }}/vision \ + --repo-hashtag nightly + + # Checkout stable sources from https://github.com/ROCm/pytorch + - name: Checkout PyTorch Source Repos from stable branch + if: ${{ inputs.pytorch_git_ref != 'nightly' }} + run: | + git config --global core.longpaths true + python ./external-builds/pytorch/pytorch_torch_repo.py checkout \ + --checkout-dir ${{ env.CHECKOUT_ROOT }}/torch \ + --gitrepo-origin https://github.com/ROCm/pytorch.git \ + --repo-hashtag ${{ inputs.pytorch_git_ref }} \ + --patchset ${{ inputs.pytorch_patchset }} + python ./external-builds/pytorch/pytorch_audio_repo.py checkout \ + --checkout-dir ${{ env.CHECKOUT_ROOT }}/audio \ + --torch-dir ${{ env.CHECKOUT_ROOT }}/torch \ + --require-related-commit + python ./external-builds/pytorch/pytorch_vision_repo.py checkout \ + --checkout-dir ${{ env.CHECKOUT_ROOT }}/vision \ + --torch-dir ${{ env.CHECKOUT_ROOT }}/torch \ + --require-related-commit + + - name: Determine optional arguments passed to `build_prod_wheels.py` + if: ${{ inputs.rocm_version }} + run: | + pip install packaging + python build_tools/github_actions/determine_version.py \ + --rocm-version ${{ inputs.rocm_version }} + + - name: Build PyTorch Wheels + id: build-pytorch-wheels + # Using 'cmd' here is load bearing! 
There are configuration issues when + # run under 'bash': https://github.com/ROCm/TheRock/issues/827#issuecomment-3025858800 + shell: cmd + run: | + echo "Building PyTorch wheels for ${{ inputs.amdgpu_family }}" + python ./external-builds/pytorch/build_prod_wheels.py ^ + build ^ + --install-rocm ^ + --index-url "${{ inputs.cloudfront_url }}/${{ inputs.amdgpu_family }}/" ^ + --pytorch-dir ${{ env.CHECKOUT_ROOT }}/torch ^ + --pytorch-audio-dir ${{ env.CHECKOUT_ROOT }}/audio ^ + --pytorch-vision-dir ${{ env.CHECKOUT_ROOT }}/vision ^ + --enable-pytorch-flash-attention-windows ^ + --clean ^ + --output-dir ${{ env.PACKAGE_DIST_DIR }} ^ + ${{ env.optional_build_prod_arguments }} + python ./build_tools/github_actions/write_torch_versions.py --dist-dir ${{ env.PACKAGE_DIST_DIR }} + + - name: Sanity Check Wheel + shell: cmd + run: | + python external-builds/pytorch/sanity_check_wheel.py ${{ env.PACKAGE_DIST_DIR }} + + - name: Configure AWS Credentials + if: always() + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-${{ inputs.release_type }}-releases + special-characters-workaround: true + + - name: Upload wheels to S3 staging + if: ${{ github.repository_owner == 'ROCm' }} + # Using 'cmd' here since PACKAGE_DIST_DIR uses \ in paths instead of / + shell: cmd + run: | + aws s3 cp ${{ env.PACKAGE_DIST_DIR }}/ ^ + s3://${{ env.S3_BUCKET_PY }}/${{ inputs.s3_staging_subdir }}/${{ inputs.amdgpu_family }}/ ^ + --recursive --exclude "*" --include "*.whl" + + - name: (Re-)Generate Python package release index for staging + if: ${{ github.repository_owner == 'ROCm' }} + env: + # Environment variables to be set for `manage.py` + CUSTOM_PREFIX: "${{ inputs.s3_staging_subdir }}/${{ inputs.amdgpu_family }}" + shell: cmd + run: | + pip install boto3 packaging + python ./build_tools/third_party/s3_management/manage.py ${{ env.CUSTOM_PREFIX }} + + generate_target_to_run: + name: Generate target_to_run + runs-on: ubuntu-24.04 + outputs: + test_runs_on: ${{ steps.configure.outputs.test-runs-on }} + bypass_tests_for_releases: ${{ steps.configure.outputs.bypass_tests_for_releases }} + steps: + - name: Checking out repository + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + repository: ${{ inputs.repository || github.repository }} + ref: ${{ inputs.ref || '' }} + + - name: Generating target to run + id: configure + env: + TARGET: ${{ inputs.amdgpu_family }} + PLATFORM: "windows" + # Variable comes from ROCm organization variable 'ROCM_THEROCK_TEST_RUNNERS' + ROCM_THEROCK_TEST_RUNNERS: ${{ vars.ROCM_THEROCK_TEST_RUNNERS }} + LOAD_TEST_RUNNERS_FROM_VAR: false + run: python ./build_tools/github_actions/configure_target_run.py + + test_pytorch_wheels: + name: Test | ${{ inputs.amdgpu_family }} | ${{ needs.generate_target_to_run.outputs.test_runs_on }} + if: ${{ needs.generate_target_to_run.outputs.test_runs_on != '' }} + needs: [build_pytorch_wheels, generate_target_to_run] + uses: ./.github/workflows/test_pytorch_wheels.yml + with: + amdgpu_family: ${{ inputs.amdgpu_family }} + test_runs_on: ${{ needs.generate_target_to_run.outputs.test_runs_on }} + package_index_url: ${{ inputs.cloudfront_staging_url }} + python_version: ${{ inputs.python_version }} + torch_version: ${{ needs.build_pytorch_wheels.outputs.torch_version }} + pytorch_git_ref: ${{ inputs.pytorch_git_ref }} + repository: ${{ inputs.repository || github.repository }} + ref: ${{ inputs.ref || '' 
}} + + upload_pytorch_wheels: + name: Release PyTorch Wheels to S3 + needs: [build_pytorch_wheels, generate_target_to_run, test_pytorch_wheels] + if: ${{ !cancelled() }} + runs-on: ubuntu-24.04 + env: + S3_BUCKET_PY: "therock-${{ inputs.release_type }}-python" + CP_VERSION: "${{ needs.build_pytorch_wheels.outputs.cp_version }}" + TORCH_VERSION: "${{ needs.build_pytorch_wheels.outputs.torch_version }}" + TORCHAUDIO_VERSION: "${{ needs.build_pytorch_wheels.outputs.torchaudio_version }}" + TORCHVISION_VERSION: "${{ needs.build_pytorch_wheels.outputs.torchvision_version }}" + steps: + - name: Checkout + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + repository: ${{ inputs.repository || github.repository }} + ref: ${{ inputs.ref || '' }} + + - name: Configure AWS Credentials + if: always() + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-${{ inputs.release_type }}-releases + special-characters-workaround: true + + - name: Determine upload flag + env: + BUILD_RESULT: ${{ needs.build_pytorch_wheels.result }} + TEST_RESULT: ${{ needs.test_pytorch_wheels.result }} + TEST_RUNS_ON: ${{ needs.generate_target_to_run.outputs.test_runs_on }} + BYPASS_TESTS_FOR_RELEASES: ${{ needs.generate_target_to_run.outputs.bypass_tests_for_releases }} + run: python ./build_tools/github_actions/promote_wheels_based_on_policy.py + + - name: Copy PyTorch wheels from staging to release S3 + if: ${{ env.upload == 'true' }} + run: | + echo "Copying exact tested wheels to release S3 bucket..." + aws s3 cp \ + s3://${S3_BUCKET_PY}/${{ inputs.s3_staging_subdir }}/${{ inputs.amdgpu_family }}/ \ + s3://${S3_BUCKET_PY}/${{ inputs.s3_subdir }}/${{ inputs.amdgpu_family }}/ \ + --recursive \ + --exclude "*" \ + --include "torch-${TORCH_VERSION}-${CP_VERSION}-win_amd64.whl" \ + --include "torchaudio-${TORCHAUDIO_VERSION}-${CP_VERSION}-win_amd64.whl" \ + --include "torchvision-${TORCHVISION_VERSION}-${CP_VERSION}-win_amd64.whl" + + - name: (Re-)Generate Python package release index + if: ${{ env.upload == 'true' }} + env: + # Environment variables to be set for `manage.py` + CUSTOM_PREFIX: "${{ inputs.s3_subdir }}/${{ inputs.amdgpu_family }}" + run: | + pip install boto3 packaging + python ./build_tools/third_party/s3_management/manage.py ${{ env.CUSTOM_PREFIX }} diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000000000..d131226a8d3b8 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,138 @@ +# This CI workflow is triggered by: +# - push to main branch +# - pull request +# - workflow dispatch +# +# For pull requests, we run default builds and tests for: +# - Linux: gfx94X gfx110X +# - Windows: gfx110X +# If you want to trigger jobs for additional targets, please add a defined label (ex: gfx120X-linux) to the pull request +# +# For pushes to the main branch, all AMD families will be built and tested from `amdgpu_family_matrix.py`. +# +# Note: If a test machine is not available for a specific AMD GPU family in `amdgpu_family_matrix.py`, tests will be skipped. + +name: CI + +on: + push: + branches: + - main + workflow_dispatch: + inputs: + linux_amdgpu_families: + type: string + description: "Insert comma-separated list of Linux GPU families to build and test.
ex: gfx94X, gfx1201X" + default: "" + linux_test_labels: + type: string + description: "If enabled, reduce test set on Linux to the list of labels prefixed with 'test:'. ex: test:rocprim, test:hipcub" + default: "" + linux_use_prebuilt_artifacts: + type: boolean + description: "If enabled, the CI will pull Linux artifacts using artifact_run_id and only run tests" + windows_amdgpu_families: + type: string + description: "Insert comma-separated list of Windows GPU families to build and test. ex: gfx94X, gfx1201X" + default: "" + windows_test_labels: + type: string + description: "If enabled, reduce test set on Windows to the list of labels prefixed with 'test:' ex: test:rocprim, test:hipcub" + default: "" + windows_use_prebuilt_artifacts: + type: boolean + description: "If enabled, the CI will pull Windows artifacts using artifact_run_id and only run tests" + artifact_run_id: + type: string + description: "If provided, the tests will run on this artifact ID" + default: "" + pull_request: + types: + - labeled + - opened + - synchronize + +permissions: + contents: read + +concurrency: + # A PR number if a pull request and otherwise the commit hash. This cancels + # queued and in-progress runs for the same PR (presubmit) or commit + # (postsubmit). The workflow name is prepended to avoid conflicts between + # different workflows. + group: ${{ github.workflow }}-${{ github.event.number || github.sha }} + cancel-in-progress: true + +jobs: + setup: + uses: ./.github/workflows/setup.yml + with: + build_variant: "release" + + linux_build_and_test: + name: Linux::${{ matrix.variant.family }}::${{ matrix.variant.build_variant_label }} + needs: setup + if: >- + ${{ + needs.setup.outputs.linux_variants != '[]' && + needs.setup.outputs.enable_build_jobs == 'true' + }} + strategy: + fail-fast: false + matrix: + variant: ${{ fromJSON(needs.setup.outputs.linux_variants) }} + uses: ./.github/workflows/ci_linux.yml + secrets: inherit + with: + amdgpu_families: ${{ matrix.variant.family }} + artifact_group: ${{ matrix.variant.artifact_group }} + test_runs_on: ${{ matrix.variant.test-runs-on }} + build_variant_label: ${{ matrix.variant.build_variant_label }} + build_variant_suffix: ${{ matrix.variant.build_variant_suffix }} + build_variant_cmake_preset: ${{ matrix.variant.build_variant_cmake_preset }} + test_labels: ${{ needs.setup.outputs.linux_test_labels }} + artifact_run_id: ${{ inputs.artifact_run_id }} + expect_failure: ${{ matrix.variant.expect_failure == true }} + use_prebuilt_artifacts: ${{ inputs.linux_use_prebuilt_artifacts == true && 'true' || 'false' }} + rocm_package_version: ${{ needs.setup.outputs.rocm_package_version }} + test_type: ${{ needs.setup.outputs.test_type }} + sanity_check_only_for_family: ${{ matrix.variant.sanity_check_only_for_family == true }} + permissions: + contents: read + id-token: write + + # build_python_packages: + # name: Build Python Packages + # uses: ./.github/workflows/build_python_packages.yml + + ci_summary: + name: CI Summary + if: always() + needs: + - setup + - linux_build_and_test + runs-on: ubuntu-24.04 + steps: + - name: Output failed jobs + run: | + echo '${{ toJson(needs) }}' + + # Build a list of failed jobs, but ignore those marked continue-on-error + FAILED_JOBS="$(echo '${{ toJson(needs) }}' \ + | jq --raw-output ' + to_entries + | map(select( + (.value.result != "success" and .value.result != "skipped") + and (.value.outputs.continue_on_error | not) + )) + | map(.key) + | join(",") + ' \ + )" + + if [[ -n "${FAILED_JOBS}" ]]; then + echo "The following 
jobs failed: ${FAILED_JOBS}" + exit 1 + else + echo "All required jobs succeeded (continue-on-error jobs ignored)." + fi diff --git a/.github/workflows/ci_asan.yml b/.github/workflows/ci_asan.yml new file mode 100644 index 0000000000000..4da6ce0b14d11 --- /dev/null +++ b/.github/workflows/ci_asan.yml @@ -0,0 +1,67 @@ +name: CI ASAN + +on: + schedule: + - cron: "0 2 * * *" # Runs nightly at 2 AM UTC + workflow_dispatch: + inputs: + linux_amdgpu_families: + type: string + description: "Insert comma-separated list of Linux GPU families to build and test. ex: gfx94X, gfx1201X" + default: "" + linux_use_prebuilt_artifacts: + type: boolean + description: "If enabled, the CI will pull Linux artifacts using artifact_run_id and only run tests" + artifact_run_id: + type: string + description: "If provided, the tests will run on this artifact ID" + default: "" + +permissions: + contents: read + +concurrency: + # A PR number if a pull request and otherwise the commit hash. This cancels + # queued and in-progress runs for the same PR (presubmit) or commit + # (postsubmit). The workflow name is prepended to avoid conflicts between + # different workflows. + group: ${{ github.workflow }}-${{ github.event.number || github.sha }} + cancel-in-progress: true + +jobs: + setup: + uses: ./.github/workflows/setup.yml + with: + build_variant: "asan" + + linux_build_and_test: + name: Linux::${{ matrix.variant.family }}::${{ matrix.variant.build_variant_label }} + needs: setup + if: >- + ${{ + needs.setup.outputs.linux_variants != '[]' && + needs.setup.outputs.enable_build_jobs == 'true' + }} + strategy: + fail-fast: false + matrix: + variant: ${{ fromJSON(needs.setup.outputs.linux_variants) }} + uses: ./.github/workflows/ci_linux.yml + secrets: inherit + with: + amdgpu_families: ${{ matrix.variant.family }} + artifact_group: ${{ matrix.variant.artifact_group }} + test_runs_on: ${{ matrix.variant.test-runs-on }} + build_variant_label: ${{ matrix.variant.build_variant_label }} + build_variant_suffix: ${{ matrix.variant.build_variant_suffix }} + build_variant_cmake_preset: ${{ matrix.variant.build_variant_cmake_preset }} + test_labels: ${{ needs.setup.outputs.linux_test_labels }} + artifact_run_id: ${{ inputs.artifact_run_id }} + expect_failure: ${{ matrix.variant.expect_failure == true }} + use_prebuilt_artifacts: ${{ inputs.linux_use_prebuilt_artifacts == true && 'true' || 'false' }} + rocm_package_version: ${{ needs.setup.outputs.rocm_package_version }} + test_type: ${{ needs.setup.outputs.test_type }} + sanity_check_only_for_family: ${{ matrix.variant.sanity_check_only_for_family == true }} + permissions: + contents: read + id-token: write diff --git a/.github/workflows/ci_linux.yml b/.github/workflows/ci_linux.yml new file mode 100644 index 0000000000000..e9522b323870d --- /dev/null +++ b/.github/workflows/ci_linux.yml @@ -0,0 +1,108 @@ +name: CI - Linux + +on: + workflow_call: + inputs: + artifact_group: + type: string + amdgpu_families: + type: string + build_variant_label: + type: string + build_variant_cmake_preset: + type: string + build_variant_suffix: + type: string + test_labels: + type: string + artifact_run_id: + type: string + test_runs_on: + type: string + expect_failure: + type: boolean + use_prebuilt_artifacts: + type: string + rocm_package_version: + type: string + test_type: + type: string + sanity_check_only_for_family: + type: boolean + +permissions: + contents: read + +jobs: + build_portable_linux_artifacts: + name: Build Artifacts + if: ${{ inputs.use_prebuilt_artifacts == 'false' }} + uses: 
./.github/workflows/build_portable_linux_artifacts.yml + secrets: inherit + with: + artifact_group: ${{ inputs.artifact_group }} + package_version: ${{ inputs.rocm_package_version }} + amdgpu_families: ${{ inputs.amdgpu_families }} + build_variant_label: ${{ inputs.build_variant_label }} + build_variant_cmake_preset: ${{ inputs.build_variant_cmake_preset }} + build_variant_suffix: ${{ inputs.build_variant_suffix }} + expect_failure: ${{ inputs.expect_failure }} + permissions: + contents: read + id-token: write + + # TODO: rework "artifact_run_id" and "use_prebuilt_artifacts" here? + # I don't want to copy/paste this condition and special-case plumbing + # through multiple workflows. All that the packaging and testing workflows need + # to know is which artifact run id to use. That could be the current + # (implicit) run id, or it could be an explicit run id. + # How about having the "build artifacts" job run as a passthrough? + + test_linux_artifacts: + needs: [build_portable_linux_artifacts] + name: Test Artifacts + # If the dependent job failed or was cancelled, this job will not be run + # The use_prebuilt_artifacts "or" statement ensures that tests will run if the + # previous build step is run or skipped. + # If we are expecting a build failure, do not run tests to save machine capacity + if: >- + ${{ + !failure() && + !cancelled() && + ( + inputs.use_prebuilt_artifacts == 'false' || + inputs.use_prebuilt_artifacts == 'true' + ) && + inputs.expect_failure == false + }} + uses: ./.github/workflows/test_artifacts.yml + with: + artifact_group: ${{ inputs.artifact_group }} + amdgpu_families: ${{ inputs.amdgpu_families }} + test_runs_on: ${{ inputs.test_runs_on }} + artifact_run_id: ${{ inputs.artifact_run_id }} + test_type: ${{ inputs.test_type }} + test_labels: ${{ inputs.test_labels }} + sanity_check_only_for_family: ${{ inputs.sanity_check_only_for_family == true }} + + build_portable_linux_python_packages: + needs: [build_portable_linux_artifacts] + name: Build Python + # If the dependent job failed or was cancelled, this job will not be run + # The use_prebuilt_artifacts "or" statement ensures that this job will run if the + # previous build step is run or skipped. + if: >- + ${{ + !failure() && + !cancelled() && + ( + inputs.use_prebuilt_artifacts == 'false' || + inputs.use_prebuilt_artifacts == 'true' + ) && + inputs.expect_failure == false + }} + uses: ./.github/workflows/build_portable_linux_python_packages.yml + with: + artifact_run_id: "${{ inputs.artifact_run_id != '' && inputs.artifact_run_id || github.run_id }}" + artifact_group: ${{ inputs.artifact_group }} + package_version: ${{ inputs.rocm_package_version }} diff --git a/.github/workflows/ci_nightly.yml b/.github/workflows/ci_nightly.yml new file mode 100644 index 0000000000000..e15f5e887a077 --- /dev/null +++ b/.github/workflows/ci_nightly.yml @@ -0,0 +1,124 @@ +# This CI workflow is triggered by: +# - scheduled run +# +# In the scheduled run, we run all targets from amdgpu_family_matrix.py and amdgpu_family_matrix_xfail.py +# As some of these builds are xfail, we allow errors to occur with `continue-on-error`, where the job will fail but the workflow stays green + +name: CI Nightly + +on: + # For AMD GPU families that expect_failure, we run builds and tests from this scheduled trigger + schedule: + - cron: "0 2 * * *" # Runs nightly at 2 AM UTC + workflow_dispatch: + inputs: + linux_amdgpu_families: + type: string + description: "Insert comma-separated list of Linux GPU families to build and test.
ex: gfx94X, gfx1201X" + default: "" + linux_test_labels: + type: string + description: "If enabled, reduce test set on Linux to the list of labels prefixed with 'test:'" + default: "" + linux_use_prebuilt_artifacts: + type: boolean + description: "If enabled, the CI will pull Linux artifacts using artifact_run_id and only run tests" + windows_amdgpu_families: + type: string + description: "Insert comma-separated list of Windows GPU families to build and test. ex: gfx94X, gfx1201X" + default: "" + windows_test_labels: + type: string + description: "If enabled, reduce test set on Windows to the list of labels prefixed with 'test:'" + default: "" + windows_use_prebuilt_artifacts: + type: boolean + description: "If enabled, the CI will pull Windows artifacts using artifact_run_id and only run tests" + artifact_run_id: + type: string + description: "If provided, the tests will run on this artifact ID" + default: "" + +permissions: + contents: read + +concurrency: + # A PR number if a pull request and otherwise the commit hash. This cancels + # queued and in-progress runs for the same PR (presubmit) or commit + # (postsubmit). The workflow name is prepended to avoid conflicts between + # different workflows. + group: ${{ github.workflow }}-${{ github.event.number || github.sha }} + cancel-in-progress: true + +jobs: + setup: + uses: ./.github/workflows/setup.yml + with: + build_variant: "release" + + linux_build_and_test: + name: Linux::${{ matrix.variant.family }}::${{ matrix.variant.build_variant_label }} + needs: setup + if: >- + ${{ + needs.setup.outputs.linux_variants != '[]' && + needs.setup.outputs.enable_build_jobs == 'true' + }} + strategy: + fail-fast: false + matrix: + variant: ${{ fromJSON(needs.setup.outputs.linux_variants) }} + uses: ./.github/workflows/ci_linux.yml + secrets: inherit + with: + amdgpu_families: ${{ matrix.variant.family }} + artifact_group: ${{ matrix.variant.artifact_group }} + test_runs_on: ${{ matrix.variant.test-runs-on }} + build_variant_label: ${{ matrix.variant.build_variant_label }} + build_variant_suffix: ${{ matrix.variant.build_variant_suffix }} + build_variant_cmake_preset: ${{ matrix.variant.build_variant_cmake_preset }} + test_labels: ${{ needs.setup.outputs.linux_test_labels }} + artifact_run_id: ${{ inputs.artifact_run_id }} + expect_failure: ${{ matrix.variant.expect_failure == true }} + use_prebuilt_artifacts: ${{ inputs.linux_use_prebuilt_artifacts == true && 'true' || 'false' }} + rocm_package_version: ${{ needs.setup.outputs.rocm_package_version }} + test_type: ${{ needs.setup.outputs.test_type }} + sanity_check_only_for_family: ${{ matrix.variant.sanity_check_only_for_family == true }} + permissions: + contents: read + id-token: write + + windows_build_and_test: + name: Windows::${{ matrix.variant.family }}::${{ matrix.variant.build_variant_label }} + needs: setup + if: >- + ${{ + needs.setup.outputs.windows_variants != '[]' && + needs.setup.outputs.enable_build_jobs == 'true' + }} + strategy: + fail-fast: false + matrix: + variant: ${{ fromJSON(needs.setup.outputs.windows_variants) }} + uses: ./.github/workflows/ci_windows.yml + with: + amdgpu_families: ${{ matrix.variant.family }} + artifact_group: ${{ matrix.variant.artifact_group }} + test_runs_on: ${{ matrix.variant.test-runs-on }} + build_variant_label: ${{ matrix.variant.build_variant_label }} + build_variant_suffix: ${{ matrix.variant.build_variant_suffix }} + build_variant_cmake_preset: ${{ matrix.variant.build_variant_cmake_preset }} + test_labels: ${{ 
needs.setup.outputs.windows_test_labels }} + artifact_run_id: ${{ inputs.artifact_run_id }} + expect_failure: ${{ matrix.variant.expect_failure == true }} + use_prebuilt_artifacts: ${{ inputs.windows_use_prebuilt_artifacts == true && 'true' || 'false' }} + rocm_package_version: ${{ needs.setup.outputs.rocm_package_version }} + test_type: ${{ needs.setup.outputs.test_type }} + sanity_check_only_for_family: ${{ matrix.variant.sanity_check_only_for_family == true }} + permissions: + contents: read + id-token: write + + # build_python_packages: + # name: Build Python Packages + # uses: ./.github/workflows/build_python_packages.yml diff --git a/.github/workflows/ci_weekly.yml b/.github/workflows/ci_weekly.yml new file mode 100644 index 0000000000000..9570a74f3f7e1 --- /dev/null +++ b/.github/workflows/ci_weekly.yml @@ -0,0 +1,14 @@ +name: WIP Placeholder CI Weekly + +on: + # For AMD GPU families that expect_failure, we run builds and tests from this scheduled trigger + # schedule: + # - cron: "0 3 * * 0" # Runs weekly at 3 AM UTC Sundays + workflow_dispatch: + + +jobs: + donothing: + runs-on: ubuntu-latest + steps: + - run: echo "Skipped" diff --git a/.github/workflows/ci_windows.yml b/.github/workflows/ci_windows.yml new file mode 100644 index 0000000000000..536463a2c4e43 --- /dev/null +++ b/.github/workflows/ci_windows.yml @@ -0,0 +1,108 @@ +name: CI - Windows + +on: + workflow_call: + inputs: + artifact_group: + type: string + amdgpu_families: + type: string + build_variant_label: + type: string + build_variant_cmake_preset: + type: string + build_variant_suffix: + type: string + test_labels: + type: string + artifact_run_id: + type: string + test_runs_on: + type: string + expect_failure: + type: boolean + use_prebuilt_artifacts: + type: string + rocm_package_version: + type: string + test_type: + type: string + sanity_check_only_for_family: + type: boolean + +permissions: + contents: read + +jobs: + build_windows_artifacts: + name: Build Artifacts + if: ${{ inputs.use_prebuilt_artifacts == 'false' }} + uses: ./.github/workflows/build_windows_artifacts.yml + secrets: inherit + with: + artifact_group: ${{ inputs.artifact_group }} + amdgpu_families: ${{ inputs.amdgpu_families }} + build_variant_label: ${{ inputs.build_variant_label }} + build_variant_cmake_preset: ${{ inputs.build_variant_cmake_preset }} + build_variant_suffix: ${{ inputs.build_variant_suffix }} + package_version: ${{ inputs.rocm_package_version }} + expect_failure: ${{ inputs.expect_failure }} + permissions: + contents: read + id-token: write + + # TODO: rework "artifact_run_id" and "use_prebuilt_artifacts" here? + # I don't want to copy/paste this condition and special-case plumbing + # through multiple workflows. All that the packaging and testing workflows need + # to know is which artifact run id to use. That could be the current + # (implicit) run id, or it could be an explicit run id. + # How about having the "build artifacts" job run as a passthrough? + + test_windows_artifacts: + needs: [build_windows_artifacts] + name: Test Artifacts + # If the dependent job failed or was cancelled, this job will not be run + # The use_prebuilt_artifacts "or" statement ensures that tests will run if the + # previous build step is run or skipped.
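+ # Illustrative evaluation (assumed GitHub Actions semantics): with
+ # use_prebuilt_artifacts == 'true' the build job above is skipped, !failure() &&
+ # !cancelled() still holds, and the 'or' below keeps the condition true, so tests
+ # run against the prebuilt artifact_run_id.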
+ # If we are expecting a build failure, do not run tests to save machine capacity + if: >- + ${{ + !failure() && + !cancelled() && + ( + inputs.use_prebuilt_artifacts == 'false' || + inputs.use_prebuilt_artifacts == 'true' + ) && + inputs.expect_failure == false + }} + uses: ./.github/workflows/test_artifacts.yml + with: + artifact_group: ${{ inputs.artifact_group }} + amdgpu_families: ${{ inputs.amdgpu_families }} + test_runs_on: ${{ inputs.test_runs_on }} + artifact_run_id: ${{ inputs.artifact_run_id }} + test_type: ${{ inputs.test_type }} + test_labels: ${{ inputs.test_labels }} + sanity_check_only_for_family: ${{ inputs.sanity_check_only_for_family == true }} + + build_windows_python_packages: + needs: [build_windows_artifacts] + name: Build Python + # If the dependent job failed or was cancelled, this job will not be run + # The use_prebuilt_artifacts "or" statement ensures that this job will run if the + # previous build step is run or skipped. + if: >- + ${{ + !failure() && + !cancelled() && + ( + inputs.use_prebuilt_artifacts == 'false' || + inputs.use_prebuilt_artifacts == 'true' + ) && + inputs.expect_failure == false + }} + uses: ./.github/workflows/build_windows_python_packages.yml + with: + artifact_run_id: "${{ inputs.artifact_run_id != '' && inputs.artifact_run_id || github.run_id }}" + artifact_group: ${{ inputs.artifact_group }} + package_version: ${{ inputs.rocm_package_version }} diff --git a/.github/workflows/copy_release.yml b/.github/workflows/copy_release.yml new file mode 100644 index 0000000000000..fd4a49dbe4993 --- /dev/null +++ b/.github/workflows/copy_release.yml @@ -0,0 +1,101 @@ +name: Copy release to dev bucket + +on: + workflow_dispatch: + inputs: + rocm_version: + description: ROCm version to copy, e.g. 7.0.0rc20250912 + type: string + amdgpu_family: + type: choice + options: + - gfx101X-dgpu + - gfx103X-dgpu + - gfx110X-all + - gfx1150 + - gfx1151 + - gfx120X-all + - gfx90X-dcgpu + - gfx94X-dcgpu + - gfx950-dcgpu + default: gfx94X-dcgpu + python_version: + type: choice + options: + - 3.11 + - 3.12 + - 3.13 + default: 3.12 + include_torch: + type: boolean + default: false + sourcesubdir: + type: choice + options: + - v2 + - v2-staging + destsubdir: + type: string + default: v2 + sourcebucket: + type: choice + options: + - nightly + - dev + default: nightly + destbucket: + type: choice + options: + - dev + - nightly + default: dev +permissions: + contents: read + +jobs: + copy_python_packages: + name: Copy ${{ inputs.sourcebucket }} ${{ inputs.sourcesubdir }} -> ${{ inputs.destbucket }} ${{ inputs.destsubdir }} | ${{ inputs.amdgpu_family }} | rocm ${{ inputs.rocm_version }} | py ${{ inputs.python_version }} + runs-on: ubuntu-24.04 + permissions: + id-token: write + + steps: + - name: Checkout Repository + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + + - name: Install the AWS tool + run: ./dockerfiles/install_awscli.sh + + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-${{ inputs.destbucket }}-releases + + - name: Select Python version + run: | + python build_tools/github_actions/python_to_cp_version.py \ + --python-version ${{ inputs.python_version }} + + - name: Copy ROCm packages between S3 buckets + run: | + aws s3 cp \ + s3://therock-${{ inputs.sourcebucket }}-python/${{ inputs.sourcesubdir }}/${{ inputs.amdgpu_family }}/ \ + s3://therock-${{
inputs.destbucket }}-python/${{ inputs.destsubdir }}/${{ inputs.amdgpu_family }}/ \ + --recursive --exclude "*" --include "rocm*${{ inputs.rocm_version }}*" + + - name: Copy torch wheels between S3 buckets + if: ${{ inputs.include_torch }} + run: | + aws s3 cp \ + s3://therock-${{ inputs.sourcebucket }}-python/${{ inputs.sourcesubdir }}/${{ inputs.amdgpu_family }}/ \ + s3://therock-${{ inputs.destbucket }}-python/${{ inputs.destsubdir }}/${{ inputs.amdgpu_family }}/ \ + --recursive --exclude "*" --include "*torch*${{ inputs.rocm_version }}*${{ env.cp_version }}*" + + - name: (Re-)Generate Python package release index + env: + S3_BUCKET_PY: "therock-${{ inputs.destbucket }}-python" + CUSTOM_PREFIX: "${{ inputs.destsubdir }}/${{ inputs.amdgpu_family }}" + run: | + pip install boto3 packaging + python ./build_tools/third_party/s3_management/manage.py ${CUSTOM_PREFIX} diff --git a/.github/workflows/multi_arch_build_portable_linux.yml b/.github/workflows/multi_arch_build_portable_linux.yml new file mode 100644 index 0000000000000..acffe43062f43 --- /dev/null +++ b/.github/workflows/multi_arch_build_portable_linux.yml @@ -0,0 +1,785 @@ +# Multi-Arch Build - Sharded Pipeline for Linux +# +# This workflow builds TheRock in stages: +# 1. foundation (generic) - sysdeps, base +# 2. compiler-runtime (generic) - compiler, runtimes, profiler-core +# 3. math-libs (per-arch) - BLAS, FFT, etc. +# 4. comm-libs (per-arch) - RCCL (parallel to math-libs) +# 5. dctools-core (generic) - RDC (parallel to math-libs) +# 6. profiler-apps (generic) - rocprofiler-systems (parallel to math-libs) +# 7. media (generic) - sysdeps-amd-mesa, rocdecode (todo), rocjpeg (todo) +# +# Artifacts flow between stages via S3 using the artifact_manager.py tool. + +name: Multi-Arch Build (Linux) + +on: + workflow_call: + inputs: + artifact_group: + type: string + matrix_per_family_json: + type: string + description: "JSON array of {amdgpu_family, test-runs-on} objects for per-arch stages" + dist_amdgpu_families: + type: string + description: "Semicolon-separated list of all GPU families for dist targets" + build_variant_label: + type: string + build_variant_cmake_preset: + type: string + build_variant_suffix: + type: string + test_labels: + type: string + artifact_run_id: + type: string + expect_failure: + type: boolean + use_prebuilt_artifacts: + type: string + rocm_package_version: + type: string + test_type: + type: string + +permissions: + contents: read + +env: + CONTAINER_IMAGE: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:583d473f263a289222c48d4b493e2956b2354a45796f09dee6f2c8ecd4504ab6 + CCACHE_CONFIGPATH: ${{ github.workspace }}/.ccache/ccache.conf + CACHE_DIR: ${{ github.workspace }}/.container-cache + TEATIME_FORCE_INTERACTIVE: 0 + +jobs: + # ========================================================================== + # STAGE: foundation (generic) + # ========================================================================== + foundation: + name: Stage - Foundation + # Always run all stages + runs-on: azure-linux-scale-rocm + timeout-minutes: 180 # 3 hours + permissions: + id-token: write + container: + image: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:583d473f263a289222c48d4b493e2956b2354a45796f09dee6f2c8ecd4504ab6 + options: -v /runner/config:/home/awsconfig/ + env: + AWS_SHARED_CREDENTIALS_FILE: /home/awsconfig/credentials.ini + STAGE_NAME: foundation + steps: + - name: Checkout Repository + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + + - name: Install python deps + run: pip 
install -r requirements.txt + + - name: Adjust git config + run: | + git config --global --add safe.directory $PWD + git config fetch.parallel 10 + + - name: Setup ccache + run: | + ./build_tools/setup_ccache.py \ + --config-preset "github-oss-presubmit" \ + --dir "$(dirname $CCACHE_CONFIGPATH)" \ + --local-path "$CACHE_DIR/ccache" + + - name: Runner health status + run: | + ./build_tools/health_status.py + + - name: Fetch sources + timeout-minutes: 30 + run: ./build_tools/fetch_sources.py --stage ${STAGE_NAME} --jobs 12 --depth 1 + + - name: Get stage configuration + id: stage_config + run: | + python build_tools/configure_stage.py \ + --stage ${STAGE_NAME} \ + --dist-amdgpu-families "${{ inputs.dist_amdgpu_families }}" \ + --gha-output + + - name: Install stage python deps + if: ${{ steps.stage_config.outputs.pip_install_cmd }} + run: pip install ${{ steps.stage_config.outputs.pip_install_cmd }} + + - name: Configure + run: | + cmake -B build -S . -GNinja \ + -DTHEROCK_PACKAGE_VERSION=${{ inputs.rocm_package_version }} \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + ${{ steps.stage_config.outputs.cmake_args }} + + - name: Build stage + run: | + cmake --build build --target stage-${STAGE_NAME} therock-artifacts -- -k 0 + + - name: Report + if: ${{ !cancelled() }} + run: | + echo "CCache Stats:" + ccache -s -v + echo "Artifacts:" + ls -lh build/artifacts/*.tar.xz 2>/dev/null || echo "No artifacts found" + + - name: Configure AWS Credentials + if: ${{ always() && !github.event.pull_request.head.repo.fork }} + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-ci + + - name: Push stage artifacts + if: ${{ !github.event.pull_request.head.repo.fork }} + run: | + python build_tools/artifact_manager.py push --run-id ${{ github.run_id }} \ + --stage ${STAGE_NAME} \ + --build-dir build + + # ========================================================================== + # STAGE: compiler-runtime (generic) + # ========================================================================== + compiler-runtime: + name: Stage - Compiler Runtime + needs: foundation + runs-on: azure-linux-scale-rocm + timeout-minutes: 480 # 8 hours (compiler is big) + permissions: + id-token: write + container: + image: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:583d473f263a289222c48d4b493e2956b2354a45796f09dee6f2c8ecd4504ab6 + options: -v /runner/config:/home/awsconfig/ + env: + AWS_SHARED_CREDENTIALS_FILE: /home/awsconfig/credentials.ini + STAGE_NAME: compiler-runtime + steps: + - name: Checkout Repository + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + + - name: Install python deps + run: pip install -r requirements.txt + + - name: Adjust git config + run: | + git config --global --add safe.directory $PWD + git config fetch.parallel 10 + + - name: Setup ccache + run: | + ./build_tools/setup_ccache.py \ + --config-preset "github-oss-presubmit" \ + --dir "$(dirname $CCACHE_CONFIGPATH)" \ + --local-path "$CACHE_DIR/ccache" + + - name: Runner health status + run: | + ./build_tools/health_status.py + + - name: Configure AWS Credentials + if: ${{ !github.event.pull_request.head.repo.fork }} + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-ci + + - name: Fetch inbound artifacts + if: ${{ 
!github.event.pull_request.head.repo.fork }} + run: | + python build_tools/artifact_manager.py fetch --run-id ${{ github.run_id }} \ + --stage ${STAGE_NAME} \ + --output-dir build \ + --bootstrap + + - name: Fetch sources + timeout-minutes: 30 + run: ./build_tools/fetch_sources.py --stage ${STAGE_NAME} --jobs 12 --depth 1 + + - name: Get stage configuration + id: stage_config + run: | + python build_tools/configure_stage.py \ + --stage ${STAGE_NAME} \ + --dist-amdgpu-families "${{ inputs.dist_amdgpu_families }}" \ + --gha-output + + - name: Install stage python deps + if: ${{ steps.stage_config.outputs.pip_install_cmd }} + run: pip install ${{ steps.stage_config.outputs.pip_install_cmd }} + + - name: Configure + run: | + cmake -B build -S . -GNinja \ + -DTHEROCK_PACKAGE_VERSION=${{ inputs.rocm_package_version }} \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + ${{ steps.stage_config.outputs.cmake_args }} + + - name: Build stage + run: | + cmake --build build --target stage-${STAGE_NAME} therock-artifacts -- -k 0 + + - name: Report + if: ${{ !cancelled() }} + run: | + echo "CCache Stats:" + ccache -s -v + echo "Artifacts:" + ls -lh build/artifacts/*.tar.xz 2>/dev/null || echo "No artifacts found" + + - name: Configure AWS Credentials (refresh for push) + if: ${{ !github.event.pull_request.head.repo.fork }} + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-ci + + - name: Push stage artifacts + if: ${{ !github.event.pull_request.head.repo.fork }} + run: | + python build_tools/artifact_manager.py push --run-id ${{ github.run_id }} \ + --stage ${STAGE_NAME} \ + --build-dir build + + # ========================================================================== + # STAGE: math-libs (per-arch) + # ========================================================================== + math-libs: + name: Stage - Math Libs (${{ matrix.family_info.amdgpu_family }}) + needs: compiler-runtime + strategy: + fail-fast: false + matrix: + family_info: ${{ fromJSON(inputs.matrix_per_family_json) }} + runs-on: azure-linux-scale-rocm + timeout-minutes: 480 # 8 hours + permissions: + id-token: write + container: + image: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:583d473f263a289222c48d4b493e2956b2354a45796f09dee6f2c8ecd4504ab6 + options: -v /runner/config:/home/awsconfig/ + env: + AWS_SHARED_CREDENTIALS_FILE: /home/awsconfig/credentials.ini + STAGE_NAME: math-libs + AMDGPU_FAMILIES: ${{ matrix.family_info.amdgpu_family }} + steps: + - name: Checkout Repository + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + + - name: Install python deps + run: pip install -r requirements.txt + + - name: Adjust git config + run: | + git config --global --add safe.directory $PWD + git config fetch.parallel 10 + + - name: Setup ccache + run: | + ./build_tools/setup_ccache.py \ + --config-preset "github-oss-presubmit" \ + --dir "$(dirname $CCACHE_CONFIGPATH)" \ + --local-path "$CACHE_DIR/ccache" + + - name: Runner health status + run: | + ./build_tools/health_status.py + + - name: Configure AWS Credentials + if: ${{ !github.event.pull_request.head.repo.fork }} + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-ci + + - name: Fetch inbound artifacts + if: ${{ !github.event.pull_request.head.repo.fork }} + run: 
| + python build_tools/artifact_manager.py fetch --run-id ${{ github.run_id }} \ + --stage ${STAGE_NAME} \ + --amdgpu-families ${{ matrix.family_info.amdgpu_family }} \ + --output-dir build \ + --bootstrap + + - name: Fetch sources + timeout-minutes: 30 + run: ./build_tools/fetch_sources.py --stage ${STAGE_NAME} --jobs 12 --depth 1 + + - name: Get stage configuration + id: stage_config + run: | + python build_tools/configure_stage.py \ + --stage ${STAGE_NAME} \ + --amdgpu-families ${{ matrix.family_info.amdgpu_family }} \ + --dist-amdgpu-families "${{ inputs.dist_amdgpu_families }}" \ + --gha-output + + - name: Install stage python deps + if: ${{ steps.stage_config.outputs.pip_install_cmd }} + run: pip install ${{ steps.stage_config.outputs.pip_install_cmd }} + + - name: Configure + run: | + cmake -B build -S . -GNinja \ + -DTHEROCK_PACKAGE_VERSION=${{ inputs.rocm_package_version }} \ + -DTHEROCK_AMDGPU_FAMILIES=${{ matrix.family_info.amdgpu_family }} \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + ${{ steps.stage_config.outputs.cmake_args }} + + - name: Build stage + run: | + cmake --build build --target stage-${STAGE_NAME} therock-artifacts -- -k 0 + + - name: Report + if: ${{ !cancelled() }} + run: | + echo "CCache Stats:" + ccache -s -v + echo "Artifacts:" + ls -lh build/artifacts/*.tar.xz 2>/dev/null || echo "No artifacts found" + + - name: Configure AWS Credentials (refresh for push) + if: ${{ !github.event.pull_request.head.repo.fork }} + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-ci + + - name: Push stage artifacts + if: ${{ !github.event.pull_request.head.repo.fork }} + run: | + python build_tools/artifact_manager.py push --run-id ${{ github.run_id }} \ + --stage ${STAGE_NAME} \ + --amdgpu-families ${{ matrix.family_info.amdgpu_family }} \ + --build-dir build + + # ========================================================================== + # STAGE: comm-libs (per-arch, parallel to math-libs) + # ========================================================================== + comm-libs: + name: Stage - Comm Libs (${{ matrix.family_info.amdgpu_family }}) + needs: compiler-runtime + strategy: + fail-fast: false + matrix: + family_info: ${{ fromJSON(inputs.matrix_per_family_json) }} + runs-on: azure-linux-scale-rocm + timeout-minutes: 240 # 4 hours + permissions: + id-token: write + container: + image: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:583d473f263a289222c48d4b493e2956b2354a45796f09dee6f2c8ecd4504ab6 + options: -v /runner/config:/home/awsconfig/ + env: + AWS_SHARED_CREDENTIALS_FILE: /home/awsconfig/credentials.ini + STAGE_NAME: comm-libs + AMDGPU_FAMILIES: ${{ matrix.family_info.amdgpu_family }} + steps: + - name: Checkout Repository + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + + - name: Install python deps + run: pip install -r requirements.txt + + - name: Adjust git config + run: | + git config --global --add safe.directory $PWD + git config fetch.parallel 10 + + - name: Setup ccache + run: | + ./build_tools/setup_ccache.py \ + --config-preset "github-oss-presubmit" \ + --dir "$(dirname $CCACHE_CONFIGPATH)" \ + --local-path "$CACHE_DIR/ccache" + + - name: Runner health status + run: | + ./build_tools/health_status.py + + - name: Configure AWS Credentials + if: ${{ !github.event.pull_request.head.repo.fork }} + uses: 
aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-ci + + - name: Fetch inbound artifacts + if: ${{ !github.event.pull_request.head.repo.fork }} + run: | + python build_tools/artifact_manager.py fetch --run-id ${{ github.run_id }} \ + --stage ${STAGE_NAME} \ + --amdgpu-families ${{ matrix.family_info.amdgpu_family }} \ + --output-dir build \ + --bootstrap + + - name: Fetch sources + timeout-minutes: 30 + run: ./build_tools/fetch_sources.py --stage ${STAGE_NAME} --jobs 12 --depth 1 + + - name: Get stage configuration + id: stage_config + run: | + python build_tools/configure_stage.py \ + --stage ${STAGE_NAME} \ + --amdgpu-families ${{ matrix.family_info.amdgpu_family }} \ + --dist-amdgpu-families "${{ inputs.dist_amdgpu_families }}" \ + --gha-output + + - name: Install stage python deps + if: ${{ steps.stage_config.outputs.pip_install_cmd }} + run: pip install ${{ steps.stage_config.outputs.pip_install_cmd }} + + - name: Configure + run: | + cmake -B build -S . -GNinja \ + -DTHEROCK_PACKAGE_VERSION=${{ inputs.rocm_package_version }} \ + -DTHEROCK_AMDGPU_FAMILIES=${{ matrix.family_info.amdgpu_family }} \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + ${{ steps.stage_config.outputs.cmake_args }} + + - name: Build stage + run: | + cmake --build build --target stage-${STAGE_NAME} therock-artifacts -- -k 0 + + - name: Report + if: ${{ !cancelled() }} + run: | + echo "CCache Stats:" + ccache -s -v + echo "Artifacts:" + ls -lh build/artifacts/*.tar.xz 2>/dev/null || echo "No artifacts found" + + - name: Configure AWS Credentials (refresh for push) + if: ${{ !github.event.pull_request.head.repo.fork }} + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-ci + + - name: Push stage artifacts + if: ${{ !github.event.pull_request.head.repo.fork }} + run: | + python build_tools/artifact_manager.py push --run-id ${{ github.run_id }} \ + --stage ${STAGE_NAME} \ + --amdgpu-families ${{ matrix.family_info.amdgpu_family }} \ + --build-dir build + + # ========================================================================== + # STAGE: dctools-core (generic, parallel to math-libs) + # ========================================================================== + dctools-core: + name: Stage - DC Tools Core + needs: compiler-runtime + runs-on: azure-linux-scale-rocm + timeout-minutes: 120 # 2 hours + permissions: + id-token: write + container: + image: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:583d473f263a289222c48d4b493e2956b2354a45796f09dee6f2c8ecd4504ab6 + options: -v /runner/config:/home/awsconfig/ + env: + AWS_SHARED_CREDENTIALS_FILE: /home/awsconfig/credentials.ini + STAGE_NAME: dctools-core + steps: + - name: Checkout Repository + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + + - name: Install python deps + run: pip install -r requirements.txt + + - name: Adjust git config + run: | + git config --global --add safe.directory $PWD + git config fetch.parallel 10 + + - name: Setup ccache + run: | + ./build_tools/setup_ccache.py \ + --config-preset "github-oss-presubmit" \ + --dir "$(dirname $CCACHE_CONFIGPATH)" \ + --local-path "$CACHE_DIR/ccache" + + - name: Runner health status + run: | + ./build_tools/health_status.py + + - name: Configure AWS Credentials + if: ${{ 
!github.event.pull_request.head.repo.fork }} + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-ci + + - name: Fetch inbound artifacts + if: ${{ !github.event.pull_request.head.repo.fork }} + run: | + python build_tools/artifact_manager.py fetch --run-id ${{ github.run_id }} \ + --stage ${STAGE_NAME} \ + --output-dir build \ + --bootstrap + + - name: Fetch sources + timeout-minutes: 30 + run: ./build_tools/fetch_sources.py --stage ${STAGE_NAME} --jobs 12 --depth 1 + + - name: Get stage configuration + id: stage_config + run: | + python build_tools/configure_stage.py \ + --stage ${STAGE_NAME} \ + --dist-amdgpu-families "${{ inputs.dist_amdgpu_families }}" \ + --gha-output + + - name: Install stage python deps + if: ${{ steps.stage_config.outputs.pip_install_cmd }} + run: pip install ${{ steps.stage_config.outputs.pip_install_cmd }} + + - name: Configure + run: | + cmake -B build -S . -GNinja \ + -DTHEROCK_PACKAGE_VERSION=${{ inputs.rocm_package_version }} \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + ${{ steps.stage_config.outputs.cmake_args }} + + - name: Build stage + run: | + cmake --build build --target stage-${STAGE_NAME} therock-artifacts -- -k 0 + + - name: Report + if: ${{ !cancelled() }} + run: | + echo "CCache Stats:" + ccache -s -v + echo "Artifacts:" + ls -lh build/artifacts/*.tar.xz 2>/dev/null || echo "No artifacts found" + + - name: Configure AWS Credentials (refresh for push) + if: ${{ !github.event.pull_request.head.repo.fork }} + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-ci + + - name: Push stage artifacts + if: ${{ !github.event.pull_request.head.repo.fork }} + run: | + python build_tools/artifact_manager.py push --run-id ${{ github.run_id }} \ + --stage ${STAGE_NAME} \ + --build-dir build + + # ========================================================================== + # STAGE: profiler-apps (generic, parallel to math-libs) + # ========================================================================== + profiler-apps: + name: Stage - Profiler Apps + needs: compiler-runtime + runs-on: azure-linux-scale-rocm + timeout-minutes: 180 # 3 hours + permissions: + id-token: write + container: + image: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:583d473f263a289222c48d4b493e2956b2354a45796f09dee6f2c8ecd4504ab6 + options: -v /runner/config:/home/awsconfig/ + env: + AWS_SHARED_CREDENTIALS_FILE: /home/awsconfig/credentials.ini + STAGE_NAME: profiler-apps + steps: + - name: Checkout Repository + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + + - name: Install python deps + run: pip install -r requirements.txt + + - name: Adjust git config + run: | + git config --global --add safe.directory $PWD + git config fetch.parallel 10 + + - name: Setup ccache + run: | + ./build_tools/setup_ccache.py \ + --config-preset "github-oss-presubmit" \ + --dir "$(dirname $CCACHE_CONFIGPATH)" \ + --local-path "$CACHE_DIR/ccache" + + - name: Runner health status + run: | + ./build_tools/health_status.py + + - name: Configure AWS Credentials + if: ${{ !github.event.pull_request.head.repo.fork }} + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: 
arn:aws:iam::692859939525:role/therock-ci + + - name: Fetch inbound artifacts + if: ${{ !github.event.pull_request.head.repo.fork }} + run: | + python build_tools/artifact_manager.py fetch --run-id ${{ github.run_id }} \ + --stage ${STAGE_NAME} \ + --output-dir build \ + --bootstrap + + - name: Fetch sources + timeout-minutes: 30 + run: ./build_tools/fetch_sources.py --stage ${STAGE_NAME} --jobs 12 --depth 1 + + - name: Get stage configuration + id: stage_config + run: | + python build_tools/configure_stage.py \ + --stage ${STAGE_NAME} \ + --dist-amdgpu-families "${{ inputs.dist_amdgpu_families }}" \ + --gha-output + + - name: Install stage python deps + if: ${{ steps.stage_config.outputs.pip_install_cmd }} + run: pip install ${{ steps.stage_config.outputs.pip_install_cmd }} + + - name: Configure + run: | + cmake -B build -S . -GNinja \ + -DTHEROCK_PACKAGE_VERSION=${{ inputs.rocm_package_version }} \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + ${{ steps.stage_config.outputs.cmake_args }} + + - name: Build stage + run: | + cmake --build build --target stage-${STAGE_NAME} therock-artifacts -- -k 0 + + - name: Report + if: ${{ !cancelled() }} + run: | + echo "CCache Stats:" + ccache -s -v + echo "Artifacts:" + ls -lh build/artifacts/*.tar.xz 2>/dev/null || echo "No artifacts found" + + - name: Configure AWS Credentials (refresh for push) + if: ${{ !github.event.pull_request.head.repo.fork }} + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-ci + + - name: Push stage artifacts + if: ${{ !github.event.pull_request.head.repo.fork }} + run: | + python build_tools/artifact_manager.py push --run-id ${{ github.run_id }} \ + --stage ${STAGE_NAME} \ + --build-dir build + + # ========================================================================== + # STAGE: media (generic) + # ========================================================================== + media: + name: Stage - Media + needs: foundation + runs-on: azure-linux-scale-rocm + timeout-minutes: 180 # 3 hours + permissions: + id-token: write + container: + image: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:583d473f263a289222c48d4b493e2956b2354a45796f09dee6f2c8ecd4504ab6 + options: -v /runner/config:/home/awsconfig/ + env: + AWS_SHARED_CREDENTIALS_FILE: /home/awsconfig/credentials.ini + STAGE_NAME: media + steps: + - name: Checkout Repository + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + + - name: Install python deps + run: pip install -r requirements.txt + + - name: Adjust git config + run: | + git config --global --add safe.directory $PWD + git config fetch.parallel 10 + + - name: Setup ccache + run: | + ./build_tools/setup_ccache.py \ + --config-preset "github-oss-presubmit" \ + --dir "$(dirname $CCACHE_CONFIGPATH)" \ + --local-path "$CACHE_DIR/ccache" + + - name: Runner health status + run: | + ./build_tools/health_status.py + + - name: Configure AWS Credentials + if: ${{ !github.event.pull_request.head.repo.fork }} + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-ci + + - name: Fetch inbound artifacts + if: ${{ !github.event.pull_request.head.repo.fork }} + run: | + python build_tools/artifact_manager.py fetch --run-id ${{ github.run_id }} \ + --stage ${STAGE_NAME} \ + --output-dir build \ + 
--bootstrap + + - name: Fetch sources + timeout-minutes: 30 + run: ./build_tools/fetch_sources.py --stage ${STAGE_NAME} --jobs 12 --depth 1 + + - name: Get stage configuration + id: stage_config + run: | + python build_tools/configure_stage.py \ + --stage ${STAGE_NAME} \ + --dist-amdgpu-families "${{ inputs.dist_amdgpu_families }}" \ + --gha-output + + - name: Install stage python deps + if: ${{ steps.stage_config.outputs.pip_install_cmd }} + run: pip install ${{ steps.stage_config.outputs.pip_install_cmd }} + + - name: Configure + run: | + cmake -B build -S . -GNinja \ + -DTHEROCK_PACKAGE_VERSION=${{ inputs.rocm_package_version }} \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + ${{ steps.stage_config.outputs.cmake_args }} + + - name: Build stage + run: | + cmake --build build --target stage-${STAGE_NAME} therock-artifacts -- -k 0 + + - name: Report + if: ${{ !cancelled() }} + run: | + echo "CCache Stats:" + ccache -s -v + echo "Artifacts:" + ls -lh build/artifacts/*.tar.xz 2>/dev/null || echo "No artifacts found" + + - name: Configure AWS Credentials (refresh for push) + if: ${{ !github.event.pull_request.head.repo.fork }} + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-ci + + - name: Push stage artifacts + if: ${{ !github.event.pull_request.head.repo.fork }} + run: | + python build_tools/artifact_manager.py push --run-id ${{ github.run_id }} \ + --stage ${STAGE_NAME} \ + --build-dir build diff --git a/.github/workflows/multi_arch_ci.yml b/.github/workflows/multi_arch_ci.yml new file mode 100644 index 0000000000000..73a6a74b9df2c --- /dev/null +++ b/.github/workflows/multi_arch_ci.yml @@ -0,0 +1,142 @@ +# Multi-Arch CI +# +# This is a staging workflow for the sharded multi-arch build pipeline. +# It mirrors ci.yml but uses multi_arch_build_portable_linux.yml instead of +# ci_linux.yml. Once validated, ci.yml will be updated to use the multi-arch +# sub-workflows directly. + +name: Multi-Arch CI + +on: + push: + branches: + # While we are iterating on testing. + - 'multi_arch/**' + workflow_dispatch: + inputs: + linux_amdgpu_families: + type: string + description: "Comma-separated list of Linux GPU families to build and test, e.g. gfx94X, gfx1201X" + default: "" + linux_test_labels: + type: string + description: "If set, reduce the test set on Linux to the list of labels prefixed with 'test:', e.g. test:rocprim, test:hipcub" + default: "" + linux_use_prebuilt_artifacts: + type: boolean + description: "If enabled, the CI will pull Linux artifacts using artifact_run_id and only run tests" + windows_amdgpu_families: + type: string + description: "Comma-separated list of Windows GPU families to build and test, e.g. gfx94X, gfx1201X" + default: "" + windows_test_labels: + type: string + description: "If set, reduce the test set on Windows to the list of labels prefixed with 'test:', e.g. test:rocprim, test:hipcub" + default: "" + windows_use_prebuilt_artifacts: + type: boolean + description: "If enabled, the CI will pull Windows artifacts using artifact_run_id and only run tests" + artifact_run_id: + type: string + description: "If provided, the tests will run on this artifact ID" + default: "" + # pull_request: + # types: + # - labeled + # - opened + # - synchronize + +permissions: + contents: read + +concurrency: + # A PR number if a pull request and otherwise the commit hash.
This cancels + # queued and in-progress runs for the same PR (presubmit) or commit + # (postsubmit). The workflow name is prepended to avoid conflicts between + # different workflows. + group: ${{ github.workflow }}-${{ github.event.number || github.sha }} + cancel-in-progress: true + +jobs: + setup: + uses: ./.github/workflows/setup.yml + with: + build_variant: "release" + multi_arch: true + + linux_build_and_test: + name: Linux::${{ matrix.variant.build_variant_label }} + needs: setup + if: >- + ${{ + needs.setup.outputs.linux_variants != '[]' && + needs.setup.outputs.enable_build_jobs == 'true' + }} + strategy: + fail-fast: false + matrix: + variant: ${{ fromJSON(needs.setup.outputs.linux_variants) }} + uses: ./.github/workflows/multi_arch_build_portable_linux.yml + secrets: inherit + with: + matrix_per_family_json: ${{ matrix.variant.matrix_per_family_json }} + dist_amdgpu_families: ${{ matrix.variant.dist_amdgpu_families }} + artifact_group: ${{ matrix.variant.artifact_group }} + build_variant_label: ${{ matrix.variant.build_variant_label }} + build_variant_suffix: ${{ matrix.variant.build_variant_suffix }} + build_variant_cmake_preset: ${{ matrix.variant.build_variant_cmake_preset }} + test_labels: ${{ needs.setup.outputs.linux_test_labels }} + artifact_run_id: ${{ inputs.artifact_run_id }} + expect_failure: ${{ matrix.variant.expect_failure == true }} + use_prebuilt_artifacts: ${{ inputs.linux_use_prebuilt_artifacts == true && 'true' || 'false' }} + rocm_package_version: ${{ needs.setup.outputs.rocm_package_version }} + test_type: ${{ needs.setup.outputs.test_type }} + permissions: + contents: read + id-token: write + + # TODO: Add windows_build_and_test when ready + # windows_build_and_test: + # name: Windows::${{ matrix.variant.family }}::${{ matrix.variant.build_variant_label }} + # needs: setup + # if: >- + # ${{ + # needs.setup.outputs.windows_variants != '[]' && + # needs.setup.outputs.enable_build_jobs == 'true' + # }} + # strategy: + # fail-fast: false + # matrix: + # variant: ${{ fromJSON(needs.setup.outputs.windows_variants) }} + # uses: ./.github/workflows/ci_windows.yml + # ... + + ci_summary: + name: CI Summary + if: always() + needs: + - setup + - linux_build_and_test + runs-on: ubuntu-24.04 + steps: + - name: Output failed jobs + run: | + # Build a list of failed jobs, but ignore those marked continue-on-error + FAILED_JOBS="$(echo '${{ toJson(needs) }}' \ + | jq --raw-output ' + to_entries + | map(select( + (.value.result != "success" and .value.result != "skipped") + and (.value.outputs.continue_on_error | not) + )) + | map(.key) + | join(",") + ' \ + )" + + if [[ -n "${FAILED_JOBS}" ]]; then + echo "The following jobs failed: ${FAILED_JOBS}" + exit 1 + else + echo "All required jobs succeeded (continue-on-error jobs ignored)." 
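+ # (Note: the jq filter above treats any job that publishes a truthy continue_on_error output as optional, so its failure is ignored here.)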
+ fi diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml new file mode 100644 index 0000000000000..a129cad3f0c1a --- /dev/null +++ b/.github/workflows/pre-commit.yml @@ -0,0 +1,21 @@ +name: pre-commit + +on: + pull_request: + push: + branches: [main] + +permissions: + contents: read + +jobs: + pre-commit: + runs-on: ubuntu-24.04 + steps: + - name: Checkout TheRock repository + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + repository: "ROCm/TheRock" + fetch-depth: 10 + - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 + - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1 diff --git a/.github/workflows/publish_build_manylinux_rccl_x86_64.yml b/.github/workflows/publish_build_manylinux_rccl_x86_64.yml new file mode 100644 index 0000000000000..5e9c22824da45 --- /dev/null +++ b/.github/workflows/publish_build_manylinux_rccl_x86_64.yml @@ -0,0 +1,21 @@ +name: Publish build_manylinux_rccl_x86_64 images +on: + workflow_dispatch: + push: + branches: + - 'main' + - 'stage/docker/**' + paths: + - dockerfiles/build_manylinux_rccl_x86_64*.Dockerfile + - .github/workflows/publish_build_manylinux_rccl_x86_64.yml + +permissions: + contents: read + packages: write + +jobs: + publish_build_manylinux_x86_64: + uses: ./.github/workflows/publish_dockerfile.yml + with: + DOCKER_FILE_NAME: build_manylinux_rccl_x86_64 + DOCKER_IMAGE_NAME: therock_build_manylinux_rccl_x86_64 diff --git a/.github/workflows/publish_build_manylinux_x86_64.yml b/.github/workflows/publish_build_manylinux_x86_64.yml new file mode 100644 index 0000000000000..4501d1fe776db --- /dev/null +++ b/.github/workflows/publish_build_manylinux_x86_64.yml @@ -0,0 +1,21 @@ +name: Publish build_manylinux_x86_64 images +on: + workflow_dispatch: + push: + branches: + - 'main' + - 'stage/docker/**' + paths: + - dockerfiles/build_manylinux_x86_64*.Dockerfile + - .github/workflows/publish_build_manylinux_x86_64.yml + +permissions: + contents: read + packages: write + +jobs: + publish_build_manylinux_x86_64: + uses: ./.github/workflows/publish_dockerfile.yml + with: + DOCKER_FILE_NAME: build_manylinux_x86_64 + DOCKER_IMAGE_NAME: therock_build_manylinux_x86_64 diff --git a/.github/workflows/publish_dockerfile.yml b/.github/workflows/publish_dockerfile.yml new file mode 100644 index 0000000000000..bb725e88a8cd0 --- /dev/null +++ b/.github/workflows/publish_dockerfile.yml @@ -0,0 +1,70 @@ +name: Publish TheRock Docker image +on: + workflow_call: + inputs: + DOCKER_FILE_NAME: + type: string + DOCKER_IMAGE_NAME: + type: string + +jobs: + build-and-push-image: + runs-on: ubuntu-24.04 + env: + REGISTRY: ghcr.io + IMAGE_NAME: ROCm/${{ inputs.DOCKER_IMAGE_NAME }} + # Sets the permissions granted to the `GITHUB_TOKEN` for the actions in this job. 
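+ # (packages: write is what permits the GHCR push below; contents: read is enough for the checkout step.)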
+ permissions: + contents: read + packages: write + steps: + - name: Checkout repository + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + + - name: Log in to the Container registry + uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # v3.6.0 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + # Sanitization of tag names is done automatically by the metadata-action + - name: Determine Docker tag + id: tag + run: | + ref="${{ github.ref_name }}" + if [[ "$ref" == stage/docker/* ]]; then + suffix="${ref#stage/docker/}" + echo "TAG_SUFFIX=stage-${suffix}" >> "$GITHUB_OUTPUT" + elif [[ "$ref" == "main" ]]; then + echo "TAG_SUFFIX=latest" >> "$GITHUB_OUTPUT" + else + echo "TAG_SUFFIX=${ref}" >> "$GITHUB_OUTPUT" + fi + + # Adds extra tags to the image, with the default tags from https://github.com/docker/metadata-action#tags-input + # The custom tag is for the branches prefixed with `stage/docker/`. + # For the default branch (i.e., main), the default behaviour remains and is labelled `latest`. + - name: Extract metadata (tags, labels) for Docker + id: meta + uses: docker/metadata-action@c299e40c65443455700f0fdfc63efafe5b349051 # v5.10.0 + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + tags: | + type=schedule + type=ref,event=branch,enable={{is_default_branch}} + type=ref,event=tag + type=ref,event=pr + type=raw,value=${{ steps.tag.outputs.TAG_SUFFIX }} + + # This step uses the `docker/build-push-action` action to build the image, based on your repository's `Dockerfile`. If the build succeeds, it pushes the image to GitHub Packages. + # It uses the `context` parameter to define the build's context as the set of files located in the specified path. For more information, see "[Usage](https://github.com/docker/build-push-action#usage)" in the README of the `docker/build-push-action` repository. + # It uses the `tags` and `labels` parameters to tag and label the image with the output from the "meta" step. 
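+ # (Hypothetical local equivalent for debugging, not part of the workflow: `docker build -f dockerfiles/<DOCKER_FILE_NAME>.Dockerfile dockerfiles/`; the tag and label handling above is CI-only.)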
+ - name: Build and push Docker image + uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # v6.18.0 + with: + context: dockerfiles/ + file: dockerfiles/${{ inputs.DOCKER_FILE_NAME }}.Dockerfile + push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} diff --git a/.github/workflows/publish_no_rocm_image_ubuntu24_04.yml b/.github/workflows/publish_no_rocm_image_ubuntu24_04.yml new file mode 100644 index 0000000000000..ca562fc899e62 --- /dev/null +++ b/.github/workflows/publish_no_rocm_image_ubuntu24_04.yml @@ -0,0 +1,21 @@ +name: Publish no_rocm_image_ubuntu24_04 images +on: + workflow_dispatch: + push: + branches: + - 'main' + - 'stage/docker/**' + paths: + - dockerfiles/no_rocm_image_ubuntu24_04*.Dockerfile + - .github/workflows/publish_no_rocm_image_ubuntu24_04.yml + +permissions: + contents: read + packages: write + +jobs: + publish_no_rocm_image_ubuntu24_04: + uses: ./.github/workflows/publish_dockerfile.yml + with: + DOCKER_FILE_NAME: no_rocm_image_ubuntu24_04 + DOCKER_IMAGE_NAME: no_rocm_image_ubuntu24_04 diff --git a/.github/workflows/release_native_linux_packages.yml b/.github/workflows/release_native_linux_packages.yml new file mode 100644 index 0000000000000..50e4dd2972797 --- /dev/null +++ b/.github/workflows/release_native_linux_packages.yml @@ -0,0 +1,67 @@ +name: Release native Linux Packages + +on: + workflow_call: + inputs: + amdgpu_family: + description: gfx arch used to construct the S3 bucket URL + required: true + type: string + artifact_run_id: + description: Workflow run ID to download the artifacts from + type: string + rocm_version: + description: ROCm version to append to the package (e.g. 8.0.0 or 8.1.0). + required: true + type: string + package_type: + description: Specify whether Debian or RPM packages are needed (deb or rpm). + required: true + type: string + package_suffix: + description: The suffix to be added to the package name (build_no, master, rc, or a combination). + required: true + type: string + workflow_dispatch: + inputs: + amdgpu_family: + type: string + default: gfx94X-dcgpu + artifact_run_id: + description: Workflow run ID to download the artifacts from + type: string + rocm_version: + description: ROCm version to append to the package (e.g. 7.0.0 or 7.1.0) + type: string + default: "0.0.1" + package_type: + description: Specify whether Debian or RPM packages are needed (deb or rpm). + required: true + type: choice + options: + - rpm + - deb + default: "rpm" + package_suffix: + description: The suffix to be added to the package name (build_no, master, rc, or a combination).
+ type: string + default: "test" + +permissions: + id-token: write + contents: read + +run-name: Release native Linux packages (${{ inputs.amdgpu_family }}, ${{ inputs.rocm_version }}, ${{ inputs.package_type }}, ${{ inputs.package_suffix }}) + +jobs: + release: + name: Release Native Linux Package + strategy: + fail-fast: false + uses: ./.github/workflows/build_native_linux_packages.yml + with: + artifact_group: ${{ inputs.amdgpu_family }} + artifact_run_id: ${{ inputs.artifact_run_id }} + rocm_version: ${{ inputs.rocm_version }} + native_package_type: ${{ inputs.package_type }} + package_suffix: ${{ inputs.package_suffix }} diff --git a/.github/workflows/release_portable_linux_packages.yml b/.github/workflows/release_portable_linux_packages.yml new file mode 100644 index 0000000000000..133f7403de1d1 --- /dev/null +++ b/.github/workflows/release_portable_linux_packages.yml @@ -0,0 +1,380 @@ +name: Release portable Linux packages + +on: + # Trigger from another workflow (typically to build dev packages and then test them) + workflow_call: + inputs: + release_type: + description: The type of release to build ("dev", "nightly", or "prerelease"). All developer-triggered jobs should use "dev"! + type: string + default: "dev" + package_suffix: + type: string + s3_subdir: + description: "Subdirectory to push the packages" + type: string + default: "v2" + s3_staging_subdir: + description: "Staging subdirectory to push the packages" + type: string + default: "v2-staging" + families: + description: "Comma separated list of AMD GPU families, e.g. `gfx94X,gfx103x`" + type: string + prerelease_version: + description: "(Optional) Number of the prerelease" + type: string + repository: + description: "Repository to checkout. Otherwise, defaults to `github.repository`." + type: string + ref: + description: "Branch, tag or SHA to checkout. Defaults to the reference or SHA that triggered the workflow." + type: string + # Trigger manually (typically to test the workflow or manually build a release [candidate]) + workflow_dispatch: + inputs: + release_type: + description: The type of release to build ("dev", "nightly", or "prerelease"). All developer-triggered jobs should use "dev"! + type: string + default: "dev" + package_suffix: + type: string + s3_subdir: + description: "Subdirectory to push the packages" + type: string + default: "v2" + s3_staging_subdir: + description: "Staging subdirectory to push the packages" + type: string + default: "v2-staging" + families: + description: "Comma separated list of AMD GPU families, e.g. `gfx94X,gfx103x`" + type: string + prerelease_version: + description: "(Optional) Number of the prerelease" + type: string + # Trigger on a schedule to build nightly release candidates. 
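+ # (Scheduled runs carry no `inputs` context, so the `inputs.x || 'default'` fallbacks below take effect; e.g. release_type resolves to "nightly".)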
+ schedule: + # Runs at 04:00 AM UTC, which is 8:00 PM PST (UTC-8) + - cron: '0 04 * * *' + +permissions: + contents: read + +run-name: Release portable Linux packages (${{ inputs.families || 'default' }}, ${{ inputs.release_type || 'nightly' }}) + +jobs: + setup_metadata: + if: ${{ github.repository_owner == 'ROCm' || github.event_name != 'schedule' }} + runs-on: ubuntu-24.04 + env: + release_type: ${{ inputs.release_type || 'nightly' }} + outputs: + version: ${{ steps.rocm_package_version.outputs.rocm_package_version }} + rpm_version: ${{ steps.rocm_native_package_version.outputs.rocm_rpm_package_version }} + deb_version: ${{ steps.rocm_native_package_version.outputs.rocm_deb_package_version }} + release_type: ${{ env.release_type }} + package_targets: ${{ steps.configure.outputs.package_targets }} + cloudfront_url: ${{ steps.release_information.outputs.cloudfront_url }} + cloudfront_staging_url: ${{ steps.release_information.outputs.cloudfront_staging_url }} + s3_subdir_tar: ${{ steps.release_information.outputs.s3_subdir_tar }} + steps: + - name: Checkout repository + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + repository: ${{ inputs.repository || github.repository }} + ref: ${{ inputs.ref || '' }} + + - name: Setup Python + uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 + with: + python-version: 3.12 + + - name: Compute package version + id: rocm_package_version + run: | + python ./build_tools/compute_rocm_package_version.py \ + --release-type=${{ env.release_type }} \ + --prerelease-version=${{ inputs.prerelease_version }} + + - name: Compute native package version + id: rocm_native_package_version + run: | + # Compute rpm package version + # This sets the 'rocm_rpm_package_version' output + python ./build_tools/compute_rocm_package_version.py \ + --release-type=${{ env.release_type }} \ + --prerelease-version=${{ inputs.prerelease_version }} \ + --package-type="rpm" + # Compute debian package version + # This sets the 'rocm_deb_package_version' output + python ./build_tools/compute_rocm_package_version.py \ + --release-type=${{ env.release_type }} \ + --prerelease-version=${{ inputs.prerelease_version }} \ + --package-type="deb" + + - name: Set variables for nightly release + if: ${{ env.release_type == 'nightly' }} + run: | + echo "tmp_cloudfront_url=https://rocm.nightlies.amd.com/v2" >> $GITHUB_ENV + echo "tmp_cloudfront_staging_url=https://rocm.nightlies.amd.com/v2-staging" >> $GITHUB_ENV + echo "tmp_s3_subdir_tar=''" >> $GITHUB_ENV + + - name: Set variables for prerelease + if: ${{ env.release_type == 'prerelease' }} + run: | + echo "tmp_cloudfront_url=https://rocm.prereleases.amd.com/whl" >> $GITHUB_ENV + echo "tmp_cloudfront_staging_url=https://rocm.prereleases.amd.com/whl-staging" >> $GITHUB_ENV + echo "tmp_s3_subdir_tar=v3/tarball/" >> $GITHUB_ENV + + - name: Set variables for development release + if: ${{ env.release_type == 'dev' }} + run: | + echo "tmp_cloudfront_url=https://rocm.devreleases.amd.com/v2" >> $GITHUB_ENV + echo "tmp_cloudfront_staging_url=https://rocm.devreleases.amd.com/v2-staging" >> $GITHUB_ENV + echo "tmp_s3_subdir_tar=''" >> $GITHUB_ENV + + - name: Generate release information + id: release_information + run: | + echo "cloudfront_url=${tmp_cloudfront_url}" >> $GITHUB_OUTPUT + echo "cloudfront_staging_url=${tmp_cloudfront_staging_url}" >> $GITHUB_OUTPUT + echo "s3_subdir_tar=${tmp_s3_subdir_tar}" >> $GITHUB_OUTPUT + + - name: Generating package target matrix + id: configure + env: + 
AMDGPU_FAMILIES: ${{ inputs.families }} + THEROCK_PACKAGE_PLATFORM: "linux" + # Variable comes from ROCm organization variable 'ROCM_THEROCK_TEST_RUNNERS' + ROCM_THEROCK_TEST_RUNNERS: ${{ vars.ROCM_THEROCK_TEST_RUNNERS }} + LOAD_TEST_RUNNERS_FROM_VAR: false + run: python ./build_tools/github_actions/fetch_package_targets.py + + portable_linux_packages: + name: ${{ matrix.target_bundle.amdgpu_family }}::Build Portable Linux + runs-on: ${{ github.repository_owner == 'ROCm' && 'azure-linux-scale-rocm' || 'ubuntu-24.04' }} + continue-on-error: ${{ matrix.target_bundle.expect_failure == true }} # for GPU families that are flaky, we mark as xfail + timeout-minutes: 720 # 12 hour timeout + needs: [setup_metadata] + permissions: + contents: write + actions: write # Added permission to trigger workflows + id-token: write # Added permission for AWS S3 upload + strategy: + fail-fast: false + matrix: + target_bundle: ${{ fromJSON(needs.setup_metadata.outputs.package_targets) }} + env: + TEATIME_LABEL_GH_GROUP: 1 + OUTPUT_DIR: ${{ github.workspace }}/output + BUILD_IMAGE: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:583d473f263a289222c48d4b493e2956b2354a45796f09dee6f2c8ecd4504ab6 + DIST_ARCHIVE: "${{ github.workspace }}/output/therock-dist-linux-${{ matrix.target_bundle.amdgpu_family }}${{ inputs.package_suffix }}-${{ needs.setup_metadata.outputs.version }}.tar.gz" + FILE_NAME: "therock-dist-linux-${{ matrix.target_bundle.amdgpu_family }}${{ inputs.package_suffix }}-${{ needs.setup_metadata.outputs.version }}.tar.gz" + RELEASE_TYPE: "${{ needs.setup_metadata.outputs.release_type }}" + S3_BUCKET_TAR: "therock-${{ needs.setup_metadata.outputs.release_type }}-tarball" + S3_SUBDIR_TAR: ${{ needs.setup_metadata.outputs.s3_subdir_tar }} + S3_BUCKET_PY: "therock-${{ needs.setup_metadata.outputs.release_type }}-python" + S3_SUBDIR: ${{ inputs.s3_subdir || 'v2' }} + S3_STAGING_SUBDIR: ${{ inputs.s3_staging_subdir || 'v2-staging' }} + MANYLINUX: 1 + + steps: + - name: "Checking out repository" + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + repository: ${{ inputs.repository || github.repository }} + ref: ${{ inputs.ref || '' }} + + - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 + with: + python-version: 3.12 + + # TODO: We shouldn't be using a cache on actual release branches, but it + # really helps for iteration time. + - name: Enable cache + uses: actions/cache/restore@0057852bfaa89a56745cba8c7296529d2fc39830 # v4.3.0 + with: + path: ${{ env.OUTPUT_DIR }}/caches + key: portable-linux-package-matrix-v1-${{ matrix.target_bundle.amdgpu_family }}-${{ github.sha }} + restore-keys: | + portable-linux-package-matrix-v1-${{ matrix.target_bundle.amdgpu_family }}- + + - name: Install the AWS tool + run: ./dockerfiles/install_awscli.sh + + - name: Fetch sources + timeout-minutes: 30 + run: | + # Prefetch docker container in background. + docker pull ${{ env.BUILD_IMAGE }} & + ./build_tools/fetch_sources.py --jobs 10 + wait + + - name: Build Projects + run: | + ./build_tools/linux_portable_build.py \ + --image=${{ env.BUILD_IMAGE }} \ + --output-dir=${{ env.OUTPUT_DIR }} \ + -- \ + "-DTHEROCK_AMDGPU_FAMILIES=${{ matrix.target_bundle.amdgpu_family }}" + cd ${{ env.OUTPUT_DIR }}/build/dist/rocm + echo "Building ${{ env.DIST_ARCHIVE }}" + tar cfz "${{ env.DIST_ARCHIVE }}" . 
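+ # (Optional local sanity check, not run in CI: tar tzf "${{ env.DIST_ARCHIVE }}" | head to confirm the archive layout before upload.)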
+ + - name: Build Python Packages + run: | + ./build_tools/linux_portable_build.py \ + --image=${{ env.BUILD_IMAGE }} \ + --output-dir=${{ env.OUTPUT_DIR }}/packages \ + --build-python-only \ + --artifact-dir=${{ env.OUTPUT_DIR }}/build/artifacts \ + -- \ + "--version=${{ needs.setup_metadata.outputs.version }}" + + - name: Grant ownership over output directory + if: ${{ !cancelled() }} + run: | + sudo chown -R $(whoami) ${{ env.OUTPUT_DIR }} + + - name: Build Report + if: ${{ !cancelled() }} + run: | + echo "Full SDK du:" + echo "------------" + du -h -d 1 ${{ env.OUTPUT_DIR }}/build/dist/rocm + + # Analyze ninja build log to generate per-component timing report + - name: Analyze Build Times + if: ${{ !cancelled() }} + run: | + python3 build_tools/analyze_build_times.py --build-dir ${{ env.OUTPUT_DIR }}/build + + - name: Configure AWS Credentials + if: ${{ github.repository_owner == 'ROCm' && !cancelled() }} + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-${{ env.RELEASE_TYPE }} + + - name: Post Build Upload + if: ${{ github.repository_owner == 'ROCm' && !cancelled() }} + run: | + python3 build_tools/github_actions/post_build_upload.py \ + --run-id ${{ github.run_id }} \ + --artifact-group "${{ matrix.target_bundle.amdgpu_family }}" \ + --build-dir ${{ env.OUTPUT_DIR }}/build \ + --upload \ + --job-status ${{ job.status }} + + - name: Upload Releases to staging S3 + if: ${{ github.repository_owner == 'ROCm' }} + run: | + aws s3 cp ${{ env.OUTPUT_DIR }}/packages/dist/ s3://${{ env.S3_BUCKET_PY }}/${{ env.S3_STAGING_SUBDIR }}/${{ matrix.target_bundle.amdgpu_family }}/ \ + --recursive --no-follow-symlinks \ + --exclude "*" \ + --include "*.whl" \ + --include "*.tar.gz" + + - name: (Re-)Generate Python package release index for staging + if: ${{ github.repository_owner == 'ROCm' }} + env: + # Environment variable to be set for `manage.py` + CUSTOM_PREFIX: "${{ env.S3_STAGING_SUBDIR }}/${{ matrix.target_bundle.amdgpu_family }}" + run: | + pip install boto3 packaging + python ./build_tools/third_party/s3_management/manage.py ${{ env.CUSTOM_PREFIX }} + + ## TODO: Restrict uploading to the non-staging S3 directory until ROCm sanity checks and all validation tests have successfully passed. 
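+ # (Note: aws s3 cp evaluates --exclude/--include filters in order, so the leading --exclude "*" restricts the recursive copy to the *.whl and *.tar.gz includes.)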
+ - name: Upload Releases to S3 + if: ${{ github.repository_owner == 'ROCm' }} + run: | + aws s3 cp ${{ env.DIST_ARCHIVE }} s3://${{ env.S3_BUCKET_TAR }}/${{ env.S3_SUBDIR_TAR }} + aws s3 cp ${{ env.OUTPUT_DIR }}/packages/dist/ s3://${{ env.S3_BUCKET_PY }}/${{ env.S3_SUBDIR }}/${{ matrix.target_bundle.amdgpu_family }}/ \ + --recursive --no-follow-symlinks \ + --exclude "*" \ + --include "*.whl" \ + --include "*.tar.gz" + + - name: (Re-)Generate release index pages + if: ${{ github.repository_owner == 'ROCm' }} + env: + # Environment variable to be set for `manage.py` + CUSTOM_PREFIX: "${{ env.S3_SUBDIR }}/${{ matrix.target_bundle.amdgpu_family }}" + run: | + pip install boto3 packaging + python ./build_tools/third_party/s3_management/manage.py ${{ env.CUSTOM_PREFIX }} + python ./build_tools/index_generation_s3_tar.py \ + --bucket ${{ env.S3_BUCKET_TAR }} \ + --directory ${{ env.S3_SUBDIR_TAR }} \ + --upload + + - name: Trigger building PyTorch wheels + if: ${{ github.repository_owner == 'ROCm' && matrix.target_bundle.expect_pytorch_failure == false }} + uses: benc-uk/workflow-dispatch@e2e5e9a103e331dad343f381a29e654aea3cf8fc # v1.2.4 + with: + workflow: release_portable_linux_pytorch_wheels.yml + inputs: | + { "amdgpu_family": "${{ matrix.target_bundle.amdgpu_family }}", + "release_type": "${{ env.RELEASE_TYPE }}", + "s3_subdir": "${{ env.S3_SUBDIR }}", + "s3_staging_subdir": "${{ env.S3_STAGING_SUBDIR }}", + "cloudfront_url": "${{ needs.setup_metadata.outputs.cloudfront_url }}", + "cloudfront_staging_url": "${{ needs.setup_metadata.outputs.cloudfront_staging_url }}", + "rocm_version": "${{ needs.setup_metadata.outputs.version }}", + "ref": "${{ inputs.ref || '' }}" + } + + - name: URL-encode .tar URL + # TODO: Enable JAX wheels for prereleases + if: ${{ env.RELEASE_TYPE != 'prerelease' }} + id: url-encode-tar + run: python -c "from urllib.parse import quote; print('tar_url=https://therock-${{ env.RELEASE_TYPE }}-tarball.s3.amazonaws.com/' + quote('therock-dist-linux-${{ matrix.target_bundle.amdgpu_family }}${{ inputs.package_suffix }}-${{ needs.setup_metadata.outputs.version }}.tar.gz'))" >> ${GITHUB_OUTPUT} + + - name: Trigger build JAX wheels + # TODO: Enable JAX wheels for prereleases + if: ${{ env.RELEASE_TYPE != 'prerelease' && github.repository_owner == 'ROCm' }} + uses: benc-uk/workflow-dispatch@e2e5e9a103e331dad343f381a29e654aea3cf8fc # v1.2.4 + with: + workflow: build_linux_jax_wheels.yml + inputs: | + { "amdgpu_family": "${{ matrix.target_bundle.amdgpu_family }}", + "python_version": "3.12", + "release_type": "${{ env.RELEASE_TYPE }}", + "s3_subdir": "${{ env.S3_STAGING_SUBDIR }}", + "rocm_version": "${{ needs.setup_metadata.outputs.version }}", + "tar_url": "${{ steps.url-encode-tar.outputs.tar_url }}" + } + + - name: Trigger build native rpm package + if: ${{ github.repository_owner == 'ROCm' }} + uses: benc-uk/workflow-dispatch@e2e5e9a103e331dad343f381a29e654aea3cf8fc # v1.2.4 + with: + workflow: build_native_linux_packages.yml + inputs: | + { "artifact_group": "${{ matrix.target_bundle.amdgpu_family }}", + "rocm_version": "${{ needs.setup_metadata.outputs.rpm_version }}", + "release_type": "${{ env.RELEASE_TYPE }}", + "artifact_run_id": "${{ github.run_id }}", + "native_package_type": "rpm" + } + + - name: Trigger build native debian package + if: ${{ github.repository_owner == 'ROCm' }} + uses: benc-uk/workflow-dispatch@e2e5e9a103e331dad343f381a29e654aea3cf8fc # v1.2.4 + with: + workflow: build_native_linux_packages.yml + inputs: | + { "artifact_group": "${{ 
matrix.target_bundle.amdgpu_family }}", + "rocm_version": "${{ needs.setup_metadata.outputs.deb_version }}", + "release_type": "${{ env.RELEASE_TYPE }}", + "artifact_run_id": "${{ github.run_id }}", + "native_package_type": "deb" + } + + - name: Save cache + uses: actions/cache/save@0057852bfaa89a56745cba8c7296529d2fc39830 # v4.3.0 + if: ${{ !cancelled() }} + with: + path: ${{ env.OUTPUT_DIR }}/caches + key: portable-linux-package-matrix-v1-${{ matrix.target_bundle.amdgpu_family }}-${{ github.sha }} diff --git a/.github/workflows/release_portable_linux_pytorch_wheels.yml b/.github/workflows/release_portable_linux_pytorch_wheels.yml new file mode 100644 index 0000000000000..87b52de133899 --- /dev/null +++ b/.github/workflows/release_portable_linux_pytorch_wheels.yml @@ -0,0 +1,114 @@ +name: Release portable Linux PyTorch Wheels + +on: + workflow_call: + inputs: + amdgpu_family: + required: true + type: string + release_type: + description: The type of release to build ("dev", "nightly", or "prerelease"). All developer-triggered jobs should use "dev"! + type: string + default: "dev" + s3_subdir: + description: S3 subdirectory, not including the GPU-family + type: string + default: "v2" + s3_staging_subdir: + description: Staging subdirectory to push the wheels for test + type: string + default: "v2-staging" + cloudfront_url: + description: CloudFront URL pointing to Python index + required: true + type: string + cloudfront_staging_url: + description: CloudFront base URL pointing to staging Python index + required: true + type: string + rocm_version: + description: ROCm version to pip install (e.g. "7.10.0a20251124") + type: string + ref: + description: "Branch, tag or SHA to checkout. Defaults to the reference or SHA that triggered the workflow." + type: string + workflow_dispatch: + inputs: + amdgpu_family: + type: choice + options: + - gfx101X-dgpu + - gfx103X-dgpu + - gfx110X-all + - gfx1150 + - gfx1151 + - gfx120X-all + - gfx90X-dcgpu + - gfx94X-dcgpu + - gfx950-dcgpu + default: gfx94X-dcgpu + release_type: + description: The type of release to build ("dev", "nightly", or "prerelease"). All developer-triggered jobs should use "dev"! + type: string + default: "dev" + s3_subdir: + description: S3 subdirectory, not including the GPU-family + type: string + default: "v2" + s3_staging_subdir: + description: "Staging subdirectory to push the wheels for test" + type: string + default: "v2-staging" + cloudfront_url: + description: CloudFront URL pointing to Python index + type: string + default: "https://rocm.devreleases.amd.com/v2" + cloudfront_staging_url: + description: CloudFront base URL pointing to staging Python index + type: string + default: "https://rocm.devreleases.amd.com/v2-staging" + rocm_version: + description: ROCm version to pip install (e.g. "7.10.0a20251124") + type: string + ref: + description: "Branch, tag or SHA to checkout. Defaults to the reference or SHA that triggered the workflow." 
+ type: string + default: '' + +permissions: + id-token: write + contents: read + +run-name: Release portable Linux PyTorch Wheels (${{ inputs.amdgpu_family }}, ${{ inputs.release_type }}, ${{ inputs.rocm_version }}) + +jobs: + release: + name: Release | ${{ inputs.amdgpu_family }} | py ${{ matrix.python_version }} | torch ${{ matrix.pytorch_git_ref }} + strategy: + fail-fast: false + matrix: + python_version: ["3.11", "3.12", "3.13"] + pytorch_git_ref: ["release/2.7", "release/2.8", "release/2.9", "nightly"] + include: + - pytorch_git_ref: release/2.7 + pytorch_patchset: rocm_2.7 + - pytorch_git_ref: release/2.8 + pytorch_patchset: rocm_2.8 + - pytorch_git_ref: release/2.9 + pytorch_patchset: rocm_2.9 + - pytorch_git_ref: nightly + pytorch_patchset: nightly + + uses: ./.github/workflows/build_portable_linux_pytorch_wheels.yml + with: + amdgpu_family: ${{ inputs.amdgpu_family }} + python_version: ${{ matrix.python_version }} + release_type: ${{ inputs.release_type }} + s3_subdir: ${{ inputs.s3_subdir }} + s3_staging_subdir: ${{ inputs.s3_staging_subdir }} + cloudfront_url: ${{ inputs.cloudfront_url }} + cloudfront_staging_url: ${{ inputs.cloudfront_staging_url }} + rocm_version: ${{ inputs.rocm_version }} + pytorch_git_ref: ${{ matrix.pytorch_git_ref }} + pytorch_patchset: ${{ matrix.pytorch_patchset }} + ref: ${{ inputs.ref || '' }} diff --git a/.github/workflows/release_windows_packages.yml b/.github/workflows/release_windows_packages.yml new file mode 100644 index 0000000000000..4c456b4d6489d --- /dev/null +++ b/.github/workflows/release_windows_packages.yml @@ -0,0 +1,360 @@ +name: Release Windows packages + +on: + # Trigger from another workflow (typically to build dev packages and then test them) + workflow_call: + inputs: + release_type: + description: The type of release to build ("dev", "nightly", or "prerelease"). All developer-triggered jobs should use "dev"! + type: string + default: "dev" + package_suffix: + type: string + s3_subdir: + description: "Subdirectory to push the Python packages" + type: string + default: "v2" + s3_staging_subdir: + description: "Staging subdirectory to push the packages" + type: string + default: "v2-staging" + families: + description: "Comma separated list of AMD GPU families, e.g. `gfx94X,gfx103x`, or empty for the default list" + type: string + prerelease_version: + description: "(Optional) Number of the prerelease" + type: string + repository: + description: "Repository to checkout. Otherwise, defaults to `github.repository`." + type: string + ref: + description: "Branch, tag or SHA to checkout. Defaults to the reference or SHA that triggered the workflow." + type: string + # Trigger manually (typically to test the workflow or manually build a release [candidate]) + workflow_dispatch: + inputs: + release_type: + description: The type of release to build ("dev", "nightly", or "prerelease"). All developer-triggered jobs should use "dev"! + type: string + default: "dev" + package_suffix: + type: string + s3_subdir: + description: "Subdirectory to push the Python packages" + type: string + default: "v2" + s3_staging_subdir: + description: "Staging subdirectory to push the packages" + type: string + default: "v2-staging" + families: + description: "A comma separated list of AMD GPU families, e.g. 
`gfx94X,gfx103x`, or empty for the default list" + type: string + prerelease_version: + description: "(Optional) Number of the prerelease" + type: string + extra_cmake_options: + description: "Extra options to pass to the CMake configure command" + type: string + + # Trigger on a schedule to build nightly release candidates. + schedule: + # Runs at 04:00 AM UTC, which is 8:00 PM PST (UTC-8) + - cron: '0 04 * * *' + +permissions: + contents: read + +run-name: Release Windows packages (${{ inputs.families || 'default' }}, ${{ inputs.release_type || 'nightly' }}) + +jobs: + setup_metadata: + if: ${{ github.repository_owner == 'ROCm' || github.event_name != 'schedule' }} + runs-on: ubuntu-24.04 + env: + release_type: ${{ inputs.release_type || 'nightly' }} + outputs: + version: ${{ steps.rocm_package_version.outputs.rocm_package_version }} + release_type: ${{ env.release_type }} + package_targets: ${{ steps.configure.outputs.package_targets }} + cloudfront_url: ${{ steps.release_information.outputs.cloudfront_url }} + cloudfront_staging_url: ${{ steps.release_information.outputs.cloudfront_staging_url }} + s3_subdir_tar: ${{ steps.release_information.outputs.s3_subdir_tar }} + steps: + - name: Checkout repository + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + repository: ${{ inputs.repository || github.repository }} + ref: ${{ inputs.ref || '' }} + + - name: Setup Python + uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 + with: + python-version: 3.12 + + - name: Compute package version + id: rocm_package_version + run: | + python ./build_tools/compute_rocm_package_version.py \ + --release-type=${{ env.release_type }} \ + --prerelease-version=${{ inputs.prerelease_version }} + + - name: Set variables for nightly release + if: ${{ env.release_type == 'nightly' }} + run: | + echo "tmp_cloudfront_url=https://rocm.nightlies.amd.com/v2" >> $GITHUB_ENV + echo "tmp_cloudfront_staging_url=https://rocm.nightlies.amd.com/v2-staging" >> $GITHUB_ENV + echo "tmp_s3_subdir_tar=''" >> $GITHUB_ENV + + - name: Set variables for prerelease + if: ${{ env.release_type == 'prerelease' }} + run: | + echo "tmp_cloudfront_url=https://rocm.prereleases.amd.com/whl" >> $GITHUB_ENV + echo "tmp_cloudfront_staging_url=https://rocm.prereleases.amd.com/whl-staging" >> $GITHUB_ENV + echo "tmp_s3_subdir_tar=v3/tarball/" >> $GITHUB_ENV + + - name: Set variables for development release + if: ${{ env.release_type == 'dev' }} + run: | + echo "tmp_cloudfront_url=https://rocm.devreleases.amd.com/v2" >> $GITHUB_ENV + echo "tmp_cloudfront_staging_url=https://rocm.devreleases.amd.com/v2-staging" >> $GITHUB_ENV + echo "tmp_s3_subdir_tar=''" >> $GITHUB_ENV + + - name: Generate release information + id: release_information + run: | + echo "cloudfront_url=${tmp_cloudfront_url}" >> $GITHUB_OUTPUT + echo "cloudfront_staging_url=${tmp_cloudfront_staging_url}" >> $GITHUB_OUTPUT + echo "s3_subdir_tar=${tmp_s3_subdir_tar}" >> $GITHUB_OUTPUT + + - name: Generating package target matrix + id: configure + env: + AMDGPU_FAMILIES: ${{ inputs.families }} + THEROCK_PACKAGE_PLATFORM: "windows" + # Variable comes from ROCm organization variable 'ROCM_THEROCK_TEST_RUNNERS' + ROCM_THEROCK_TEST_RUNNERS: ${{ vars.ROCM_THEROCK_TEST_RUNNERS }} + LOAD_TEST_RUNNERS_FROM_VAR: false + run: python ./build_tools/github_actions/fetch_package_targets.py + + windows_packages: + name: ${{ matrix.target_bundle.amdgpu_family }}::Build Windows + runs-on: ${{ github.repository_owner == 'ROCm' && 
'azure-windows-scale-rocm' || 'windows-2022' }} + continue-on-error: ${{ matrix.target_bundle.expect_failure == true }} # for GPU families that are flaky, we mark as xfail + timeout-minutes: 720 # 12 hour timeout + needs: [setup_metadata] + permissions: + contents: write + actions: write # Added permission to trigger workflows + id-token: write # Added permission for AWS S3 upload + defaults: + run: + shell: bash + strategy: + fail-fast: false + matrix: + target_bundle: ${{ fromJSON(needs.setup_metadata.outputs.package_targets) }} + env: + TEATIME_LABEL_GH_GROUP: 1 + BUILD_DIR: B:\build + CACHE_DIR: "${{github.workspace}}/.cache" + CCACHE_DIR: "${{github.workspace}}/.cache/ccache" + CCACHE_MAXSIZE: "4000M" + DIST_ARCHIVE: "B:/build/artifacts/therock-dist-windows-${{ matrix.target_bundle.amdgpu_family }}${{ inputs.package_suffix }}-${{ needs.setup_metadata.outputs.version }}.tar.gz" + RELEASE_TYPE: "${{ needs.setup_metadata.outputs.release_type }}" + S3_BUCKET_TAR: "therock-${{ needs.setup_metadata.outputs.release_type }}-tarball" + S3_SUBDIR_TAR: ${{ needs.setup_metadata.outputs.s3_subdir_tar }} + S3_BUCKET_PY: "therock-${{ needs.setup_metadata.outputs.release_type }}-python" + S3_SUBDIR: ${{ inputs.s3_subdir || 'v2' }} + S3_STAGING_SUBDIR: ${{ inputs.s3_staging_subdir || 'v2-staging' }} + + steps: + - name: "Checking out repository" + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + repository: ${{ inputs.repository || github.repository }} + ref: ${{ inputs.ref || '' }} + + - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 + with: + python-version: 3.12 + + - name: Install python deps + run: | + pip install -r requirements.txt + + # TODO(amd-justchen): share with build_windows_artifacts.yml. Include in VM image? Dockerfile? + - name: Install requirements + run: | + choco install --no-progress -y ccache + # ninja pinned due to a bug in the 1.13.0 release: + # https://github.com/ninja-build/ninja/issues/2616 + choco install --no-progress -y ninja --version 1.12.1 + choco install --no-progress -y strawberryperl + echo "$PATH;C:\Strawberry\c\bin" >> $GITHUB_PATH + choco install --no-progress -y awscli + choco install --no-progress -y pkgconfiglite + echo "$PATH;C:\Program Files\Amazon\AWSCLIV2" >> $GITHUB_PATH + + - uses: iterative/setup-dvc@4bdfd2b0f6f1ad7e08afadb03b1a895c352a5239 # v2.0.0 + with: + version: '3.62.0' + + # After other installs, so MSVC gets priority in the PATH. + - name: Configure MSVC + uses: ilammy/msvc-dev-cmd@0b201ec74fa43914dc39ae48a89fd1d8cb592756 # v1.13.0 + + - name: Runner health status + run: | + ccache --zero-stats + python ./build_tools/health_status.py + + # TODO: We shouldn't be using a cache on actual release branches, but it + # really helps for iteration time.
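+ # (The restore-keys prefix below lets a run without an exact ${{ github.sha }} match fall back to the most recent cache for the same GPU family.)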
+ - name: Enable cache + uses: actions/cache/restore@0057852bfaa89a56745cba8c7296529d2fc39830 # v4.3.0 + with: + path: ${{ env.CACHE_DIR }} + key: windows-package-matrix-v1-${{ matrix.target_bundle.amdgpu_family }}-${{ github.sha }} + restore-keys: | + windows-package-matrix-v1-${{ matrix.target_bundle.amdgpu_family }}- + + - name: Fetch sources + timeout-minutes: 30 + run: | + git config fetch.parallel 10 + git config --global core.symlinks true + git config --global core.longpaths true + python ./build_tools/fetch_sources.py --jobs 12 + + - name: Configure Projects + env: + amdgpu_families: ${{ matrix.target_bundle.amdgpu_family }} + package_version: "ADHOCBUILD" + extra_cmake_options: ${{ inputs.extra_cmake_options }} + run: | + # clear cache before build and after download + ccache -z + + python3 build_tools/github_actions/build_configure.py + + - name: Build therock-dist + run: cmake --build "${{ env.BUILD_DIR }}" --target therock-dist + + - name: Build therock-archives + run: cmake --build "${{ env.BUILD_DIR }}" --target therock-archives + + - name: Compress dist folder + run: | + cd ${{ env.BUILD_DIR }}/dist/rocm + echo "Compressing ${{ env.DIST_ARCHIVE }}" + tar cfz "${{ env.DIST_ARCHIVE }}" --force-local . + + - name: Build Python Packages + run: | + python ./build_tools/build_python_packages.py \ + --artifact-dir=${{ env.BUILD_DIR }}/artifacts \ + --dest-dir=${{ env.BUILD_DIR }}/packages \ + --version=${{ needs.setup_metadata.outputs.version }} + + - name: Build report + if: ${{ !cancelled() }} + shell: bash + run: | + if [ -d "${{ env.BUILD_DIR }}" ]; then + echo "Build dir:" + echo "------------" + ls -lh "${{ env.BUILD_DIR }}" + echo "CCache Stats:" + echo "-------------" + ccache -s + else + echo "[ERROR] Build directory ${{ env.BUILD_DIR }} does not exist. Skipping report!" + echo " This should only happen if the CI is cancelled before the build step." + exit 1 # Stop the CI as build did not happen + fi + + - name: Configure AWS Credentials + if: ${{ github.repository_owner == 'ROCm' && !cancelled() }} + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-${{ env.RELEASE_TYPE }} + special-characters-workaround: true + + - name: Post Build Upload + if: ${{ github.repository_owner == 'ROCm' && !cancelled() }} + run: | + python3 build_tools/github_actions/post_build_upload.py \ + --run-id ${{ github.run_id }} \ + --artifact-group "${{ matrix.target_bundle.amdgpu_family }}" \ + --build-dir ${{ env.BUILD_DIR }} \ + --upload \ + --job-status ${{ job.status }} + + - name: Upload Releases to staging S3 + if: ${{ github.repository_owner == 'ROCm' }} + run: | + aws s3 cp ${{ env.BUILD_DIR }}/packages/dist/ s3://${{ env.S3_BUCKET_PY }}/${{ env.S3_STAGING_SUBDIR }}/${{ matrix.target_bundle.amdgpu_family }}/ \ + --recursive --no-follow-symlinks \ + --exclude "*" \ + --include "*.whl" \ + --include "*.tar.gz" + + - name: (Re-)Generate Python package release index for staging + if: ${{ github.repository_owner == 'ROCm' }} + env: + # Environment variable to be set for `manage.py` + CUSTOM_PREFIX: "${{ env.S3_STAGING_SUBDIR }}/${{ matrix.target_bundle.amdgpu_family }}" + run: | + pip install boto3 packaging + python ./build_tools/third_party/s3_management/manage.py ${{ env.CUSTOM_PREFIX }} + + ## TODO: Restrict uploading to the non-staging S3 directory until sanity checks and all validation tests have successfully passed. 
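+ # (Same staging-then-release flow as the portable Linux workflow: wheels go to S3_STAGING_SUBDIR first, then the same set is mirrored to S3_SUBDIR below.)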
+ - name: Upload Releases to S3 + if: ${{ github.repository_owner == 'ROCm' }} + run: | + aws s3 cp ${{ env.DIST_ARCHIVE }} s3://${{ env.S3_BUCKET_TAR }}/${{ env.S3_SUBDIR_TAR }} + aws s3 cp ${{ env.BUILD_DIR }}/packages/dist/ s3://${{ env.S3_BUCKET_PY }}/${{ env.S3_SUBDIR }}/${{ matrix.target_bundle.amdgpu_family }}/ \ + --recursive --no-follow-symlinks \ + --exclude "*" \ + --include "*.whl" \ + --include "*.tar.gz" + + # TODO(marbre): guard against race conditions where multiple workflows update the index at the same time? + # Moving the index computation server-side could help + - name: (Re-)Generate release index pages + if: ${{ github.repository_owner == 'ROCm' }} + env: + # Environment variable to be set for `manage.py` + CUSTOM_PREFIX: "${{ env.S3_SUBDIR }}/${{ matrix.target_bundle.amdgpu_family }}" + run: | + pip install boto3 packaging + python ./build_tools/third_party/s3_management/manage.py ${{ env.CUSTOM_PREFIX }} + python ./build_tools/index_generation_s3_tar.py \ + --bucket ${{ env.S3_BUCKET_TAR }} \ + --directory ${{ env.S3_SUBDIR_TAR }} \ + --upload + + - name: Trigger building PyTorch wheels + if: ${{ github.repository_owner == 'ROCm' && matrix.target_bundle.expect_pytorch_failure == false }} + uses: benc-uk/workflow-dispatch@e2e5e9a103e331dad343f381a29e654aea3cf8fc # v1.2.4 + with: + workflow: release_windows_pytorch_wheels.yml + inputs: | + { "amdgpu_family": "${{ matrix.target_bundle.amdgpu_family }}", + "release_type": "${{ env.RELEASE_TYPE }}", + "s3_subdir": "${{ env.S3_SUBDIR }}", + "s3_staging_subdir": "${{ env.S3_STAGING_SUBDIR }}", + "cloudfront_url": "${{ needs.setup_metadata.outputs.cloudfront_url }}", + "cloudfront_staging_url": "${{ needs.setup_metadata.outputs.cloudfront_staging_url }}", + "rocm_version": "${{ needs.setup_metadata.outputs.version }}", + "ref": "${{ inputs.ref || '' }}" + } + + - name: Save cache + uses: actions/cache/save@0057852bfaa89a56745cba8c7296529d2fc39830 # v4.3.0 + if: ${{ !cancelled() }} + with: + path: ${{ env.CACHE_DIR }} + key: windows-package-matrix-v1-${{ matrix.target_bundle.amdgpu_family }}-${{ github.sha }} diff --git a/.github/workflows/release_windows_pytorch_wheels.yml b/.github/workflows/release_windows_pytorch_wheels.yml new file mode 100644 index 0000000000000..85e0f6b88da81 --- /dev/null +++ b/.github/workflows/release_windows_pytorch_wheels.yml @@ -0,0 +1,110 @@ +name: Release Windows PyTorch Wheels + +on: + workflow_call: + inputs: + amdgpu_family: + required: true + type: string + release_type: + description: The type of release to build ("dev", "nightly", or "prerelease"). All developer-triggered jobs should use "dev"! + type: string + default: "dev" + s3_subdir: + description: S3 subdirectory, not including the GPU-family + type: string + default: "v2" + s3_staging_subdir: + description: Staging subdirectory to push the wheels for test + type: string + default: "v2-staging" + cloudfront_url: + description: CloudFront URL pointing to Python index + type: string + default: "https://rocm.devreleases.amd.com/v2" + cloudfront_staging_url: + description: CloudFront base URL pointing to staging Python index + required: true + type: string + rocm_version: + description: ROCm version to pip install (e.g. "7.10.0a20251124") + type: string + ref: + description: "Branch, tag or SHA to checkout. Defaults to the reference or SHA that triggered the workflow." 
+
+      - name: Trigger building PyTorch wheels
+        if: ${{ github.repository_owner == 'ROCm' && matrix.target_bundle.expect_pytorch_failure == false }}
+        uses: benc-uk/workflow-dispatch@e2e5e9a103e331dad343f381a29e654aea3cf8fc # v1.2.4
+        with:
+          workflow: release_windows_pytorch_wheels.yml
+          inputs: |
+            { "amdgpu_family": "${{ matrix.target_bundle.amdgpu_family }}",
+              "release_type": "${{ env.RELEASE_TYPE }}",
+              "s3_subdir": "${{ env.S3_SUBDIR }}",
+              "s3_staging_subdir": "${{ env.S3_STAGING_SUBDIR }}",
+              "cloudfront_url": "${{ needs.setup_metadata.outputs.cloudfront_url }}",
+              "cloudfront_staging_url": "${{ needs.setup_metadata.outputs.cloudfront_staging_url }}",
+              "rocm_version": "${{ needs.setup_metadata.outputs.version }}",
+              "ref": "${{ inputs.ref || '' }}"
+            }
+
+      - name: Save cache
+        uses: actions/cache/save@0057852bfaa89a56745cba8c7296529d2fc39830 # v4.3.0
+        if: ${{ !cancelled() }}
+        with:
+          path: ${{ env.CACHE_DIR }}
+          key: windows-package-matrix-v1-${{ matrix.target_bundle.amdgpu_family }}-${{ github.sha }}
diff --git a/.github/workflows/release_windows_pytorch_wheels.yml b/.github/workflows/release_windows_pytorch_wheels.yml
new file mode 100644
index 0000000000000..85e0f6b88da81
--- /dev/null
+++ b/.github/workflows/release_windows_pytorch_wheels.yml
@@ -0,0 +1,110 @@
+name: Release Windows PyTorch Wheels
+
+on:
+  workflow_call:
+    inputs:
+      amdgpu_family:
+        required: true
+        type: string
+      release_type:
+        description: The type of release to build ("dev", "nightly", or "prerelease"). All developer-triggered jobs should use "dev"!
+        type: string
+        default: "dev"
+      s3_subdir:
+        description: S3 subdirectory, not including the GPU family
+        type: string
+        default: "v2"
+      s3_staging_subdir:
+        description: Staging subdirectory to push the wheels to for testing
+        type: string
+        default: "v2-staging"
+      cloudfront_url:
+        description: CloudFront URL pointing to the Python package index
+        type: string
+        default: "https://rocm.devreleases.amd.com/v2"
+      cloudfront_staging_url:
+        description: CloudFront base URL pointing to the staging Python package index
+        required: true
+        type: string
+      rocm_version:
+        description: ROCm version to pip install (e.g. "7.10.0a20251124")
+        type: string
+      ref:
+        description: "Branch, tag or SHA to checkout. Defaults to the reference or SHA that triggered the workflow."
+        type: string
+  workflow_dispatch:
+    inputs:
+      amdgpu_family:
+        type: choice
+        options:
+          - gfx101X-dgpu
+          - gfx103X-dgpu
+          - gfx110X-all
+          - gfx1150
+          - gfx1151
+          - gfx120X-all
+          - gfx90X-dcgpu
+          - gfx94X-dcgpu
+          - gfx950-dcgpu
+        default: gfx1151
+      release_type:
+        description: The type of release to build ("dev", "nightly", or "prerelease"). All developer-triggered jobs should use "dev"!
+        type: string
+        default: "dev"
+      s3_subdir:
+        description: S3 subdirectory, not including the GPU family
+        type: string
+        default: "v2"
+      s3_staging_subdir:
+        description: Staging subdirectory to push the wheels to for testing
+        type: string
+        default: "v2-staging"
+      cloudfront_url:
+        description: CloudFront URL pointing to the Python package index
+        type: string
+        default: "https://rocm.devreleases.amd.com/v2"
+      cloudfront_staging_url:
+        description: CloudFront base URL pointing to the staging Python package index
+        type: string
+        default: "https://rocm.devreleases.amd.com/v2-staging"
+      rocm_version:
+        description: ROCm version to pip install (e.g. "7.10.0a20251124")
+        type: string
+      ref:
+        description: "Branch, tag or SHA to checkout. Defaults to the reference or SHA that triggered the workflow."
+        type: string
+        default: ''
+
+permissions:
+  id-token: write
+  contents: read
+
+run-name: Release Windows PyTorch Wheels (${{ inputs.amdgpu_family }}, ${{ inputs.release_type }}, ${{ inputs.rocm_version }})
+
+jobs:
+  release:
+    name: Release | ${{ inputs.amdgpu_family }} | py ${{ matrix.python_version }} | torch ${{ matrix.pytorch_git_ref }}
+    strategy:
+      fail-fast: false
+      matrix:
+        python_version: ["3.11", "3.12", "3.13"]
+        pytorch_git_ref: ["release/2.9", "nightly"]
+        include:
+          - pytorch_git_ref: release/2.9
+            pytorch_patchset: rocm_2.9
+          - pytorch_git_ref: nightly
+            pytorch_patchset: nightly
+    uses: ./.github/workflows/build_windows_pytorch_wheels.yml
+    with:
+      amdgpu_family: ${{ inputs.amdgpu_family }}
+      python_version: ${{ matrix.python_version }}
+      release_type: ${{ inputs.release_type }}
+      s3_subdir: ${{ inputs.s3_subdir }}
+      s3_staging_subdir: ${{ inputs.s3_staging_subdir }}
+      cloudfront_url: ${{ inputs.cloudfront_url }}
+      cloudfront_staging_url: ${{ inputs.cloudfront_staging_url }}
+      rocm_version: ${{ inputs.rocm_version }}
+      pytorch_git_ref: ${{ matrix.pytorch_git_ref }}
+      pytorch_patchset: ${{ matrix.pytorch_patchset }}
+      ref: ${{ inputs.ref || '' }}
diff --git a/.github/workflows/setup.yml b/.github/workflows/setup.yml
new file mode 100644
index 0000000000000..e2e094a80a12c
--- /dev/null
+++ b/.github/workflows/setup.yml
@@ -0,0 +1,92 @@
+name: Setup
+
+on:
+  workflow_call:
+    inputs:
+      build_variant:
+        type: string
+        default: "release"
+      multi_arch:
+        type: boolean
+        default: false
+        description: "If true, group all families into one entry per build_variant instead of expanding the cross-product"
+    outputs:
+      enable_build_jobs:
+        description: Whether to enable build jobs.
+        value: ${{ jobs.setup.outputs.enable_build_jobs }}
+      linux_variants:
+        description: Matrix variants to run on Linux.
+        value: ${{ jobs.setup.outputs.linux_variants }}
+      linux_test_labels:
+        description: ROCm projects to run Linux tests on. Optional filter.
+        value: ${{ jobs.setup.outputs.linux_test_labels }}
+      windows_variants:
+        description: Matrix variants to run on Windows.
+        value: ${{ jobs.setup.outputs.windows_variants }}
+      test_type:
+        description: The test type to run for component tests (e.g. smoke, full).
+        value: ${{ jobs.setup.outputs.test_type }}
+      windows_test_labels:
+        description: ROCm projects to run Windows tests on. Optional filter.
+        value: ${{ jobs.setup.outputs.windows_test_labels }}
+      rocm_package_version:
+        description: ROCm package version (primarily for Python packages).
+        value: ${{ jobs.setup.outputs.rocm_package_version }}
+
+permissions:
+  contents: read
+
+jobs:
+  setup:
+    runs-on: ubuntu-24.04
+    env:
+      # The commit being checked out is the merge commit for a PR. Its first
+      # parent will be the tip of the base branch.
+      BASE_REF: HEAD^
+    outputs:
+      enable_build_jobs: ${{ steps.configure.outputs.enable_build_jobs }}
+      linux_variants: ${{ steps.configure.outputs.linux_variants }}
+      linux_test_labels: ${{ steps.configure.outputs.linux_test_labels }}
+      windows_variants: ${{ steps.configure.outputs.windows_variants }}
+      test_type: ${{ steps.configure.outputs.test_type }}
+      windows_test_labels: ${{ steps.configure.outputs.windows_test_labels }}
+      rocm_package_version: ${{ steps.rocm_package_version.outputs.rocm_package_version }}
+    steps:
+      - name: Checkout TheRock repository
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        with:
+          repository: "ROCm/TheRock"
+          fetch-depth: 10
+
+      - name: SHA of TheRock
+        run: |
+          git rev-parse HEAD
+          git log -1
+
+      - name: Set PR_LABELS variable with labels assigned to pull request
+        if: ${{ github.event.pull_request }} # only set PR labels var if this is a pull request
+        env:
+          GITHUB_TOKEN: ${{ github.token }}
+          PR_NUMBER: ${{ github.event.number }}
+        run: |
+          echo "PR_LABELS=$(gh pr view ${PR_NUMBER} --json labels)" >> $GITHUB_ENV
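+
+      # BASE_REF above lets change detection diff the PR against its base
+      # branch (fetch-depth: 10 makes HEAD^ available). A sketch of the
+      # underlying git invocation (illustrative only; configure_ci.py's actual
+      # logic may differ):
+      #
+      #   git diff --name-only "${BASE_REF}" HEAD   # files changed by this PR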
matrix" + # if there is a test machine available + if: ${{ inputs.test_runs_on != '' }} + runs-on: ${{ inputs.test_runs_on }} + outputs: + components: ${{ steps.configure.outputs.components }} + platform: ${{ steps.configure.outputs.platform }} + shard_arr: ${{ steps.configure.outputs.shard_arr }} + steps: + - name: "Fetch 'build_tools' from repository" + if: ${{ runner.os == 'Windows' }} + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + repository: "ROCm/TheRock" + sparse-checkout: build_tools + path: "prejob" + + # Checkout failure is possible on Windows, as it's the first job on a GPU test runner. + # Post-job cleanup isn't necessary since no executables are launched in this job. + - name: Pre-job cleanup processes on Windows + if: ${{ runner.os == 'Windows' }} + shell: powershell + run: . '${{ github.workspace }}\prejob\build_tools\github_actions\cleanup_processes.ps1' + + - name: "Checking out repository" + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + repository: "ROCm/TheRock" + + - name: Setting up Python + uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 + with: + python-version: 3.12 + + - name: "Configuring CI options" + id: configure + env: + ARTIFACT_GROUP: ${{ inputs.artifact_group }} + AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }} + TEST_TYPE: ${{ inputs.test_type }} + TEST_LABELS: ${{ inputs.test_labels }} + run: python ./build_tools/github_actions/fetch_test_configurations.py + + test_sanity_check: + name: 'Test Sanity Check' + needs: configure_test_matrix + uses: './.github/workflows/test_sanity_check.yml' + with: + artifact_group: ${{ inputs.artifact_group }} + artifact_run_id: ${{ inputs.artifact_run_id }} + amdgpu_families: ${{ inputs.amdgpu_families }} + test_runs_on: ${{ inputs.test_runs_on }} + platform: ${{ needs.configure_test_matrix.outputs.platform }} + + test_components: + name: 'Test ${{ matrix.components.job_name }}' + needs: [test_sanity_check, configure_test_matrix] + # skip tests if no test matrix to run and sanity check only requested + if: ${{ needs.configure_test_matrix.outputs.components != '[]' && !inputs.sanity_check_only_for_family }} + strategy: + fail-fast: false + matrix: + components: ${{ fromJSON(needs.configure_test_matrix.outputs.components) }} + uses: './.github/workflows/test_component.yml' + with: + artifact_run_id: ${{ inputs.artifact_run_id }} + artifact_group: ${{ inputs.artifact_group }} + amdgpu_families: ${{ inputs.amdgpu_families }} + test_runs_on: ${{ inputs.test_runs_on }} + platform: ${{ needs.configure_test_matrix.outputs.platform }} + component: ${{ toJSON(matrix.components) }} diff --git a/.github/workflows/test_component.yml b/.github/workflows/test_component.yml new file mode 100644 index 0000000000000..7d6796f035425 --- /dev/null +++ b/.github/workflows/test_component.yml @@ -0,0 +1,107 @@ +name: Test component + +on: + workflow_call: + inputs: + artifact_run_id: + type: string + default: "" + artifact_group: + type: string + amdgpu_families: + type: string + test_runs_on: + type: string + platform: + type: string + component: + type: string + +permissions: + contents: read + +jobs: + test_component: + name: 'Test ${{ fromJSON(inputs.component).job_name }} (shard ${{ matrix.shard }} of ${{ fromJSON(inputs.component).total_shards }})' + runs-on: ${{ inputs.test_runs_on }} + timeout-minutes: 210 + container: + image: ${{ inputs.platform == 'linux' && 
+      image: ${{ inputs.platform == 'linux' && 'ghcr.io/rocm/no_rocm_image_ubuntu24_04@sha256:4150afe4759d14822f0e3f8930e1124f26e11f68b5c7b91ec9a02b20b1ebbb98' || null }}
+      options: --ipc host
+        --group-add video
+        --device /dev/kfd
+        --device /dev/dri
+        --group-add 110
+        --env-file /etc/podinfo/gha-gpu-isolation-settings
+        --user 0:0 # Running as root, by recommendation of GitHub: https://docs.github.com/en/actions/reference/workflows-and-actions/dockerfile-support#user
+    strategy:
+      fail-fast: false
+      matrix:
+        # The shard array is based on "total_shards" from "fetch_test_configurations.py".
+        # The test executable will shard based on the array. (ex: [1, 2, 3, 4] = four test shards)
+        shard: ${{ fromJSON(inputs.component).shard_arr }}
+    defaults:
+      run:
+        shell: bash
+    env:
+      VENV_DIR: ${{ github.workspace }}/.venv
+      ARTIFACT_RUN_ID: "${{ inputs.artifact_run_id != '' && inputs.artifact_run_id || github.run_id }}"
+      OUTPUT_ARTIFACTS_DIR: "./build"
+      THEROCK_BIN_DIR: "./build/bin"
+      AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }}
+    steps:
+      - name: "Fetch 'build_tools' from repository"
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+        with:
+          sparse-checkout: build_tools
+          path: "prejob"
+
+      - name: Pre-job cleanup processes on Windows
+        if: ${{ runner.os == 'Windows' }}
+        shell: powershell
+        run: . '${{ github.workspace }}\prejob\build_tools\github_actions\cleanup_processes.ps1'
+
+      - name: Checkout Repository
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+        with:
+          repository: "ROCm/TheRock"
+
+      - name: Run setup test environment workflow
+        uses: './.github/actions/setup_test_environment'
+        with:
+          ARTIFACT_RUN_ID: ${{ env.ARTIFACT_RUN_ID }}
+          ARTIFACT_GROUP: ${{ inputs.artifact_group }}
+          OUTPUT_ARTIFACTS_DIR: ${{ env.OUTPUT_ARTIFACTS_DIR }}
+          VENV_DIR: ${{ env.VENV_DIR }}
+          FETCH_ARTIFACT_ARGS: ${{ fromJSON(inputs.component).fetch_artifact_args }}
+          IS_PR_FROM_FORK: ${{ github.event.pull_request.head.repo.fork }}
+
+      # safe.directory must be set before Runner Health Status.
+      - name: Adjust git config
+        run: |
+          git config --global --add safe.directory $PWD
+          git config fetch.parallel 10
+
+      - name: Runner health status
+        run: |
+          python ./build_tools/health_status.py
+
+      - name: Driver / GPU sanity check
+        run: |
+          python ./build_tools/print_driver_gpu_info.py
+
+      - name: Test
+        timeout-minutes: ${{ fromJSON(inputs.component).timeout_minutes }}
+        env:
+          SHARD_INDEX: ${{ matrix.shard }}
+          TOTAL_SHARDS: ${{ fromJSON(inputs.component).total_shards }}
+          TEST_TYPE: ${{ fromJSON(inputs.component).test_type }}
+        run: |
+          ${{ fromJSON(inputs.component).test_script }}
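+
+      # The Test step above passes SHARD_INDEX (1-based) and TOTAL_SHARDS to
+      # the component's test_script, which is expected to run only its slice of
+      # the suite. A minimal sketch of the pattern, assuming a pytest-based
+      # script with the pytest-shard plugin (hypothetical; each component
+      # decides how it actually shards):
+      #
+      #   pytest tests/ --num-shards="${TOTAL_SHARDS}" \
+      #     --shard-id="$((SHARD_INDEX - 1))"   # the plugin uses 0-based ids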
+
+      # GitHub's 'Complete job' step is unaware of launched executables
+      # and will fail to clean up orphan processes.
+      - name: Post-job cleanup processes on Windows
+        if: ${{ always() && runner.os == 'Windows' }}
+        shell: powershell
+        run: . '${{ github.workspace }}\build_tools\github_actions\cleanup_processes.ps1'
diff --git a/.github/workflows/test_jax_dockerfile.yml b/.github/workflows/test_jax_dockerfile.yml
new file mode 100644
index 0000000000000..a577dbe5e4ef0
--- /dev/null
+++ b/.github/workflows/test_jax_dockerfile.yml
@@ -0,0 +1,54 @@
+name: Test JAX Wheels
+
+on:
+  workflow_dispatch:
+    inputs:
+      test_runs_on:
+        required: true
+        type: string
+        default: "linux-mi325-1gpu-ossci-rocm-frac"
+      image_name:
+        required: true
+        description: JAX docker image to run tests with
+        type: string
+      jax_version:
+        description: Version of JAX to install instead of the one on the docker image
+        required: false
+        type: string
+      jax_plugin_branch:
+        required: true
+        description: JAX plugin branch to check out to use for test scripts
+        type: string
+        default: "rocm-jaxlib-v0.6.0"
+
+  workflow_call:
+    inputs:
+      test_runs_on:
+        required: true
+        type: string
+      image_name:
+        required: true
+        description: JAX docker image to run tests with
+        type: string
+      jax_version:
+        description: Version of JAX to install instead of the one on the docker image
+        required: false
+        type: string
+      jax_plugin_branch:
+        description: JAX plugin branch to check out to use for test scripts
+        type: string
+        default: "rocm-jaxlib-v0.8.0"
+
+permissions:
+  contents: read
+
+jobs:
+  test_wheels:
+    name: Test
+    runs-on: ${{ inputs.test_runs_on }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+        with:
+          repository: rocm/rocm-jax
+      # TODO: Add steps for creating the JAX docker image with an install of TheRock and then running JAX tests on the container.
diff --git a/.github/workflows/test_linux_jax_wheels.yml b/.github/workflows/test_linux_jax_wheels.yml
new file mode 100644
index 0000000000000..00823960f1b0d
--- /dev/null
+++ b/.github/workflows/test_linux_jax_wheels.yml
@@ -0,0 +1,203 @@
+name: Test Linux JAX Wheels
+
+on:
+  workflow_call:
+    inputs:
+      amdgpu_family:
+        required: true
+        type: string
+      release_type:
+        required: true
+        type: string
+      s3_subdir:
+        required: true
+        type: string
+      package_index_url:
+        description: Base CloudFront URL for the Python package index
+        required: true
+        type: string
+      rocm_version:
+        description: ROCm version (optional, informational)
+        required: false
+        type: string
+      tar_url:
+        description: URL to TheRock tarball to configure ROCm
+        required: true
+        type: string
+      python_version:
+        description: Python version(s) to test (e.g., "3.12")
+        required: true
+        type: string
+      repository:
+        description: "Repository to checkout. Otherwise, defaults to `github.repository`."
+        type: string
+      jax_ref:
+        description: rocm-jax repository ref/branch to check out
+        required: false
+        type: string
+      ref:
+        description: "Branch, tag or SHA to checkout. Defaults to the reference or SHA that triggered the workflow."
+        type: string
+      test_runs_on:
+        required: true
+        type: string
+
+  workflow_dispatch:
+    inputs:
+      amdgpu_family:
+        type: choice
+        options:
+          - gfx101X-dgpu
+          - gfx103X-dgpu
+          - gfx110X-all
+          - gfx1150
+          - gfx1151
+          - gfx120X-all
+          - gfx90X-dcgpu
+          - gfx94X-dcgpu
+          - gfx950-dcgpu
+        default: gfx94X-dcgpu
+      release_type:
+        description: The type of release ("nightly" or "dev")
+        required: true
+        type: string
+        default: dev
+      s3_subdir:
+        description: S3 subdirectory, not including the GPU family
+        required: true
+        type: string
+        default: v2
+      package_index_url:
+        description: Base CloudFront URL for the Python package index
+        required: true
+        type: string
+        default: https://rocm.nightlies.amd.com/v2-staging/
+      rocm_version:
+        description: ROCm version
+        required: false
+        type: string
+      tar_url:
+        description: URL to TheRock tarball to configure ROCm
+        required: true
+        type: string
+      python_version:
+        description: Python version(s) to test (e.g., "3.12")
+        required: true
+        type: string
+        default: "3.12"
+      jax_ref:
+        description: rocm-jax repository ref/branch to check out
+        required: false
+        type: string
+      test_runs_on:
+        description: Runner label to use. The selected runner should have a GPU supported by amdgpu_family.
+        required: true
+        type: string
+        default: "linux-mi325-1gpu-ossci-rocm-frac"
+      ref:
+        description: "Branch, tag or SHA to checkout. Defaults to the reference or SHA that triggered the workflow."
+        type: string
+
+permissions:
+  contents: read
+  packages: read
+
+jobs:
+  test_jax_wheels:
+    name: Test JAX Wheels | ${{ inputs.amdgpu_family }}
+    runs-on: ${{ inputs.test_runs_on }}
+    container:
+      image: ghcr.io/rocm/no_rocm_image_ubuntu24_04@sha256:405945a40deaff9db90b9839c0f41d4cba4a383c1a7459b28627047bf6302a26
+      options: >-
+        --device /dev/kfd
+        --device /dev/dri
+        --group-add render
+        --group-add video
+        --user root
+        --env-file /etc/podinfo/gha-gpu-isolation-settings
+    defaults:
+      run:
+        shell: bash
+    env:
+      VIRTUAL_ENV: ${{ github.workspace }}/.venv
+      AMDGPU_FAMILY: ${{ inputs.amdgpu_family }}
+      THEROCK_TAR_URL: ${{ inputs.tar_url }}
+      PYTHON_VERSION: ${{ inputs.python_version }}
+      WHEEL_INDEX_URL: ${{ inputs.package_index_url }}/${{ inputs.amdgpu_family }}
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+        with:
+          repository: ${{ inputs.repository || github.repository }}
+          ref: ${{ inputs.ref || '' }}
+
+      - name: Checkout rocm-jax (plugin + build scripts)
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+        with:
+          path: jax
+          repository: rocm/rocm-jax
+          ref: ${{ inputs.jax_ref }}
+
+      - name: Checkout JAX extended tests repo
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+        with:
+          repository: rocm/jax
+          ref: ${{ inputs.jax_ref }}
+          path: jax/jax_tests
+
+      - name: Set up Python
+        uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
+        with:
+          python-version: ${{ inputs.python_version }}
+          check-latest: true
+
+      - name: System deps, venv configure
+        run: |
+          python3 -m venv "${VIRTUAL_ENV}"
+          echo "PATH=${VIRTUAL_ENV}/bin:${PATH}" >> "$GITHUB_ENV"
+          python3 build_tools/setup_venv.py "${VIRTUAL_ENV}" --activate-in-future-github-actions-steps
+
+      - name: Install base JAX test requirements
+        run: |
+          # The venv configured above stays active in this and all later steps.
+          pip install -r external-builds/jax/requirements-jax.txt
+
+      - name: Configure ROCm from TheRock tarball
+        env:
+          ROCM_VERSION: ${{ inputs.rocm_version }}
+          AMDGPU_FAMILY: ${{ inputs.amdgpu_family }}
+        run: |
+          DEST="/opt/rocm-${{ inputs.rocm_version }}"
+          # Install directly from TheRock release buckets (nightly/dev) using the provided version.
+          python build_tools/install_rocm_from_artifacts.py \
+            --release "${{ inputs.rocm_version }}" \
+            --artifact-group "${{ inputs.amdgpu_family }}" \
+            --output-dir "${DEST}"
+
+      - name: Extract JAX version and set to GITHUB_ENV
+        run: |
+          # Extract the JAX version from requirements.txt (e.g., "jax==0.8.0"):
+          # - Remove all whitespace from requirements.txt to simplify parsing.
+          # - Match lines starting with "jax==" or "jaxlib==" followed by the
+          #   version; [^#]+ matches one or more characters that are NOT '#',
+          #   so the match stops before any inline comment, and -o prints only
+          #   the matched "jax==<version>" text.
+          # - Split on '=' and take the 3rd field to extract the version number.
+          JAX_VERSION=$(tr -d ' ' < jax/build/requirements.txt \
+            | grep -oE '^(jax|jaxlib)==[^#]+' | head -n1 | cut -d'=' -f3)
+          echo "JAX_VERSION=$JAX_VERSION" >> "$GITHUB_ENV"
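+
+      # Worked example for the step above (the requirements line is
+      # hypothetical): "jax == 0.8.0  # pinned" becomes "jax==0.8.0#pinned"
+      # after tr; grep -oE emits only the match "jax==0.8.0", and
+      # cut -d'=' -f3 yields "0.8.0" (splitting on '=' gives the fields
+      # "jax", "", "0.8.0").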
+
+      - name: Install JAX wheels from package index
+        run: |
+          # Install jaxlib/plugin/pjrt from the GPU-family index; install jax from PyPI to match the version.
+          pip install --index-url "${{ env.WHEEL_INDEX_URL }}" \
+            "jaxlib==${JAX_VERSION}+rocm${{ inputs.rocm_version }}" \
+            "jax-rocm7-plugin==${JAX_VERSION}+rocm${{ inputs.rocm_version }}" \
+            "jax-rocm7-pjrt==${JAX_VERSION}+rocm${{ inputs.rocm_version }}"
+          pip install --extra-index-url https://pypi.org/simple "jax==${JAX_VERSION}"
+
+      - name: Run JAX tests
+        run: |
+          pytest jax/jax_tests/tests/multi_device_test.py -q --log-cli-level=INFO
+          pytest jax/jax_tests/tests/core_test.py -q --log-cli-level=INFO
+          pytest jax/jax_tests/tests/util_test.py -q --log-cli-level=INFO
+          pytest jax/jax_tests/tests/scipy_stats_test.py -q --log-cli-level=INFO
diff --git a/.github/workflows/test_pytorch_wheels.yml b/.github/workflows/test_pytorch_wheels.yml
new file mode 100644
index 0000000000000..93fe73a704412
--- /dev/null
+++ b/.github/workflows/test_pytorch_wheels.yml
@@ -0,0 +1,190 @@
+name: Test PyTorch Wheels
+
+on:
+  workflow_dispatch:
+    inputs:
+      amdgpu_family:
+        description: GPU family to test
+        required: true
+        type: string
+        default: "gfx94X-dcgpu"
+      test_runs_on:
+        description: Runner label to use. The selected runner should have a GPU supported by amdgpu_family.
+        required: true
+        type: string
+        default: "linux-mi325-1gpu-ossci-rocm-frac"
+      package_index_url:
+        description: Base Python package index URL to test, typically a nightly/dev URL with a "v2" or "v2-staging" subdir (without a GPU-family subdir)
+        required: true
+        type: string
+        default: "https://rocm.nightlies.amd.com/v2"
+      python_version:
+        required: true
+        type: string
+        default: "3.12"
+      torch_version:
+        description: torch package version to install (e.g. "2.7.1+rocm7.10.0a20251120")
+        required: true
+        type: string
+      pytorch_git_ref:
+        description: PyTorch ref to checkout test sources from (e.g. "nightly", or "release/2.7")
+        type: string
+        default: "release/2.7"
+
+  workflow_call:
+    inputs:
+      amdgpu_family:
+        required: true
+        type: string
+      test_runs_on:
+        required: true
+        type: string
+      package_index_url:
+        required: true
+        type: string
+      python_version:
+        required: true
+        type: string
+      torch_version:
+        required: true
+        type: string
+      pytorch_git_ref:
+        type: string
+        default: "release/2.7"
+      repository:
+        description: "Repository to checkout. Otherwise, defaults to `github.repository`."
+        type: string
+      ref:
+        description: "Branch, tag or SHA to checkout. Defaults to the reference or SHA that triggered the workflow."
+        type: string
+
+permissions:
+  contents: read
+
+run-name: Test PyTorch (${{ inputs.amdgpu_family }}, ${{ inputs.torch_version }}, ${{ inputs.test_runs_on }})
+
+jobs:
+  test_wheels:
+    name: Test PyTorch | ${{ inputs.amdgpu_family }}
+    runs-on: ${{ inputs.test_runs_on }}
+    container:
+      image: ${{ contains(inputs.test_runs_on, 'linux') && 'ghcr.io/rocm/no_rocm_image_ubuntu24_04@sha256:405945a40deaff9db90b9839c0f41d4cba4a383c1a7459b28627047bf6302a26' || null }}
+      options: --ipc host
+        --group-add video
+        --device /dev/kfd
+        --device /dev/dri
+        --group-add 110
+        --env-file /etc/podinfo/gha-gpu-isolation-settings
+        --user 0:0 # Running as root, by recommendation of GitHub: https://docs.github.com/en/actions/reference/workflows-and-actions/dockerfile-support#user
+    defaults:
+      run:
+        shell: bash
+    env:
+      VENV_DIR: ${{ github.workspace }}/.venv
+      AMDGPU_FAMILY: ${{ inputs.amdgpu_family }}
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+        with:
+          repository: ${{ inputs.repository || github.repository }}
+          ref: ${{ inputs.ref || '' }}
+
+      - name: Set up Python
+        uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
+        with:
+          python-version: ${{ inputs.python_version }}
+
+      # TODO: also upload and reference the test report together with this logging?
+      - name: Summarize workflow inputs
+        run: |
+          python build_tools/github_actions/summarize_test_pytorch_workflow.py \
+            --torch-version=${{ inputs.torch_version }} \
+            --pytorch-git-ref=${{ inputs.pytorch_git_ref }} \
+            --index-url=${{ inputs.package_index_url }} \
+            --index-subdir=${{ inputs.amdgpu_family }}
+
+      - name: Set git options
+        run: |
+          git config --global core.longpaths true
+
+      # Here we check out the same version of PyTorch that the wheels were
+      # built from, so we have the right set of test source files. We
+      # _probably_ don't need to run HIPIFY or apply any patches, so we skip
+      # those steps to save time.
+      - name: Checkout PyTorch Source Repos from nightly branch
+        if: ${{ (inputs.pytorch_git_ref == 'nightly') }}
+        run: |
+          python external-builds/pytorch/pytorch_torch_repo.py checkout \
+            --gitrepo-origin https://github.com/pytorch/pytorch.git \
+            --repo-hashtag nightly \
+            --no-hipify --no-patch
+
+      - name: Checkout PyTorch Source Repos from stable branch
+        if: ${{ (inputs.pytorch_git_ref != 'nightly') }}
+        run: |
+          python external-builds/pytorch/pytorch_torch_repo.py checkout \
+            --gitrepo-origin https://github.com/ROCm/pytorch.git \
+            --repo-hashtag ${{ inputs.pytorch_git_ref }} \
+            --no-hipify --no-patch
+
+      - name: Set up virtual environment
+        run: |
+          python build_tools/setup_venv.py ${VENV_DIR} \
+            --packages torch==${{ inputs.torch_version }} \
+            --index-url=${{ inputs.package_index_url }} \
+            --index-subdir=${{ inputs.amdgpu_family }} \
+            --activate-in-future-github-actions-steps
+
+      - name: Install test requirements
+        run: |
+          python -m pip install -r external-builds/pytorch/requirements-test.txt
+          pip freeze
+
+      - name: Run rocm-sdk sanity tests
+        run: |
+          rocm-sdk test
+
+      - name: Run PyTorch smoketests
+        run: |
+          python ./external-builds/pytorch/run_pytorch_smoke_tests.py -- \
+            --log-cli-level=INFO \
+            -v
+
+      - name: (Linux) Run PyTorch tests
+        if: ${{ runner.os == 'Linux' }}
+        run: |
+          python ./external-builds/pytorch/run_pytorch_tests.py -- \
+            --continue-on-collection-errors \
+            --import-mode=importlib \
+            -v
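+
+      # The Windows steps below rely on run_pytorch_tests.py writing its exit
+      # code to run_pytorch_tests_exit_code.txt before the test process is
+      # killed. A sketch of the writer side (hypothetical; see
+      # run_pytorch_tests.py for the real logic):
+      #
+      #   Path("run_pytorch_tests_exit_code.txt").write_text(str(exit_code))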
+
+      # Windows testing is a recent addition and is being enabled incrementally.
+      # See https://github.com/ROCm/TheRock/issues/2258.
+      #
+      # Many tests are failing on torch 2.10+, so we limit testing to 2.9.
+      # (Obviously that's not ideal, but we need to start somewhere.)
+      #
+      # HACK: The test process does not terminate gracefully on its own, so
+      # the exit code is written to run_pytorch_tests_exit_code.txt and the
+      # process is then killed. The follow-up step reads that file and
+      # propagates the exit code normally.
+      # See https://github.com/ROCm/TheRock/issues/999.
+      - name: (Windows) Run PyTorch tests
+        if: ${{ runner.os == 'Windows' && contains(inputs.torch_version, '2.9') }}
+        continue-on-error: true
+        run: |
+          python ./external-builds/pytorch/run_pytorch_tests.py -- \
+            --continue-on-collection-errors \
+            --import-mode=importlib \
+            -v
+
+      - name: (Windows) Read and propagate exit code
+        if: ${{ runner.os == 'Windows' && contains(inputs.torch_version, '2.9') }}
+        run: |
+          if [ -f run_pytorch_tests_exit_code.txt ]; then
+            EXIT_CODE=$(cat run_pytorch_tests_exit_code.txt)
+            echo "Exit code from file: ${EXIT_CODE}"
+            exit ${EXIT_CODE}
+          else
+            echo "No run_pytorch_tests_exit_code.txt found"
+            exit 1
+          fi
diff --git a/.github/workflows/test_sanity_check.yml b/.github/workflows/test_sanity_check.yml
new file mode 100644
index 0000000000000..0098630e4894e
--- /dev/null
+++ b/.github/workflows/test_sanity_check.yml
@@ -0,0 +1,117 @@
+name: TheRock Sanity Check
+
+on:
+  workflow_dispatch:
+    inputs:
+      artifact_group:
+        type: string
+      artifact_run_id:
+        type: string
+        default: ""
+      amdgpu_families:
+        type: string
+        default: ""
+      test_runs_on:
+        type: string
+      platform:
+        type: string
+  workflow_call:
+    inputs:
+      artifact_group:
+        type: string
+      artifact_run_id:
+        type: string
+        default: ""
+      amdgpu_families:
+        type: string
+        default: ""
+      test_runs_on:
+        type: string
+      platform:
+        type: string
+  push:
+    branches:
+      - ADHOCBUILD
+
+permissions:
+  contents: read
+
+jobs:
+  test_sanity_check:
+    name: "ROCm Sanity Test"
+    runs-on: ${{ inputs.test_runs_on }}
+    container:
+      image: ${{ inputs.platform == 'linux' && 'ghcr.io/rocm/no_rocm_image_ubuntu24_04@sha256:405945a40deaff9db90b9839c0f41d4cba4a383c1a7459b28627047bf6302a26' || null }}
+      options: --ipc host
+        --group-add video
+        --device /dev/kfd
+        --device /dev/dri
+        --group-add 110
+        --env-file /etc/podinfo/gha-gpu-isolation-settings
+        --user 0:0 # Running as root, by recommendation of GitHub: https://docs.github.com/en/actions/reference/workflows-and-actions/dockerfile-support#user
+    defaults:
+      run:
+        shell: bash
+    env:
+      VENV_DIR: ${{ github.workspace }}/.venv
+      ARTIFACT_RUN_ID: "${{ inputs.artifact_run_id != '' && inputs.artifact_run_id || github.run_id }}"
+      OUTPUT_ARTIFACTS_DIR: ${{ github.workspace }}/build
+      THEROCK_BIN_DIR: ${{ github.workspace }}/build/bin
+    steps:
+      - name: "Fetch 'build_tools' from repository"
+        if: ${{ runner.os == 'Windows' }}
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+        with:
+          sparse-checkout: build_tools
+          path: prejob
+
+      - name: Pre-job cleanup processes on Windows
+        if: ${{ runner.os == 'Windows' }}
+        shell: powershell
+        run: . '${{ github.workspace }}\prejob\build_tools\github_actions\cleanup_processes.ps1'
+
+      - name: Checkout Repository
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+        with:
+          repository: "ROCm/TheRock"
+
+      - name: Pre-job cleanup Docker containers on Linux
+        if: ${{ runner.os == 'Linux' }}
+        shell: bash
+        run: |
+          # Remove any stopped containers.
+          docker container prune -f || true
+          # Remove dangling networks.
+          docker network prune -f || true
+
+      - name: Run setup test environment workflow
+        uses: './.github/actions/setup_test_environment'
+        with:
+          ARTIFACT_GROUP: ${{ inputs.artifact_group }}
+          ARTIFACT_RUN_ID: ${{ env.ARTIFACT_RUN_ID }}
+          OUTPUT_ARTIFACTS_DIR: ${{ env.OUTPUT_ARTIFACTS_DIR }}
+          VENV_DIR: ${{ env.VENV_DIR }}
+          FETCH_ARTIFACT_ARGS: "--base-only"
+          IS_PR_FROM_FORK: ${{ github.event.pull_request.head.repo.fork }}
+
+      - name: Set HIP_CLANG_PATH for Windows
+        if: ${{ runner.os == 'Windows' }}
+        run: echo "HIP_CLANG_PATH=${OUTPUT_ARTIFACTS_DIR}\lib\llvm\bin" >> $GITHUB_ENV
+
+      - name: Driver / GPU sanity check
+        run: |
+          python ./build_tools/print_driver_gpu_info.py
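+
+      # The sanity test step below raises HIP runtime logging to its most
+      # verbose level via AMD_LOG_LEVEL. For reference (per the HIP debugging
+      # docs; treat the exact mapping as indicative): 0 = off, 1 = error,
+      # 2 = warning, 3 = info, 4 = debug. For a one-off local repro:
+      #
+      #   AMD_LOG_LEVEL=4 pytest tests/ --log-cli-level=info --timeout=60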
+
+      - name: Run ROCm Sanity Tests
+        timeout-minutes: 5
+        env:
+          # Enable verbose logging, see
+          # https://rocm.docs.amd.com/projects/HIP/en/latest/how-to/debugging.html
+          AMD_LOG_LEVEL: 4
+        run: |
+          pytest tests/ --log-cli-level=info --timeout=60
+
+      - name: Post-job cleanup processes on Windows
+        if: ${{ always() && runner.os == 'Windows' }}
+        shell: powershell
+        run: . '${{ github.workspace }}\build_tools\github_actions\cleanup_processes.ps1'
diff --git a/.github/workflows/therock_test_harness.yml b/.github/workflows/therock_test_harness.yml
new file mode 100644
index 0000000000000..1699af369a140
--- /dev/null
+++ b/.github/workflows/therock_test_harness.yml
@@ -0,0 +1,101 @@
+name: TheRock Test Harness
+
+on:
+  workflow_dispatch:
+    inputs:
+      families:
+        type: string
+        description: 'The AMD GPU family to test. ex: gfx94X, gfx120X'
+        default: 'gfx94X'
+      release_version:
+        type: string
+        description: 'TheRock release version. (ex: nightly-tarball (X.Y.ZrcYYYYMMDD) or dev-tarball (X.Y.Z.dev0+{hash}))'
+        default: '7.9.0rc20251008'
+      tests_to_run:
+        type: string
+        description: 'The list of tests to run as an "or" expression. (ex: "hipcub or rocprim")'
+        default: 'hipcub or rocprim or rocrand or rocthrust'
+
+permissions:
+  contents: read
+
+concurrency:
+  # A PR number if a pull request and otherwise the commit hash. This cancels
+  # queued and in-progress runs for the same PR (presubmit) or commit
+  # (postsubmit). The workflow name is prepended to avoid conflicts between
+  # different workflows.
+  group: ${{ github.workflow }}-${{ github.event.number || github.sha }}
+  cancel-in-progress: true
+
+jobs:
+  setup_metadata:
+    runs-on: ubuntu-24.04
+    outputs:
+      package_targets: ${{ steps.configure.outputs.package_targets }}
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+        with:
+          repository: "ROCm/TheRock"
+
+      - name: Setup Python
+        uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
+        with:
+          python-version: 3.12
+
+      - name: Generating package target matrix
+        id: configure
+        env:
+          AMDGPU_FAMILIES: ${{ inputs.families }}
+          THEROCK_PACKAGE_PLATFORM: "linux"
+          TEST_HARNESS_TARGET_FETCH: true
+          # Variable comes from the ROCm organization variable 'ROCM_THEROCK_TEST_RUNNERS'.
+          ROCM_THEROCK_TEST_RUNNERS: ${{ vars.ROCM_THEROCK_TEST_RUNNERS }}
+          LOAD_TEST_RUNNERS_FROM_VAR: false
+        run: python ./build_tools/github_actions/fetch_package_targets.py
+
+  therock_test_harness_linux:
+    name: TheRock Tests Sharded Linux Nightly
+    needs: [setup_metadata]
+    runs-on: ${{ matrix.target_bundle.test_machine }}
+    container:
+      image: 'ghcr.io/rocm/no_rocm_image_ubuntu24_04@sha256:4150afe4759d14822f0e3f8930e1124f26e11f68b5c7b91ec9a02b20b1ebbb98'
+      options: --ipc host
+        --group-add video
+        --device /dev/kfd
+        --device /dev/dri
+        --group-add 110
+        --env-file /etc/podinfo/gha-gpu-isolation-settings
+    strategy:
+      fail-fast: false
+      matrix:
+        target_bundle: ${{ fromJSON(needs.setup_metadata.outputs.package_targets) }}
+    defaults:
+      run:
+        shell: bash
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+        with:
+          repository: "ROCm/TheRock"
+
+      - name: Setup Python
+        uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
+        with:
+          python-version: 3.12
+
+      - name: Install TheRock
+        env:
+          release_version: ${{ inputs.release_version }}
+        run: |
+          pip install -r requirements-test.txt
+          python3 build_tools/install_rocm_from_artifacts.py --tests --amdgpu-family ${{ matrix.target_bundle.amdgpu_family }} --release ${{ env.release_version }}
+
+      # TODO: add parallelism
+      - name: Running test harness
+        run: |
+          python3 -m pytest -s -v --tb=short --therock-path=./therock-build tests/harness/tests*.py -k "${{ inputs.tests_to_run }}"
+
+# TODO: Add Windows tests.
diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index af28bf0169108..0db12e8c7f8be 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -1,4 +1,4 @@
-
+FAIL BUILD
 # See docs/CMake.html for instructions about how to build LLVM with CMake.
 cmake_minimum_required(VERSION 3.20.0)