Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
220 changes: 176 additions & 44 deletions dev/airflow-github
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@ from rich.progress import Progress

if TYPE_CHECKING:
from github.Issue import Issue
from github.PullRequest import PullRequest

GIT_COMMIT_FIELDS = ["id", "author_name", "author_email", "date", "subject", "body"]
GIT_LOG_FORMAT = "%x1f".join(["%h", "%an", "%ae", "%ad", "%s", "%b"]) + "%x1e"
Expand Down Expand Up @@ -95,22 +94,86 @@ def get_issue_type(issue):
return issue_type


def get_commit_in_main_associated_with_pr(repo: git.Repo, issue: Issue) -> str | None:
def build_main_commits_cache(repo: git.Repo, issue_numbers: set[int]) -> dict[int, str]:
    """Build a cache of PR number -> main branch commit SHA with a single git log operation.

    Only commits whose subject ends with ``(#<number>)`` are considered (the
    suffix a squash-merge appends). The scan covers the last year of
    ``origin/main``; if that query fails, the last 2000 commits are used
    instead. PRs outside that window are simply absent from the result.

    :param repo: repository whose ``origin/main`` history is scanned
    :param issue_numbers: PR numbers to look for
    :return: mapping of PR number to the full SHA of the most recent matching
        commit; empty when ``issue_numbers`` is empty or git cannot be queried
    """
    # Local imports: this block cannot rely on the module importing them.
    import re
    import sys

    cache: dict[int, str] = {}
    if not issue_numbers:
        return cache

    log_format = "--format=%H %s"
    try:
        try:
            # A bounded window avoids walking the entire history.
            log_output = repo.git.log("origin/main", log_format, "--since=1 year ago")
        except Exception:
            # Fall back to a fixed number of commits if the date query fails.
            log_output = repo.git.log("origin/main", log_format, "-2000")
    except Exception as exc:
        # Stay best-effort (callers get an empty cache) but say why commits are missing.
        print(f"Warning: could not read origin/main history: {exc}", file=sys.stderr)
        return cache

    pr_at_end_of_subject = re.compile(r"\(#(\d+)\)$")
    for commit_line in log_output.splitlines():
        match = pr_at_end_of_subject.search(commit_line)
        if match is None:
            continue
        pr_number = int(match.group(1))
        if pr_number in issue_numbers:
            commit_sha, _, _ = commit_line.partition(" ")
            # git log lists newest commits first; keep the first hit so an
            # older commit with the same suffix cannot replace it.
            cache.setdefault(pr_number, commit_sha)

    return cache


def get_commit_in_main_associated_with_pr(
    repo: git.Repo, issue: Issue, main_commits_cache: dict[int, str]
) -> str | None:
    """For a PR, find the associated merged commit & return its SHA.

    The lookup is served entirely from ``main_commits_cache``; no git command
    or GitHub API request is issued per PR.

    :param repo: repository handle; not used by the lookup itself
    :param issue: milestone entry; only pull requests can have a commit
    :param main_commits_cache: pre-populated mapping of PR number -> commit SHA
    :return: the cached SHA, or ``None`` for plain issues and for PRs that are
        not in the cache
    """
    if issue.pull_request:
        # Use cache (should be pre-populated)
        return main_commits_cache.get(issue.number)
    return None


def build_cherrypicked_cache(
    repo: git.Repo, issue_numbers: list[int], previous_version: str | None = None
) -> dict[int, bool]:
    """Build a cache of which issues are cherry-picked by doing a single git log operation.

    A number counts as cherry-picked when ``(#<number>)`` appears anywhere in
    the subject of a commit on the current branch.

    :param repo: repository whose current branch is scanned
    :param issue_numbers: PR numbers to classify
    :param previous_version: if given, only commits in ``<previous_version>..``
        are scanned
    :return: mapping with one entry per requested number, ``True`` when a
        matching commit was found. If git cannot be queried, every entry stays
        ``False`` and a warning is written to stderr.
    """
    # Local imports: this block cannot rely on the module importing them.
    import re
    import sys

    cache = dict.fromkeys(issue_numbers, False)
    if not issue_numbers:
        return cache

    log_args = ["--format=%H %s"]
    if previous_version:
        log_args.append(f"{previous_version}..")

    try:
        log_output = repo.git.log(*log_args)
    except Exception as exc:
        # No per-issue fallback exists here, so report the failure instead of
        # silently presenting every PR as not cherry-picked.
        print(
            f"Warning: could not read branch history, cherry-pick status is unknown: {exc}",
            file=sys.stderr,
        )
        return cache

    # The pattern cannot span a newline, so one pass over the whole output
    # finds the same numbers as a line-by-line scan.
    for match in re.finditer(r"\(#(\d+)\)", log_output):
        issue_num = int(match.group(1))
        if issue_num in cache:
            cache[issue_num] = True

    return cache


def is_cherrypicked(repo: git.Repo, issue: Issue, previous_version: str | None = None) -> bool:
"""Check if a given issue is cherry-picked in the current branch or not"""
log_args = ["--format=%H %s", f"--grep=(#{issue.number})"]
Expand Down Expand Up @@ -236,62 +299,128 @@ def cli():
" searching for few commits to find the cherry-picked commits",
)
@click.option("--unmerged", "show_uncherrypicked_only", help="Show unmerged PRs only", is_flag=True)
def compare(target_version, github_token, previous_version=None, show_uncherrypicked_only=False):

@click.option("--show-commits", help="Show commit SHAs (default: on, off when --unmerged)", is_flag=True, default=None)

def compare(target_version, github_token, previous_version=None, show_uncherrypicked_only=False, show_commits=None):
# Set smart defaults
if show_commits is None:
show_commits = not show_uncherrypicked_only # Default off for --unmerged

repo = git.Repo(".", search_parent_directories=True)

github_handler = Github(github_token)
milestone_issues: list[Issue] = list(

# Fetch PRs and Issues separately, with merged PRs identified upfront
merged_prs: list[Issue] = list(
github_handler.search_issues(
f'repo:apache/airflow milestone:"Airflow {target_version}" is:pull-request is:merged'
)
)
closed_prs: list[Issue] = list(
github_handler.search_issues(
f'repo:apache/airflow milestone:"Airflow {target_version}" is:pull-request is:closed -is:merged'
)
)
open_prs: list[Issue] = list(
github_handler.search_issues(
f'repo:apache/airflow milestone:"Airflow {target_version}" is:pull-request '
f'repo:apache/airflow milestone:"Airflow {target_version}" is:pull-request is:open'
)
)
milestone_issues.extend(
list(

# Skip fetching issues if we only care about unmerged PRs
if show_uncherrypicked_only:
issues = []
else:
issues: list[Issue] = list(
github_handler.search_issues(
f'repo:apache/airflow milestone:"Airflow {target_version}" is:issue '
f'repo:apache/airflow milestone:"Airflow {target_version}" is:issue'
)
)
)

# Create a merge status lookup
pr_merge_status_cache = {}
for pr in merged_prs:
pr_merge_status_cache[pr.number] = True
for pr in closed_prs:
pr_merge_status_cache[pr.number] = False
# Open PRs are neither merged nor closed, so we don't need to cache them

milestone_issues = merged_prs + closed_prs + open_prs + issues

num_cherrypicked = 0
num_uncherrypicked = Counter()

# :<18 says left align, pad to 18, :>6 says right align, pad to 6
# :<50.50 truncates after 50 chars
# !s forces as string
formatstr = (
"{number:>6} | {typ!s:<5} | {changelog!s:<13} | {status!s} "
"| {title:<83.83} | {merged:<6} | {commit:>7.7} | {url}"
)

print(
formatstr.format(
number="NUMBER",
typ="TYPE",
changelog="CHANGELOG",
status="STATUS".ljust(6),
title="TITLE",
merged="MERGED",
commit="COMMIT",
url="URL",
if show_commits:
formatstr = (
"{number:>6} | {typ!s:<5} | {changelog!s:<13} | {status!s} "
"| {title:<83.83} | {merged:<6} | {commit:>7.7} | {url}"
)
)
header_fields = {
"number": "NUMBER",
"typ": "TYPE",
"changelog": "CHANGELOG",
"status": "STATUS".ljust(6),
"title": "TITLE",
"merged": "CHERRY",
"commit": "COMMIT",
"url": "URL",
}
else:
formatstr = (
"{number:>6} | {typ!s:<5} | {changelog!s:<13} | {status!s} "
"| {title:<95.95} | {merged:<6} | {url}"
)
header_fields = {
"number": "NUMBER",
"typ": "TYPE",
"changelog": "CHANGELOG",
"status": "STATUS".ljust(6),
"title": "TITLE",
"merged": "CHERRY",
"commit": "", # Not used
"url": "URL",
}

print(formatstr.format(**header_fields))
milestone_issues = sorted(
milestone_issues, key=lambda x: x.closed_at if x.closed_at else x.created_at, reverse=True
)

# Build caches for performance optimization
issue_numbers = [issue.number for issue in milestone_issues if is_pr(issue)]

# Convert to set for O(1) lookups in cache building
issue_numbers_set = set(issue_numbers)

# Build all caches upfront with batch operations
if show_commits:
main_commits_cache = build_main_commits_cache(repo, issue_numbers_set)
else:
main_commits_cache = {}

cherrypicked_cache = build_cherrypicked_cache(repo, issue_numbers, previous_version)

for issue in milestone_issues:
commit_in_main = get_commit_in_main_associated_with_pr(repo, issue)
issue_is_pr = is_pr(issue)

# Determine status - differentiate between Closed and Merged for PRs
# Determine status - differentiate between Closed and Merged for PRs using cache
if issue_is_pr and issue.state == "closed":
pr = issue.as_pull_request()
status = "Merged" if pr.is_merged() else "Closed"
is_merged = pr_merge_status_cache.get(issue.number, False)
status = "Merged" if is_merged else "Closed"
else:
status = issue.state.capitalize()

# Checks if commit was cherrypicked into branch.
if is_cherrypicked(repo, issue, previous_version):
# Checks if commit was cherrypicked into branch using cache
if issue_is_pr:
is_cherry_picked = cherrypicked_cache.get(issue.number, False)
else:
is_cherry_picked = is_cherrypicked(repo, issue, previous_version)

if is_cherry_picked:
num_cherrypicked += 1
if show_uncherrypicked_only:
continue
Expand All @@ -314,9 +443,12 @@ def compare(target_version, github_token, previous_version=None, show_uncherrypi
url=issue.html_url,
)

print(
formatstr.format(**fields, merged=cherrypicked, commit=commit_in_main if commit_in_main else "")
)
# Only get commit info if we're showing commits
if show_commits:
commit_in_main = get_commit_in_main_associated_with_pr(repo, issue, main_commits_cache)
fields["commit"] = commit_in_main if commit_in_main else ""

print(formatstr.format(**fields, merged=cherrypicked))

print(
f"Commits on branch: {num_cherrypicked:d}, {sum(num_uncherrypicked.values()):d} "
Expand Down
Loading