Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize update_commit_from_provider_info #855

Merged
merged 3 commits into from
Nov 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
206 changes: 106 additions & 100 deletions services/repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,90 +90,94 @@
)


@sentry_sdk.trace
async def fetch_appropriate_parent_for_commit(
repository_service, commit: Commit, git_commit=None
):
repository_service: TorngitBaseAdapter, commit: Commit, git_commit=None
) -> str | None:
closest_parent_without_message = None
db_session = commit.get_db_session()
commitid = commit.commitid
if git_commit:
parents = git_commit["parents"]
possible_commit_query = db_session.query(Commit).filter(
possible_commit_query = db_session.query(Commit.commitid, Commit.branch).filter(
Commit.commitid.in_(parents),
Commit.repoid == commit.repoid,
~Commit.message.is_(None),
~Commit.deleted.is_(True),
)
possible_commit = possibly_filter_out_branch(commit, possible_commit_query)
possible_commit = _possibly_filter_out_branch(commit, possible_commit_query)
if possible_commit:
return possible_commit.commitid

ancestors_tree = await repository_service.get_ancestors_tree(commitid)
elements = [ancestors_tree]
while elements:
parents = [k for el in elements for k in el["parents"]]
parent_commits = [p["commitid"] for p in parents]
closest_parent_query = db_session.query(Commit).filter(
closest_parent_query = db_session.query(Commit.commitid, Commit.branch).filter(
Commit.commitid.in_(parent_commits),
Commit.repoid == commit.repoid,
~Commit.message.is_(None),
~Commit.deleted.is_(True),
)
closest_parent = possibly_filter_out_branch(commit, closest_parent_query)
closest_parent = _possibly_filter_out_branch(commit, closest_parent_query)
if closest_parent:
return closest_parent.commitid

if closest_parent_without_message is None:
res = db_session.query(Commit.commitid).filter(
parent_query = db_session.query(Commit.commitid, Commit.branch).filter(
Commit.commitid.in_(parent_commits),
Commit.repoid == commit.repoid,
~Commit.deleted.is_(True),
)
res = possibly_filter_out_branch(commit, res)
if res:
closest_parent_without_message = res[0]
parent = _possibly_filter_out_branch(commit, parent_query)
if parent:
closest_parent_without_message = parent.commitid
elements = parents

log.warning(
"Unable to find a parent commit that was properly found on Github",
extra=dict(commit=commit.commitid, repoid=commit.repoid),
)
return closest_parent_without_message


def possibly_filter_out_branch(commit: Commit, query: Query) -> Commit | None:
if query.count() <= 1:
query = query.first()
else:
query = query.filter(
Commit.branch == commit.branch,
).first()
return query
def _possibly_filter_out_branch(commit: Commit, query: Query) -> Commit | None:
commits = query.all()
if len(commits) == 1:
return commits[0]

# if we have more than one possible commit, pick the first one with a matching `branch`:
for possible_commit in commits:
if possible_commit.branch == commit.branch:
return possible_commit

async def possibly_update_commit_from_provider_info(commit, repository_service):
repoid = commit.repoid
commitid = commit.commitid
return None


def possibly_update_commit_from_provider_info(
commit: Commit, repository_service: TorngitBaseAdapter
) -> bool:
try:
if not commit.message:
log.info(
"Commit does not have all needed info. Reaching provider to fetch info",
extra=dict(repoid=repoid, commit=commitid),
"Commit does not have all needed info. Reaching provider to fetch info"
)
await update_commit_from_provider_info(repository_service, commit)
async_to_sync(update_commit_from_provider_info)(repository_service, commit)
return True
except TorngitObjectNotFoundError:
log.warning(
"Could not update commit with info because it was not found at the provider",
extra=dict(repoid=repoid, commit=commitid),
"Could not update commit with info because it was not found at the provider"
)
return False
log.debug(
"Not updating commit because it already seems to be populated",
extra=dict(repoid=repoid, commit=commitid),
)
log.debug("Not updating commit because it already seems to be populated")
return False


@sentry_sdk.trace
async def update_commit_from_provider_info(repository_service, commit):
async def update_commit_from_provider_info(
repository_service: TorngitBaseAdapter, commit: Commit
):
"""
Takes the result from the torngit commit details, and updates the commit
properties with it
Expand All @@ -187,82 +191,84 @@
"Could not find commit on git provider",
extra=dict(repoid=commit.repoid, commit=commit.commitid),
)
else:
log.debug("Found git commit", extra=dict(commit=git_commit))
author_info = git_commit["author"]
if not author_info.get("id"):
commit_author = None
log.info(
"Not trying to set an author because it does not have an id",
extra=dict(
author_info=author_info,
git_commit=git_commit,
commit=commit.commitid,
),
)
else:
commit_author = get_or_create_author(
db_session,
commit.repository.service,
author_info["id"],
author_info["username"],
author_info["email"],
author_info["name"],
)
return

Check warning on line 194 in services/repository.py

View check run for this annotation

Codecov Notifications / codecov/patch

services/repository.py#L194

Added line #L194 was not covered by tests

# attempt to populate commit.pullid from repository_service if we don't have it
if not commit.pullid:
commit.pullid = await repository_service.find_pull_request(
commit=commitid, branch=commit.branch
)
log.debug("Found git commit", extra=dict(commit=git_commit))

# if our records or the call above returned a pullid, fetch it's details
if commit.pullid:
commit_updates = await repository_service.get_pull_request(
pullid=commit.pullid
)
# There's a chance that the commit comes from a fork
# so we append the branch name with the fork slug
branch_name = commit_updates["head"]["branch"]
# TODO: 'slug' is in a `.get` because currently only GitHub returns that info
if commit_updates["head"].get("slug") != commit_updates["base"].get("slug"):
branch_name = commit_updates["head"]["slug"] + ":" + branch_name
commit.branch = branch_name
commit.merged = False
else:
possible_branches = await repository_service.get_best_effort_branches(
commit.commitid
)
if commit.repository.branch in possible_branches:
commit.merged = True
commit.branch = commit.repository.branch
else:
commit.merged = False
commit.message = git_commit["message"]
commit.parent_commit_id = await fetch_appropriate_parent_for_commit(
repository_service, commit, git_commit
)
commit.author = commit_author
commit.updatestamp = datetime.now()
commit.timestamp = git_commit["timestamp"]

if commit.repository.service == "bitbucket":
res = merged_pull(git_commit["message"])
if res:
pullid = res.groups()[0]
pullid = pullid
commit.branch = (await repository_service.get_pull_request(pullid))[
"base"
]["branch"]
author_info = git_commit["author"]
if not author_info.get("id"):
commit_author = None
log.info(
"Updated commit with info from git provider",
"Not trying to set an author because it does not have an id",
extra=dict(
repoid=commit.repoid,
author_info=author_info,
git_commit=git_commit,
commit=commit.commitid,
branch_value=commit.branch,
author_value=commit.author_id,
),
)
else:
commit_author = get_or_create_author(
db_session,
commit.repository.service,
author_info["id"],
author_info["username"],
author_info["email"],
author_info["name"],
)

commit.parent_commit_id = await fetch_appropriate_parent_for_commit(
repository_service, commit, git_commit
)
commit.message = git_commit["message"]
commit.author = commit_author
commit.updatestamp = datetime.now()
commit.timestamp = git_commit["timestamp"]

# attempt to populate commit.pullid from repository_service if we don't have it
if not commit.pullid:
commit.pullid = await repository_service.find_pull_request(
commit=commitid, branch=commit.branch
)

# if our records or the call above returned a pullid, fetch it's details
if commit.pullid:
pull_details = await repository_service.get_pull_request(pullid=commit.pullid)
# There's a chance that the commit comes from a fork
# so we append the branch name with the fork slug
branch_name = pull_details["head"]["branch"]
# TODO: 'slug' is in a `.get` because currently only GitHub returns that info
if pull_details["head"].get("slug") != pull_details["base"].get("slug"):
branch_name = pull_details["head"]["slug"] + ":" + branch_name
commit.branch = branch_name
commit.merged = False
else:
possible_branches = await repository_service.get_best_effort_branches(
commit.commitid
)
if commit.repository.branch in possible_branches:
commit.merged = True
commit.branch = commit.repository.branch
else:
commit.merged = False

if commit.repository.service == "bitbucket":
res = merged_pull(git_commit["message"])
if res:
pullid = res.groups()[0]
if int(pullid) != int(commit.pullid):
pull_details = await repository_service.get_pull_request(pullid)
commit.branch = pull_details["base"]["branch"]

db_session.flush()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just for my own understanding, I'm curious as to how this flush helps with reducing the number of updates. I had assumed that we had to either flush (or commit, which does a flush first) for the update to "happen" in the transaction, so I might have the wrong mental model of how it works!

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The number of updates is reduced by moving code around.
Previously, some fields on the current Commit were written, and those were apparently implicitly flushed by SQLAlchemy as another function was querying Commits.
Now the queries happen first, and all the field updates second, whereby the UPDATE should update all the fields at once.

The flush is just there to force the UPDATE to happen inside of this function; previously, it would be delayed until later, nested in a completely different call chain.

Though to be quite honest, I haven’t verified this behavior locally. We will see this more clearly in tracing.

log.info(
"Updated commit with info from git provider",
extra=dict(
repoid=commit.repoid,
commit=commit.commitid,
branch_value=commit.branch,
author_value=commit.author_id,
),
)


def get_or_create_author(
Expand Down
3 changes: 1 addition & 2 deletions tasks/commit_update.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import logging

from asgiref.sync import async_to_sync
from shared.celery_config import commit_update_task_name
from shared.torngit.exceptions import TorngitClientError, TorngitRepoNotFoundError

Expand Down Expand Up @@ -41,7 +40,7 @@ def run_impl(
repository_service = get_repo_provider_service(
repository, installation_name_to_use=installation_name_to_use
)
was_updated = async_to_sync(possibly_update_commit_from_provider_info)(
was_updated = possibly_update_commit_from_provider_info(
commit, repository_service
)
except RepositoryWithoutValidBotError:
Expand Down
3 changes: 1 addition & 2 deletions tasks/preprocess_upload.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import logging
from typing import Optional

from asgiref.sync import async_to_sync
from redis.exceptions import LockError
from shared.torngit.base import TorngitBaseAdapter

Expand Down Expand Up @@ -106,7 +105,7 @@ def process_impl_within_lock(
}
# Makes sure that we can properly carry forward reports
# By populating the commit info (if needed)
updated_commit = async_to_sync(possibly_update_commit_from_provider_info)(
updated_commit = possibly_update_commit_from_provider_info(
commit=commit, repository_service=repository_service
)
commit_yaml = fetch_commit_yaml_and_possibly_store(commit, repository_service)
Expand Down
2 changes: 1 addition & 1 deletion tasks/tests/unit/test_upload_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -954,7 +954,7 @@ def test_upload_task_upload_already_created(
mock_storage,
):
mocked_schedule_task = mocker.patch.object(UploadTask, "schedule_task")
mock_possibly_update_commit_from_provider_info = mocker.patch(
mocker.patch(
"tasks.upload.possibly_update_commit_from_provider_info", return_value=True
)
mock_create_upload = mocker.patch.object(ReportService, "create_report_upload")
Expand Down
2 changes: 1 addition & 1 deletion tasks/upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -424,7 +424,7 @@ def run_impl_within_lock(
repository_service = get_repo_provider_service(
repository, installation_name_to_use=installation_name_to_use
)
was_updated = async_to_sync(possibly_update_commit_from_provider_info)(
was_updated = possibly_update_commit_from_provider_info(
commit, repository_service
)
was_setup = self.possibly_setup_webhooks(commit, repository_service)
Expand Down
Loading