From 80a81385d089f8b7db0fa45422b715fbc20d6c58 Mon Sep 17 00:00:00 2001 From: Arpad Borsos Date: Tue, 15 Oct 2024 13:39:37 +0200 Subject: [PATCH] Cache and move `load_commit_diff` out of Report lock Moves the `load_commit_diff` portion out of `save_report_results`. This way, the whole commit diff loading can be moved outside of the Report merge lock. Additionally, the function was decorated with a cache, as the diff for a commit will never change. --- tasks/tests/unit/test_upload_finisher_task.py | 2 +- .../tests/unit/test_upload_processing_task.py | 141 +++++--------- tasks/upload_finisher.py | 19 +- tasks/upload_processor.py | 175 +++++++++--------- 4 files changed, 146 insertions(+), 191 deletions(-) diff --git a/tasks/tests/unit/test_upload_finisher_task.py b/tasks/tests/unit/test_upload_finisher_task.py index 235de3a31..32321737d 100644 --- a/tasks/tests/unit/test_upload_finisher_task.py +++ b/tasks/tests/unit/test_upload_finisher_task.py @@ -649,7 +649,7 @@ def test_retry_on_report_lock(self, dbsession, mock_redis): dbsession, [ { - "processings_so_far": [{"successful": True}], + "processings_so_far": [{"successful": True, "arguments": {}}], "parallel_incremental_result": {"upload_pk": 1}, } ], diff --git a/tasks/tests/unit/test_upload_processing_task.py b/tasks/tests/unit/test_upload_processing_task.py index bb5624c1d..2f22fa7e5 100644 --- a/tasks/tests/unit/test_upload_processing_task.py +++ b/tasks/tests/unit/test_upload_processing_task.py @@ -27,7 +27,11 @@ SessionAdjustmentResult, UploadProcessingResult, ) -from tasks.upload_processor import UploadProcessorTask +from tasks.upload_processor import ( + UploadProcessorTask, + load_commit_diff, + save_report_results, +) here = Path(__file__) @@ -484,7 +488,8 @@ def test_upload_task_call_exception_within_individual_upload( "parse_raw_report_from_storage", return_value="ParsedRawReport()", ) - mocker.patch.object(UploadProcessorTask, "save_report_results") + mocker.patch("tasks.upload_processor.load_commit_diff") + mocker.patch("tasks.upload_processor.save_report_results") mocked_post_process = mocker.patch.object( UploadProcessorTask, "postprocess_raw_reports" @@ -1009,7 +1014,7 @@ def test_upload_task_call_celeryerror( assert commit.state == "pending" def test_save_report_results_apply_diff_not_there( - self, mocker, mock_configuration, dbsession, mock_repo_provider, mock_storage + self, mocker, mock_configuration, dbsession, mock_storage ): commit = CommitFactory.create( message="", @@ -1033,68 +1038,38 @@ def test_save_report_results_apply_diff_not_there( report.append(report_file_1) report.append(report_file_2) chunks_archive_service = ArchiveService(commit.repository) - mock_repo_provider.get_commit_diff.side_effect = TorngitObjectNotFoundError( - "response", "message" - ) - result = UploadProcessorTask().save_report_results( - db_session=dbsession, - report_service=ReportService({}), - repository=commit.repository, - commit=commit, - report=report, - pr=None, - ) - expected_result = { + result = save_report_results(ReportService({}), commit, report, None, None) + + assert result == { "url": f"v4/repos/{chunks_archive_service.storage_hash}/commits/{commit.commitid}/chunks.txt" } - assert expected_result == result assert report.diff_totals is None - def test_save_report_results_apply_diff_no_bot( - self, mocker, mock_configuration, dbsession, mock_repo_provider, mock_storage + def test_load_commit_diff_no_diff( + self, mocker, mock_configuration, dbsession, mock_repo_provider ): - commit = CommitFactory.create( - message="", - repository__owner__unencrypted_oauth_token="testulk3d54rlhxkjyzomq2wh8b7np47xabcrkx8", - repository__owner__username="ThiagoCodecov", - repository__yaml={ - "codecov": {"max_report_age": "1y ago"} - }, # Sorry for the timebomb + commit = CommitFactory.create() + dbsession.add(commit) + dbsession.flush() + mock_repo_provider.get_commit_diff.side_effect = TorngitObjectNotFoundError( + "response", "message" ) + diff = load_commit_diff(commit, None, None) + assert diff is None + + def test_load_commit_diff_no_bot(self, mocker, mock_configuration, dbsession): + commit = CommitFactory.create() + dbsession.add(commit) + dbsession.flush() mock_get_repo_service = mocker.patch( "tasks.upload_processor.get_repo_provider_service" ) mock_get_repo_service.side_effect = RepositoryWithoutValidBotError() - dbsession.add(commit) - dbsession.flush() - report = Report() - report_file_1 = ReportFile("path/to/first.py") - report_file_2 = ReportFile("to/second/path.py") - report_line_1 = ReportLine.create(coverage=1, sessions=[[0, 1]]) - report_line_2 = ReportLine.create(coverage=0, sessions=[[0, 0]]) - report_line_3 = ReportLine.create(coverage=1, sessions=[[0, 1]]) - report_file_1.append(10, report_line_1) - report_file_1.append(12, report_line_2) - report_file_2.append(12, report_line_3) - report.append(report_file_1) - report.append(report_file_2) - chunks_archive_service = ArchiveService(commit.repository) - result = UploadProcessorTask().save_report_results( - db_session=dbsession, - report_service=ReportService({}), - repository=commit.repository, - commit=commit, - report=report, - pr=None, - ) - expected_result = { - "url": f"v4/repos/{chunks_archive_service.storage_hash}/commits/{commit.commitid}/chunks.txt" - } - assert expected_result == result - assert report.diff_totals is None + diff = load_commit_diff(commit, None, None) + assert diff is None def test_save_report_results_apply_diff_valid( - self, mocker, mock_configuration, dbsession, mock_repo_provider, mock_storage + self, mocker, mock_configuration, dbsession, mock_storage ): commit = CommitFactory.create( message="", @@ -1118,7 +1093,7 @@ def test_save_report_results_apply_diff_valid( report.append(report_file_1) report.append(report_file_2) chunks_archive_service = ArchiveService(commit.repository) - f = { + diff = { "files": { "path/to/first.py": { "type": "modified", @@ -1139,20 +1114,11 @@ def test_save_report_results_apply_diff_valid( } } } - mock_repo_provider.get_commit_diff.return_value = f - result = UploadProcessorTask().save_report_results( - db_session=dbsession, - report_service=ReportService({}), - commit=commit, - repository=commit.repository, - report=report, - pr=None, - ) - expected_result = { + result = save_report_results(ReportService({}), commit, report, diff, None) + assert result == { "url": f"v4/repos/{chunks_archive_service.storage_hash}/commits/{commit.commitid}/chunks.txt" } - assert expected_result == result - expected_diff_totals = ReportTotals( + assert report.diff_totals == ReportTotals( files=1, lines=1, hits=1, @@ -1167,10 +1133,9 @@ def test_save_report_results_apply_diff_valid( complexity_total=0, diff=0, ) - assert report.diff_totals == expected_diff_totals def test_save_report_results_empty_report( - self, mocker, mock_configuration, dbsession, mock_repo_provider, mock_storage + self, mocker, mock_configuration, dbsession, mock_storage ): commit = CommitFactory.create( message="", @@ -1184,7 +1149,7 @@ def test_save_report_results_empty_report( dbsession.flush() report = Report() chunks_archive_service = ArchiveService(commit.repository) - f = { + diff = { "files": { "path/to/first.py": { "type": "modified", @@ -1205,20 +1170,12 @@ def test_save_report_results_empty_report( } } } - mock_repo_provider.get_commit_diff.return_value = f - result = UploadProcessorTask().save_report_results( - db_session=dbsession, - report_service=ReportService({}), - commit=commit, - repository=commit.repository, - report=report, - pr=None, - ) - expected_result = { + result = save_report_results(ReportService({}), commit, report, diff, None) + + assert result == { "url": f"v4/repos/{chunks_archive_service.storage_hash}/commits/{commit.commitid}/chunks.txt" } - assert expected_result == result - expected_diff_totals = ReportTotals( + assert report.diff_totals == ReportTotals( files=0, lines=0, hits=0, @@ -1233,7 +1190,6 @@ def test_save_report_results_empty_report( complexity_total=None, diff=0, ) - assert report.diff_totals == expected_diff_totals assert commit.state == "error" @pytest.mark.parametrize( @@ -1244,7 +1200,6 @@ def test_save_report_results_pr_values( mocker, mock_configuration, dbsession, - mock_repo_provider, mock_storage, pr_value, expected_pr_result, @@ -1253,17 +1208,15 @@ def test_save_report_results_pr_values( dbsession.add(commit) dbsession.flush() report = mocker.Mock() - mock_repo_provider.get_commit_diff.return_value = { - "files": {"path/to/first.py": {}} - } + diff = {"files": {"path/to/first.py": {}}} mock_report_service = mocker.Mock(save_report=mocker.Mock(return_value="aaaa")) - result = UploadProcessorTask().save_report_results( - db_session=dbsession, - report_service=mock_report_service, - commit=commit, - repository=commit.repository, - report=report, - pr=pr_value, + result = save_report_results( + mock_report_service, + commit, + report, + diff, + pr_value, ) - assert "aaaa" == result + + assert result == "aaaa" assert commit.pullid == expected_pr_result diff --git a/tasks/upload_finisher.py b/tasks/upload_finisher.py index cfd7f40eb..74e35b007 100644 --- a/tasks/upload_finisher.py +++ b/tasks/upload_finisher.py @@ -46,7 +46,8 @@ from tasks.upload_processor import ( MAX_RETRIES, UPLOAD_PROCESSING_LOCK_NAME, - UploadProcessorTask, + load_commit_diff, + save_report_results, ) log = logging.getLogger(__name__) @@ -158,6 +159,8 @@ def run_impl( int(upload["upload_pk"]) for upload in processing_results["parallel_incremental_result"] ] + pr = processing_results["processings_so_far"][0]["arguments"].get("pr") + diff = load_commit_diff(commit, pr, self.name) try: report_lock = ( @@ -188,18 +191,8 @@ def run_impl( ) if parallel_processing is ParallelProcessing.PARALLEL: - pr = processing_results["processings_so_far"][0][ - "arguments" - ].get("pr") - processor_task = UploadProcessorTask() - processor_task.save_report_results( - db_session, - report_service, - repository, - commit, - report, - pr, - report_code, + save_report_results( + report_service, commit, report, diff, pr, report_code ) state.mark_uploads_as_merged(upload_ids) diff --git a/tasks/upload_processor.py b/tasks/upload_processor.py index e5a2c888b..5ca489457 100644 --- a/tasks/upload_processor.py +++ b/tasks/upload_processor.py @@ -15,7 +15,8 @@ from app import celery_app from database.enums import CommitErrorTypes from database.models import Commit, Upload -from database.models.core import Pull, Repository +from database.models.core import GITHUB_APP_INSTALLATION_DEFAULT_NAME, Pull +from helpers.cache import cache from helpers.exceptions import RepositoryWithoutValidBotError from helpers.github_installation import get_installation_name_for_owner_for_task from helpers.parallel import ParallelProcessing @@ -305,14 +306,9 @@ def process_upload( ), ) else: - results_dict = self.save_report_results( - db_session, - report_service, - repository, - commit, - report, - pr, - report_code, + diff = load_commit_diff(commit, pr, self.name) + results_dict = save_report_results( + report_service, commit, report, diff, pr, report_code ) for upload_id in upload_ids: state.mark_upload_as_processed(upload_id) @@ -440,83 +436,96 @@ def postprocess_raw_reports( archive_url, report_info.raw_report.content().getvalue() ) - @sentry_sdk.trace - def save_report_results( - self, - db_session, - report_service: ReportService, - repository: Repository, - commit: Commit, - report: Report, - pr: Pull, - report_code=None, - ): - """Saves the result of `report` to the commit database and chunks archive - This method only takes care of getting a processed Report to the database and archive. +RegisteredUploadTask = celery_app.register_task(UploadProcessorTask()) +upload_processor_task = celery_app.tasks[RegisteredUploadTask.name] - It also tries to calculate the diff of the report (which uses commit info - from th git provider), but it it fails to do so, it just moves on without such diff - """ - log.debug("In save_report_results for commit: %s" % commit) - commitid = commit.commitid - try: - installation_name_to_use = get_installation_name_for_owner_for_task( - self.name, repository.owner - ) - repository_service = get_repo_provider_service( - repository, installation_name_to_use=installation_name_to_use - ) - report.apply_diff( - async_to_sync(repository_service.get_commit_diff)(commitid) - ) - except TorngitError: - # When this happens, we have that commit.totals["diff"] is not available. - # Since there is no way to calculate such diff without the git commit, - # then we assume having the rest of the report saved there is better than the - # alternative of refusing an otherwise "good" report because of the lack of diff - log.warning( - "Could not apply diff to report because there was an error fetching diff from provider", - extra=dict( - repoid=commit.repoid, - commit=commit.commitid, - parent_task=self.request.parent_id, - ), - exc_info=True, - ) - except RepositoryWithoutValidBotError: - save_commit_error( - commit, - error_code=CommitErrorTypes.REPO_BOT_INVALID.value, - error_params=dict( - repoid=commit.repoid, - pr=pr, - ), - ) +@sentry_sdk.trace +@cache.cache_function(ttl=60 * 60) # the commit diff is immutable +def load_commit_diff( + commit: Commit, pr: Pull | None, task_name: str | None +) -> dict | None: + repository = commit.repository + commitid = commit.commitid + try: + installation_name_to_use = ( + get_installation_name_for_owner_for_task(task_name, repository.owner) + if task_name + else GITHUB_APP_INSTALLATION_DEFAULT_NAME + ) + repository_service = get_repo_provider_service( + repository, installation_name_to_use=installation_name_to_use + ) + return async_to_sync(repository_service.get_commit_diff)(commitid) + + # TODO: can we maybe get rid of all this logging? + except TorngitError: + # When this happens, we have that commit.totals["diff"] is not available. + # Since there is no way to calculate such diff without the git commit, + # then we assume having the rest of the report saved there is better than the + # alternative of refusing an otherwise "good" report because of the lack of diff + log.warning( + "Could not apply diff to report because there was an error fetching diff from provider", + extra=dict( + repoid=commit.repoid, + commit=commit.commitid, + ), + exc_info=True, + ) + except RepositoryWithoutValidBotError: + save_commit_error( + commit, + error_code=CommitErrorTypes.REPO_BOT_INVALID.value, + error_params=dict( + repoid=commit.repoid, + pr=pr, + ), + ) + + log.warning( + "Could not apply diff to report because there is no valid bot found for that repo", + extra=dict( + repoid=commit.repoid, + commit=commit.commitid, + ), + exc_info=True, + ) + + return None + + +@sentry_sdk.trace +def save_report_results( + report_service: ReportService, + commit: Commit, + report: Report, + diff: dict | None, + # TODO: maybe remove this parameter, as its only used to update `commit`: + pr: Pull | None, + report_code=None, +): + """Saves the result of `report` to the commit database and chunks archive + + This method only takes care of getting a processed Report to the database and archive. + + It also tries to calculate the diff of the report (which uses commit info + from th git provider), but it it fails to do so, it just moves on without such diff + """ + log.debug("In save_report_results for commit: %s" % commit) + + if diff: + report.apply_diff(diff) + + if pr is not None: + try: + commit.pullid = int(pr) + except (ValueError, TypeError): log.warning( - "Could not apply diff to report because there is no valid bot found for that repo", - extra=dict( - repoid=commit.repoid, - commit=commit.commitid, - parent_task=self.request.parent_id, - ), - exc_info=True, + "Cannot set PR value on commit", + extra=dict(repoid=commit.repoid, commit=commit.commitid, pr_value=pr), ) - if pr is not None: - try: - commit.pullid = int(pr) - except (ValueError, TypeError): - log.warning( - "Cannot set PR value on commit", - extra=dict( - repoid=commit.repoid, commit=commit.commitid, pr_value=pr - ), - ) - res = report_service.save_report(commit, report, report_code) - db_session.commit() - return res - -RegisteredUploadTask = celery_app.register_task(UploadProcessorTask()) -upload_processor_task = celery_app.tasks[RegisteredUploadTask.name] + res = report_service.save_report(commit, report, report_code) + commit.get_db_session().commit() + return res