def collect_events(repo_git: str):
    """Collect GitHub issue/PR events for a repository and store them.

    Uses the repo-wide ``/issues/events`` endpoint when it can return all
    data (GitHub stops paginating at 300 pages); otherwise falls back to
    collecting events one issue/PR number at a time. Events are flushed to
    ``process_events`` in batches of 500 to bound memory, since
    ``process_events`` re-queries all issues and PRs on every call.

    Args:
        repo_git: clone URL of the repository to collect events for.
    """
    logger = logging.getLogger(collect_events.__name__)

    repo_obj = get_repo_by_repo_git(repo_git)
    repo_id = repo_obj.repo_id

    owner, repo = get_owner_repo(repo_git)

    logger.debug(f"Collecting Github events for {owner}/{repo}")

    key_auth = GithubRandomKeyAuth(logger)

    # BUG FIX: the helper's signature is (key_auth, logger, owner, repo);
    # the previous call passed repo_id alone and raised TypeError at runtime.
    if bulk_events_collection_endpoint_contains_all_data(key_auth, logger, owner, repo):
        event_generator = bulk_collect_pr_and_issue_events(repo_git, logger, key_auth)
    else:
        event_generator = collect_pr_and_issues_events_by_number(
            repo_id, repo_git, logger, key_auth, f"{owner}/{repo}: Event task"
        )

    events = []
    for event in event_generator:
        events.append(event)

        # making this a decent size since process_events retrieves all the
        # issues and prs each time
        if len(events) >= 500:
            process_events(events, f"{owner}/{repo}: Event task", repo_id, logger)
            events.clear()

    if events:
        process_events(events, f"{owner}/{repo}: Event task", repo_id, logger)


def bulk_events_collection_endpoint_contains_all_data(key_auth, logger, owner, repo):
    """Return True if the repo-wide events endpoint can return every event.

    GitHub caps pagination on endpoints like ``/issues/events`` at 300
    pages, so a reported page count of exactly 300 means the data is
    truncated and the caller must fall back to per-number collection.

    Args:
        key_auth: GitHub key auth used to build the data-access client.
        logger: logger passed through to the data-access client.
        owner: repository owner login.
        repo: repository name.

    Raises:
        Exception: if more than 300 pages are reported — either GitHub
            raised its pagination cap, or this check was applied to an
            endpoint that has no page limit.
    """
    url = f"https://api.github.com/repos/{owner}/{repo}/issues/events"

    github_data_access = GithubDataAccess(key_auth, logger)

    page_count = github_data_access.get_resource_page_count(url)

    if page_count > 300:
        # NOTE: the message previously named is_pagination_limited_by_max_github_pages,
        # a helper removed in this same change; reworded to stay accurate.
        raise Exception(
            f"Either github raised the paginator page limit for things like events and messages, "
            f"or this page-limit check is being used on a resource that does not have a page limit. Url: {url}"
        )

    return page_count != 300


def bulk_collect_pr_and_issue_events(repo_git: str, logger, key_auth):
    """Return a generator over every event from the repo-wide events endpoint.

    Args:
        repo_git: clone URL of the repository.
        logger: logger passed through to the data-access client.
        key_auth: GitHub key auth used to build the data-access client.
    """
    owner, repo = get_owner_repo(repo_git)

    logger.debug(f"Collecting Github events for {owner}/{repo}")

    url = f"https://api.github.com/repos/{owner}/{repo}/issues/events"

    github_data_access = GithubDataAccess(key_auth, logger)

    return github_data_access.paginate_resource(url)


def collect_pr_and_issues_events_by_number(repo_id, repo_git: str, logger, key_auth, task_name):
    """Yield events one issue/PR number at a time.

    Fallback path for repositories whose bulk events endpoint is truncated
    by GitHub's 300-page cap: look up every stored PR and issue number for
    the repo, then paginate each number's individual events endpoint.

    Args:
        repo_id: primary key of the repo in the augur database.
        repo_git: clone URL of the repository.
        logger: logger passed through to the data-access client.
        key_auth: GitHub key auth used to build the data-access client.
        task_name: human-readable task label (currently unused; kept for
            interface compatibility with callers).

    Yields:
        Event payload dicts from the GitHub API.
    """
    owner, repo = get_owner_repo(repo_git)

    logger.debug(f"Collecting github events for {owner}/{repo}")

    engine = get_engine()

    with engine.connect() as connection:
        # Bound parameter instead of f-string interpolation: repo_id comes
        # from our own DB, but parameterizing is the safe SQLAlchemy idiom.
        query = text("""
            (SELECT pr_src_number AS number FROM pull_requests WHERE repo_id = :repo_id ORDER BY pr_created_at DESC)
            UNION
            (SELECT gh_issues_number AS number FROM issues WHERE repo_id = :repo_id ORDER BY created_at DESC);
        """)

        result = connection.execute(query, {"repo_id": repo_id}).fetchall()

    numbers = [row[0] for row in result]

    github_data_access = GithubDataAccess(key_auth, logger)

    for number in numbers:
        event_url = f"https://api.github.com/repos/{owner}/{repo}/issues/{number}/events"

        yield from github_data_access.paginate_resource(event_url)
logger.info(f"{task_name}: Skipping") + logger.warning(f"{task_name}: Could not find related pr. We were searching for: {pr_url}") continue pr_event_dicts.append( @@ -122,9 +164,7 @@ def process_events(events, task_name, repo_id, logger): # query = augur_db.session.query(Issue).filter(Issue.issue_url == issue_url) # related_issue = execute_session_query(query, 'one') except KeyError: - logger.info(f"{task_name}: Could not find related pr") - logger.info(f"{task_name}: We were searching for: {issue_url}") - logger.info(f"{task_name}: Skipping") + logger.warning(f"{task_name}: Could not find related issue. We were searching for: {issue_url}") continue issue_event_dicts.append( diff --git a/augur/tasks/github/util/github_data_access.py b/augur/tasks/github/util/github_data_access.py index 2f4c988014..850336f53c 100644 --- a/augur/tasks/github/util/github_data_access.py +++ b/augur/tasks/github/util/github_data_access.py @@ -65,17 +65,12 @@ def paginate_resource(self, url): return - def is_pagination_limited_by_max_github_pages(self, url): - - page_count = self.get_resource_page_count(url) - - return page_count <= 299 - def get_resource_page_count(self, url): response = self.make_request_with_retries(url, method="HEAD") if 'last' not in response.links.keys(): + self.logger.warning(f"Github response without links. Headers: {response.headers}.") return 1 try: