From dcba3c23031d13f8b54833c2d34c62cfabdb9f3b Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Sun, 27 Oct 2024 14:02:21 -0400 Subject: [PATCH 1/5] fix libyear parse Signed-off-by: Isaac Milarsky --- .../libyear_util/pypi_parser.py | 6 +- .../libyear_util/util.py | 56 ++++++++++--------- 2 files changed, 32 insertions(+), 30 deletions(-) diff --git a/augur/tasks/git/dependency_libyear_tasks/libyear_util/pypi_parser.py b/augur/tasks/git/dependency_libyear_tasks/libyear_util/pypi_parser.py index dab06b1a09..1824322f48 100644 --- a/augur/tasks/git/dependency_libyear_tasks/libyear_util/pypi_parser.py +++ b/augur/tasks/git/dependency_libyear_tasks/libyear_util/pypi_parser.py @@ -140,12 +140,12 @@ def parse_poetry_lock(file_handle): group = 'runtime' for package in manifest['package']: req = None - if package['category'] == 'main': + if package.get('category') == 'main': group = 'runtime' - if package['category'] == 'dev': + if package.get('category') == 'dev': group = 'develop' if 'version' in package: - req = package['version'] + req = package.get('version') elif 'git' in package: req = package['git']+'#'+package['ref'] Dict = {'name': package['name'], 'requirement': req, 'type': group, 'package': 'PYPI'} diff --git a/augur/tasks/git/dependency_libyear_tasks/libyear_util/util.py b/augur/tasks/git/dependency_libyear_tasks/libyear_util/util.py index 111d3fc631..ffa2d4a84a 100644 --- a/augur/tasks/git/dependency_libyear_tasks/libyear_util/util.py +++ b/augur/tasks/git/dependency_libyear_tasks/libyear_util/util.py @@ -32,54 +32,56 @@ def get_parsed_deps(path,logger): deps_file = None dependency_list = list() - for f in file_list: deps_file = find(f, path) - if not deps_file: + + if not deps_file or not f: continue file_handle= open(deps_file) - if f == 'Requirement.txt': - dependency_list = parse_requirement_txt(file_handle) + short_file_name = os.path.split(deps_file)[-1] + + if short_file_name == 'Requirement.txt': + 
dependency_list.extend(parse_requirement_txt(file_handle)) - elif f == 'requirements.txt': - dependency_list = parse_requirement_txt(file_handle) + if short_file_name == 'requirements.txt': + dependency_list.extend(parse_requirement_txt(file_handle)) - elif f == 'setup.py': - dependency_list = parse_setup_py(file_handle) + if short_file_name == 'setup.py': + dependency_list.extend(parse_setup_py(file_handle)) - elif f == 'Pipfile': - dependency_list = parse_pipfile(file_handle) + if short_file_name == 'Pipfile': + dependency_list.extend(parse_pipfile(file_handle)) - elif f == 'Pipfile.lock': - dependency_list = parse_pipfile_lock(file_handle) + if short_file_name == 'Pipfile.lock': + dependency_list.extend(parse_pipfile_lock(file_handle)) - elif f == 'pyproject.toml': - dependency_list = parse_poetry(file_handle) + if short_file_name == 'pyproject.toml': + dependency_list.extend(parse_poetry(file_handle)) - elif f == 'poetry.lock': - dependency_list = parse_poetry_lock(file_handle) + if short_file_name == 'poetry.lock': + dependency_list.extend(parse_poetry_lock(file_handle)) - elif f == 'environment.yml': - dependency_list = parse_conda(file_handle) + if short_file_name == 'environment.yml': + dependency_list.extend(parse_conda(file_handle)) - elif f == 'environment.yaml': - dependency_list = parse_conda(file_handle) + if short_file_name == 'environment.yaml': + dependency_list.extend(parse_conda(file_handle)) - elif f == 'environment.yml.lock': - dependency_list = parse_conda(file_handle) + if short_file_name == 'environment.yml.lock': + dependency_list.extend(parse_conda(file_handle)) - elif f == 'environment.yaml.lock': - dependency_list = parse_conda(file_handle) + if short_file_name == 'environment.yaml.lock': + dependency_list.extend(parse_conda(file_handle)) - elif f == 'package.json': + if short_file_name == 'package.json': try: - dependency_list = parse_package_json(file_handle) + dependency_list.extend(parse_package_json(file_handle)) except KeyError as e: 
logger.error(f"package.json for repo at path {path} is missing required key: {e}\n Skipping file...") - return dependency_list + return dependency_list def get_libyear(current_version, current_release_date, latest_version, latest_release_date): From 500730a5fd7d02f7b67cd2b8a25027977c7712cb Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Wed, 13 Nov 2024 00:04:12 -0600 Subject: [PATCH 2/5] set closed at as merged at when closed at is missing --- augur/application/db/data_parse.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/augur/application/db/data_parse.py b/augur/application/db/data_parse.py index f1d0fdff50..71bde0aa21 100644 --- a/augur/application/db/data_parse.py +++ b/augur/application/db/data_parse.py @@ -791,6 +791,11 @@ def extract_needed_pr_data_from_gitlab_merge_request(pr, repo_id, tool_source, t Returns: Parsed pr dict """ + pr_closed_datetime = pr['closed_at'] + pr_merged_datetime = pr['merged_at'] + + if not pr_closed_datetime: + pr_closed_datetime = pr_merged_datetime pr_dict = { 'repo_id': repo_id, @@ -810,8 +815,8 @@ def extract_needed_pr_data_from_gitlab_merge_request(pr, repo_id, tool_source, t 'pr_body': pr['description'], 'pr_created_at': pr['created_at'], 'pr_updated_at': pr['updated_at'], - 'pr_closed_at': pr['closed_at'], - 'pr_merged_at': pr['merged_at'], + 'pr_closed_at': pr_closed_datetime, + 'pr_merged_at': pr_merged_datetime, 'pr_merge_commit_sha': pr['merge_commit_sha'], 'pr_teams': None, 'pr_milestone': pr['milestone'].get('title') if pr['milestone'] else None, From fcf833fac44b240df0d48edca0645ac06bb6b682 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Tue, 14 Jan 2025 12:57:38 -0600 Subject: [PATCH 3/5] only collect messages on data that was updated --- augur/tasks/github/messages.py | 41 +++++++++++++++++++++++++--------- augur/tasks/start_tasks.py | 2 +- 2 files changed, 31 insertions(+), 12 deletions(-) diff --git a/augur/tasks/github/messages.py b/augur/tasks/github/messages.py index 
86fbe054fa..c36abc24ae 100644 --- a/augur/tasks/github/messages.py +++ b/augur/tasks/github/messages.py @@ -1,5 +1,5 @@ import logging - +from datetime import timedelta, timezone from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask @@ -10,12 +10,13 @@ from augur.tasks.github.util.util import get_owner_repo from augur.application.db.models import PullRequest, Message, Issue, PullRequestMessageRef, IssueMessageRef, Contributor, Repo, CollectionStatus from augur.application.db import get_engine, get_session +from augur.application.db.lib import get_core_data_last_collected from sqlalchemy.sql import text platform_id = 1 @celery.task(base=AugurCoreRepoCollectionTask) -def collect_github_messages(repo_git: str) -> None: +def collect_github_messages(repo_git: str, full_collection: bool) -> None: logger = logging.getLogger(collect_github_messages.__name__) @@ -29,9 +30,15 @@ def collect_github_messages(repo_git: str) -> None: owner, repo = get_owner_repo(repo_git) task_name = f"{owner}/{repo}: Message Task" + if full_collection: + core_data_last_collected = None + else: + # subtract 2 days to ensure all data is collected + core_data_last_collected = (get_core_data_last_collected(repo_id) - timedelta(days=2)).replace(tzinfo=timezone.utc) + if is_repo_small(repo_id): - message_data = fast_retrieve_all_pr_and_issue_messages(repo_git, logger, manifest.key_auth, task_name) + message_data = fast_retrieve_all_pr_and_issue_messages(repo_git, logger, manifest.key_auth, task_name, core_data_last_collected) if message_data: process_messages(message_data, task_name, repo_id, logger, augur_db) @@ -40,7 +47,7 @@ def collect_github_messages(repo_git: str) -> None: logger.info(f"{owner}/{repo} has no messages") else: - process_large_issue_and_pr_message_collection(repo_id, repo_git, logger, manifest.key_auth, task_name, augur_db) + process_large_issue_and_pr_message_collection(repo_id, repo_git, logger, 
manifest.key_auth, task_name, augur_db, core_data_last_collected) def is_repo_small(repo_id): @@ -51,13 +58,16 @@ def is_repo_small(repo_id): return result != None -def fast_retrieve_all_pr_and_issue_messages(repo_git: str, logger, key_auth, task_name) -> None: +def fast_retrieve_all_pr_and_issue_messages(repo_git: str, logger, key_auth, task_name, since) -> None: owner, repo = get_owner_repo(repo_git) # url to get issue and pull request comments url = f"https://api.github.com/repos/{owner}/{repo}/issues/comments" + if since: + url += f"&since={since.isoformat()}" + # define logger for task logger.info(f"Collecting github comments for {owner}/{repo}") @@ -70,7 +80,7 @@ def fast_retrieve_all_pr_and_issue_messages(repo_git: str, logger, key_auth, tas return list(github_data_access.paginate_resource(url)) -def process_large_issue_and_pr_message_collection(repo_id, repo_git: str, logger, key_auth, task_name, augur_db) -> None: +def process_large_issue_and_pr_message_collection(repo_id, repo_git: str, logger, key_auth, task_name, augur_db, since) -> None: owner, repo = get_owner_repo(repo_git) @@ -81,11 +91,20 @@ def process_large_issue_and_pr_message_collection(repo_id, repo_git: str, logger with engine.connect() as connection: - query = text(f""" - (select pr_comments_url from pull_requests WHERE repo_id={repo_id} order by pr_created_at desc) - UNION - (select comments_url as comment_url from issues WHERE repo_id={repo_id} order by created_at desc); - """) + if since: + query = text(f""" + (select pr_comments_url from pull_requests WHERE repo_id={repo_id} AND pr_updated_at > {since} order by pr_created_at desc) + UNION + (select comments_url as comment_url from issues WHERE repo_id={repo_id} AND updated_at > {since} order by created_at desc); + """) + else: + + query = text(f""" + (select pr_comments_url from pull_requests WHERE repo_id={repo_id} order by pr_created_at desc) + UNION + (select comments_url as comment_url from issues WHERE repo_id={repo_id} order by 
created_at desc); + """) + result = connection.execute(query).fetchall() comment_urls = [x[0] for x in result] diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index ab4cf217ce..8aa767ece6 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -74,7 +74,7 @@ def primary_repo_collect_phase(repo_git, full_collection): #Define secondary group that can't run until after primary jobs have finished. secondary_repo_jobs = group( collect_events.si(repo_git),#*create_grouped_task_load(dataList=first_pass, task=collect_events).tasks, - collect_github_messages.si(repo_git), #*create_grouped_task_load(dataList=first_pass,task=collect_github_messages).tasks, + collect_github_messages.si(repo_git, full_collection), #*create_grouped_task_load(dataList=first_pass,task=collect_github_messages).tasks, collect_github_repo_clones_data.si(repo_git), ) From e99588de3b97ac2553ac8cf81773498174020c49 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Tue, 14 Jan 2025 18:26:20 -0600 Subject: [PATCH 4/5] change & to ? 
--- augur/tasks/github/messages.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/augur/tasks/github/messages.py b/augur/tasks/github/messages.py index c36abc24ae..4598e120ff 100644 --- a/augur/tasks/github/messages.py +++ b/augur/tasks/github/messages.py @@ -66,7 +66,7 @@ def fast_retrieve_all_pr_and_issue_messages(repo_git: str, logger, key_auth, tas url = f"https://api.github.com/repos/{owner}/{repo}/issues/comments" if since: - url += f"&since={since.isoformat()}" + url += f"?since={since.isoformat()}" # define logger for task logger.info(f"Collecting github comments for {owner}/{repo}") From 34994f0c98ea068ea70b3f0a7ec06ea532b2bded Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Tue, 14 Jan 2025 18:35:00 -0600 Subject: [PATCH 5/5] fix sql synax error --- augur/tasks/github/messages.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/augur/tasks/github/messages.py b/augur/tasks/github/messages.py index 4598e120ff..7f1e63ea8c 100644 --- a/augur/tasks/github/messages.py +++ b/augur/tasks/github/messages.py @@ -93,9 +93,9 @@ def process_large_issue_and_pr_message_collection(repo_id, repo_git: str, logger if since: query = text(f""" - (select pr_comments_url from pull_requests WHERE repo_id={repo_id} AND pr_updated_at > {since} order by pr_created_at desc) + (select pr_comments_url from pull_requests WHERE repo_id={repo_id} AND pr_updated_at > timestamptz(timestamp '{since}') order by pr_created_at desc) UNION - (select comments_url as comment_url from issues WHERE repo_id={repo_id} AND updated_at > {since} order by created_at desc); + (select comments_url as comment_url from issues WHERE repo_id={repo_id} AND updated_at > timestamptz(timestamp '{since}') order by created_at desc); """) else: