From 20bd29d592dda65946bee89ceca0d81167215c0d Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Mon, 4 Dec 2023 14:36:31 -0600 Subject: [PATCH 01/25] Implement gitlab issues labels and assignees Signed-off-by: Andrew Brain --- augur/application/db/data_parse.py | 49 ++++++++++++++++++++++++++ augur/tasks/github/issues/tasks.py | 2 +- augur/tasks/gitlab/issues_task.py | 56 +++++++++++++++++++++++++++--- 3 files changed, 102 insertions(+), 5 deletions(-) diff --git a/augur/application/db/data_parse.py b/augur/application/db/data_parse.py index 4c618860ba..cf11f4341b 100644 --- a/augur/application/db/data_parse.py +++ b/augur/application/db/data_parse.py @@ -247,6 +247,28 @@ def extract_needed_issue_assignee_data(assignees: List[dict], repo_id: int, tool return assignee_dicts +def extract_needed_gitlab_issue_assignee_data(assignees: List[dict], repo_id: int, tool_source: str, tool_version: str, data_source: str) -> List[dict]: + + if len(assignees) == 0: + return [] + + assignee_dicts = [] + for assignee in assignees: + + assignee_dict = { + "cntrb_id": None, + "tool_source": tool_source, + "tool_version": tool_version, + "data_source": data_source, + "issue_assignee_src_id": assignee['id'], + "issue_assignee_src_node": None, + "repo_id": repo_id + } + + assignee_dicts.append(assignee_dict) + + return assignee_dicts + # retrieve only the needed data for pr labels from the api response @@ -277,6 +299,33 @@ def extract_needed_issue_label_data(labels: List[dict], repo_id: int, tool_sourc return label_dicts +def extract_needed_gitlab_issue_label_data(labels: List[dict], repo_id: int, tool_source: str, tool_version: str, data_source: str) -> List[dict]: + + if len(labels) == 0: + return [] + + label_dicts = [] + for label in labels: + + print(f"Processing repo id for issue label: {repo_id}") + + label_dict = { + "label_text": label["name"], + "label_description": label.get("description", None), + "label_color": label['color'], + "tool_source": tool_source, + 
"tool_version": tool_version, + "data_source": data_source, + "label_src_id": label['id'], + "label_src_node_id": None, + "repo_id": repo_id + } + + label_dicts.append(label_dict) + + return label_dicts + + # retrieve only the needed data for pr labels from the api response def extract_needed_issue_message_ref_data(message: dict, issue_id: int, repo_id: int, tool_source: str, tool_version: str, data_source: str) -> List[dict]: diff --git a/augur/tasks/github/issues/tasks.py b/augur/tasks/github/issues/tasks.py index 5380b8bf10..0ba793470e 100644 --- a/augur/tasks/github/issues/tasks.py +++ b/augur/tasks/github/issues/tasks.py @@ -195,7 +195,7 @@ def process_issues(issues, task_name, repo_id, logger, augur_db) -> None: issue_assignee_dicts += add_key_value_pair_to_dicts(other_issue_data["assignees"], "issue_id", issue_id) - logger.info(f"{task_name}: Inserting other issue data of lengths: Labels: {len(issue_label_dicts)} - Assignees: {len(issue_assignee_dicts)}") + logger.info(f"{task_name}: Inserting other github issue data of lengths: Labels: {len(issue_label_dicts)} - Assignees: {len(issue_assignee_dicts)}") # inserting issue labels # we are using label_src_id and issue_id to determine if the label is already in the database. 
diff --git a/augur/tasks/gitlab/issues_task.py b/augur/tasks/gitlab/issues_task.py index a5a55f353a..612a5d36c4 100644 --- a/augur/tasks/gitlab/issues_task.py +++ b/augur/tasks/gitlab/issues_task.py @@ -5,9 +5,9 @@ from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask from augur.tasks.gitlab.gitlab_paginator import GitlabPaginator from augur.tasks.gitlab.gitlab_task_session import GitlabTaskManifest -from augur.application.db.data_parse import extract_needed_issue_data_from_gitlab_issue -from augur.tasks.github.util.util import get_owner_repo -from augur.application.db.models import Issue, Repo +from augur.application.db.data_parse import extract_needed_issue_data_from_gitlab_issue, extract_needed_gitlab_issue_label_data, extract_needed_gitlab_issue_assignee_data +from augur.tasks.github.util.util import get_owner_repo, add_key_value_pair_to_dicts +from augur.application.db.models import Issue, IssueLabel, IssueAssignee, Repo from augur.application.db.util import execute_session_query @@ -80,12 +80,26 @@ def process_issues(issues, task_name, repo_id, logger, augur_db) -> None: data_source = "Gitlab API" issue_dicts = [] + issue_mapping_data = {} for issue in issues: issue_dicts.append( extract_needed_issue_data_from_gitlab_issue(issue, repo_id, tool_source, tool_version, data_source) ) + issue_labels = extract_needed_gitlab_issue_label_data(issue["labels"], repo_id, + tool_source, tool_version, data_source) + + issue_assignees = extract_needed_gitlab_issue_assignee_data(issue["assignees"], repo_id, + tool_source, tool_version, data_source) + + mapping_data_key = issue["id"] + issue_mapping_data[mapping_data_key] = { + "labels": issue_labels, + "assignees": issue_assignees, + } + + if len(issue_dicts) == 0: print("No gitlab issues found while processing") return @@ -93,5 +107,39 @@ def process_issues(issues, task_name, repo_id, logger, augur_db) -> None: logger.info(f"{task_name}: Inserting {len(issue_dicts)} gitlab issues") issue_natural_keys = 
["repo_id", "gh_issue_id"] issue_string_columns = ["issue_title", "issue_body"] + issue_return_columns = ["gh_issue_id", "issue_id"] + + issue_return_data = augur_db.insert_data(issue_dicts, Issue, issue_natural_keys, return_columns=issue_return_columns, string_fields=issue_string_columns) + + issue_label_dicts = [] + issue_assignee_dicts = [] + for data in issue_return_data: + + gh_issue_id = data["gh_issue_id"] + issue_id = data["issue_id"] + + try: + other_issue_data = issue_mapping_data[gh_issue_id] + except KeyError as e: + logger.info(f"{task_name}: Cold not find other gitlab issue data. This should never happen. Error: {e}") + + + # add the issue id to the lables and assignees, then add them to a list of dicts that will be inserted soon + dict_key = "issue_id" + issue_label_dicts += add_key_value_pair_to_dicts(other_issue_data["labels"], dict_key, issue_id) + issue_assignee_dicts += add_key_value_pair_to_dicts(other_issue_data["assignees"], dict_key, issue_id) + + + logger.info(f"{task_name}: Inserting other gitlab issue data of lengths: Labels: {len(issue_label_dicts)} - Assignees: {len(issue_assignee_dicts)}") + + # inserting issue labels + # we are using label_src_id and issue_id to determine if the label is already in the database. + issue_label_natural_keys = ['label_src_id', 'issue_id'] + issue_label_string_fields = ["label_text", "label_description"] + augur_db.insert_data(issue_label_dicts, IssueLabel, + issue_label_natural_keys, string_fields=issue_label_string_fields) - augur_db.insert_data(issue_dicts, Issue, issue_natural_keys, string_fields=issue_string_columns) \ No newline at end of file + # inserting issue assignees + # we are using issue_assignee_src_id and issue_id to determine if the label is already in the database. 
+ issue_assignee_natural_keys = ['issue_assignee_src_id', 'issue_id'] + augur_db.insert_data(issue_assignee_dicts, IssueAssignee, issue_assignee_natural_keys) \ No newline at end of file From 11340d4985bf0634f0b10573cedef381b8e9791b Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Mon, 4 Dec 2023 14:42:16 -0600 Subject: [PATCH 02/25] Missed in last commit Signed-off-by: Andrew Brain --- augur/tasks/gitlab/issues_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/augur/tasks/gitlab/issues_task.py b/augur/tasks/gitlab/issues_task.py index 612a5d36c4..8e40e65f13 100644 --- a/augur/tasks/gitlab/issues_task.py +++ b/augur/tasks/gitlab/issues_task.py @@ -50,7 +50,7 @@ def retrieve_all_gitlab_issue_data(repo_git, logger, key_auth) -> None: logger.info(f"Collecting gitlab issues for {owner}/{repo}") - url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/issues" + url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/issues?with_labels_details=True" issues = GitlabPaginator(url, key_auth, logger) all_data = [] From de4217e4c13e515a72276e3ab950bb1bd112d675 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Fri, 8 Dec 2023 13:24:57 -0600 Subject: [PATCH 03/25] Fix small bugs in assignees and labels for mrs --- augur/application/db/data_parse.py | 57 +++++++++++++++++- .../application/db/models/augur_operations.py | 18 +++--- augur/tasks/gitlab/merge_request_task.py | 58 +++++++++++++++---- augur/tasks/start_tasks.py | 6 +- 4 files changed, 112 insertions(+), 27 deletions(-) diff --git a/augur/application/db/data_parse.py b/augur/application/db/data_parse.py index cf11f4341b..6cc94494ae 100644 --- a/augur/application/db/data_parse.py +++ b/augur/application/db/data_parse.py @@ -37,6 +37,34 @@ def extract_needed_pr_label_data(labels: List[dict], repo_id: int, tool_source: return label_dicts + +def extract_needed_mr_label_data(labels: List[dict], repo_id: int, tool_source: str, tool_version: str, data_source: str) -> List[dict]: + + if 
len(labels) == 0: + return [] + + label_dicts = [] + for label in labels: + + label_dict = { + 'pr_src_id': label['id'], + 'pr_src_node_id': None, + 'pr_src_url': None, + 'pr_src_description': label['name'], + 'pr_src_color': label['color'], + # TODO: Populate this by making an api call for each label + 'pr_src_default_bool': None, + 'tool_source': tool_source, + 'tool_version': tool_version, + 'data_source': data_source, + 'repo_id': repo_id + } + + label_dicts.append(label_dict) + + return label_dicts + + # retrieve only the needed data for pr assignees from the api response def extract_needed_pr_assignee_data(assignees: List[dict], repo_id: int, tool_source: str, tool_version: str, data_source: str) -> List[dict]: @@ -48,7 +76,6 @@ def extract_needed_pr_assignee_data(assignees: List[dict], repo_id: int, tool_so for assignee in assignees: assignee_dict = { - # store the pr_url data on in the pr assignee data for now so we can relate it back to a pr later 'contrib_id': assignee["cntrb_id"], 'pr_assignee_src_id': int(assignee['id']), 'tool_source': tool_source, @@ -61,6 +88,30 @@ def extract_needed_pr_assignee_data(assignees: List[dict], repo_id: int, tool_so return assignee_dicts +def extract_needed_merge_request_assignee_data(assignees: List[dict], repo_id: int, tool_source: str, tool_version: str, data_source: str) -> List[dict]: + + if len(assignees) == 0: + return [] + + assignee_dicts = [] + for assignee in assignees: + + assignee_dict = { + 'contrib_id': None, + 'repo_id': repo_id, + # TODO: Temporarily setting this to id which the id of the contributor, unitl we can get the contrib_id set and create a unique on the contrib_id and the pull_request_id + 'pr_assignee_src_id': assignee["id"], + 'tool_source': tool_source, + 'tool_version': tool_version, + 'data_source': data_source + } + + assignee_dicts.append(assignee_dict) + + return assignee_dicts + + + # retrieve only the needed data for pr reviewers from the api response def 
extract_needed_pr_reviewer_data(reviewers: List[dict], repo_id: int, tool_source: str, tool_version: str, data_source: str) -> List[dict]: @@ -307,8 +358,6 @@ def extract_needed_gitlab_issue_label_data(labels: List[dict], repo_id: int, too label_dicts = [] for label in labels: - print(f"Processing repo id for issue label: {repo_id}") - label_dict = { "label_text": label["name"], "label_description": label.get("description", None), @@ -640,3 +689,5 @@ def extract_needed_issue_data_from_gitlab_issue(issue: dict, repo_id: int, tool_ return issue_dict + + diff --git a/augur/application/db/models/augur_operations.py b/augur/application/db/models/augur_operations.py index 0b9a4ee4c4..47f28b12f2 100644 --- a/augur/application/db/models/augur_operations.py +++ b/augur/application/db/models/augur_operations.py @@ -1245,15 +1245,15 @@ def insert(session, repo_id): github_weight = None session.logger.error( ''.join(traceback.format_exception(None, e, e.__traceback__))) - - try: - #pr_issue_count = 0 - github_weight = pr_issue_count - calculate_date_weight_from_timestamps(repo.repo_added, None) - except Exception as e: - pr_issue_count = None - github_weight = None - session.logger.error( - ''.join(traceback.format_exception(None, e, e.__traceback__))) + else: + try: + pr_issue_count = 0 + github_weight = pr_issue_count - calculate_date_weight_from_timestamps(repo.repo_added, None) + except Exception as e: + pr_issue_count = None + github_weight = None + session.logger.error( + ''.join(traceback.format_exception(None, e, e.__traceback__))) record = { diff --git a/augur/tasks/gitlab/merge_request_task.py b/augur/tasks/gitlab/merge_request_task.py index cfe849ba8a..5e6597a678 100644 --- a/augur/tasks/gitlab/merge_request_task.py +++ b/augur/tasks/gitlab/merge_request_task.py @@ -4,9 +4,9 @@ from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask from augur.tasks.gitlab.gitlab_paginator import GitlabPaginator from augur.tasks.gitlab.gitlab_task_session import 
GitlabTaskManifest -from augur.application.db.data_parse import extract_needed_pr_data_from_gitlab_merge_request -from augur.tasks.github.util.util import get_owner_repo -from augur.application.db.models import PullRequest, Repo +from augur.application.db.data_parse import extract_needed_pr_data_from_gitlab_merge_request, extract_needed_merge_request_assignee_data, extract_needed_mr_label_data +from augur.tasks.github.util.util import get_owner_repo, add_key_value_pair_to_dicts +from augur.application.db.models import PullRequest, PullRequestAssignee, PullRequestLabel, Repo @celery.task(base=AugurCoreRepoCollectionTask) @@ -40,7 +40,7 @@ def retrieve_all_mr_data(repo_git: str, logger, key_auth) -> None: logger.info(f"Collecting pull requests for {owner}/{repo}") - url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests" + url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests?with_labels_details=True" mrs = GitlabPaginator(url, key_auth, logger) all_data = [] @@ -67,19 +67,53 @@ def process_merge_requests(data, task_name, repo_id, logger, augur_db): tool_source = "Mr Task" tool_version = "2.0" - merge_requests = extract_needed_mr_data(data, repo_id, tool_source, tool_version) + data_source = "Gitlab API" + + merge_requests = [] + mr_mapping_data = {} + for mr in data: + merge_requests.append(extract_needed_pr_data_from_gitlab_merge_request(mr, repo_id, tool_source, tool_version)) + + assignees = extract_needed_merge_request_assignee_data(mr["assignees"], repo_id, tool_source, tool_version, data_source) + + labels = extract_needed_mr_label_data(mr["labels"], repo_id, tool_source, tool_version, data_source) + + mapping_data_key = mr["id"] + mr_mapping_data[mapping_data_key] = { + "assignees": assignees, + "labels": labels + } logger.info(f"{task_name}: Inserting mrs of length: {len(merge_requests)}") pr_natural_keys = ["repo_id", "pr_src_id"] pr_string_fields = ["pr_src_title", "pr_body"] - pr_return_data = 
augur_db.insert_data(merge_requests, PullRequest, pr_natural_keys, string_fields=pr_string_fields) + pr_return_columns = ["pull_request_id", "pr_src_id"] + pr_return_data = augur_db.insert_data(merge_requests, PullRequest, pr_natural_keys, return_columns=pr_return_columns, string_fields=pr_string_fields) + + + mr_assignee_dicts = [] + mr_label_dicts = [] + for data in pr_return_data: + + mr_src_id = data["pr_src_id"] + pull_request_id = data["pull_request_id"] + + try: + other_mr_data = mr_mapping_data[mr_src_id] + except KeyError as e: + logger.info(f"Cold not find other pr data. This should never happen. Error: {e}") + + dict_key = "pull_request_id" + mr_assignee_dicts += add_key_value_pair_to_dicts(other_mr_data["assignees"], dict_key, pull_request_id) + mr_label_dicts += add_key_value_pair_to_dicts(other_mr_data["labels"], dict_key, pull_request_id) + logger.info(f"{task_name}: Inserting other pr data of lengths: Labels: {len(mr_label_dicts)} - Assignees: {len(mr_assignee_dicts)}") -def extract_needed_mr_data(mrs, repo_id, tool_source, tool_version): - - data = [] - for mr in mrs: - data.append(extract_needed_pr_data_from_gitlab_merge_request(mr, repo_id, tool_source, tool_version)) + # TODO: Setup unique key on asignees with a value of ('cntrb_id', 'pull_request_id') and add 'cntrb_id' to assingee data + mr_assignee_natural_keys = ['pr_assignee_src_id', 'pull_request_id'] + augur_db.insert_data(mr_assignee_dicts, PullRequestAssignee, mr_assignee_natural_keys) - return data + pr_label_natural_keys = ['pr_src_id', 'pull_request_id'] + pr_label_string_fields = ["pr_src_description"] + augur_db.insert_data(mr_label_dicts, PullRequestLabel, pr_label_natural_keys, string_fields=pr_label_string_fields) diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index b99032bb82..e837a65f6b 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -95,9 +95,9 @@ def primary_repo_collect_phase(repo_git): return repo_task_group -def 
primary_gitlab_repo_collect_phase(repo_git): +def primary_repo_collect_phase_gitlab(repo_git): - logger = logging.getLogger(primary_gitlab_repo_collect_phase.__name__) + logger = logging.getLogger(primary_repo_collect_phase_gitlab.__name__) jobs = group( collect_gitlab_merge_requests.si(repo_git), @@ -166,7 +166,7 @@ def build_primary_repo_collect_request(session,enabled_phase_names, days_until_c primary_enabled_phases.append(prelim_phase) primary_enabled_phases.append(primary_repo_collect_phase) - primary_gitlab_enabled_phases.append(primary_gitlab_repo_collect_phase) + primary_gitlab_enabled_phases.append(primary_repo_collect_phase_gitlab) #task success is scheduled no matter what the config says. def core_task_success_util_gen(repo_git): From 924a7adb50aa8e3c6c07cd4a2294012597fc6948 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Fri, 8 Dec 2023 15:02:01 -0600 Subject: [PATCH 04/25] Setup passing of mr ids to next tasks Signed-off-by: Andrew Brain --- augur/tasks/gitlab/merge_request_task.py | 57 ++++++++++++++++++++++-- augur/tasks/start_tasks.py | 11 ++++- 2 files changed, 63 insertions(+), 5 deletions(-) diff --git a/augur/tasks/gitlab/merge_request_task.py b/augur/tasks/gitlab/merge_request_task.py index 5e6597a678..ae3b95e41f 100644 --- a/augur/tasks/gitlab/merge_request_task.py +++ b/augur/tasks/gitlab/merge_request_task.py @@ -26,12 +26,12 @@ def collect_gitlab_merge_requests(repo_git: str) -> int: mr_data = retrieve_all_mr_data(repo_git, logger, manifest.key_auth) if mr_data: - process_merge_requests(mr_data, f"{owner}/{repo}: Mr task", repo_id, logger, augur_db) + mr_ids = process_merge_requests(mr_data, f"{owner}/{repo}: Mr task", repo_id, logger, augur_db) - return len(mr_data) + return mr_ids else: logger.info(f"{owner}/{repo} has no merge requests") - return 0 + return [] def retrieve_all_mr_data(repo_git: str, logger, key_auth) -> None: @@ -70,8 +70,12 @@ def process_merge_requests(data, task_name, repo_id, logger, augur_db): data_source = 
"Gitlab API" merge_requests = [] + mr_ids = [] mr_mapping_data = {} for mr in data: + + mr_ids.append(mr["iid"]) + merge_requests.append(extract_needed_pr_data_from_gitlab_merge_request(mr, repo_id, tool_source, tool_version)) assignees = extract_needed_merge_request_assignee_data(mr["assignees"], repo_id, tool_source, tool_version, data_source) @@ -117,3 +121,50 @@ def process_merge_requests(data, task_name, repo_id, logger, augur_db): pr_label_string_fields = ["pr_src_description"] augur_db.insert_data(mr_label_dicts, PullRequestLabel, pr_label_natural_keys, string_fields=pr_label_string_fields) + return mr_ids + + + +@celery.task(base=AugurCoreRepoCollectionTask) +def collect_merge_request_comments(mr_ids, repo_git) -> int: + + print("Collect merge request comments") + # print(f"Repo git: {repo_git}. Len ids: {mr_ids}") + + +@celery.task(base=AugurCoreRepoCollectionTask) +def collect_merge_request_events(mr_ids, repo_git) -> int: + + print("Collect merge request events") + # print(f"Repo git: {repo_git}. Len ids: {mr_ids}") + + +@celery.task(base=AugurCoreRepoCollectionTask) +def collect_merge_request_metadata(mr_ids, repo_git) -> int: + + print("Collect merge request metadata") + # print(f"Repo git: {repo_git}. Len ids: {mr_ids}") + + +@celery.task(base=AugurCoreRepoCollectionTask) +def collect_merge_request_reviewers(mr_ids, repo_git) -> int: + + print("Collect merge request reviewers") + # print(f"Repo git: {repo_git}. Len ids: {mr_ids}") + + +@celery.task(base=AugurCoreRepoCollectionTask) +def collect_merge_request_commits(mr_ids, repo_git) -> int: + + print("Collect merge request commits") + # print(f"Repo git: {repo_git}. Len ids: {mr_ids}") + + +@celery.task(base=AugurCoreRepoCollectionTask) +def collect_merge_request_files(mr_ids, repo_git) -> int: + + print("Collect merge request files") + # print(f"Repo git: {repo_git}. 
Len ids: {mr_ids}") + + + \ No newline at end of file diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index e837a65f6b..251da00f2a 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -24,7 +24,7 @@ from augur.tasks.github.pull_requests.commits_model.tasks import process_pull_request_commits from augur.tasks.git.dependency_tasks.tasks import process_ossf_dependency_metrics from augur.tasks.github.traffic.tasks import collect_github_repo_clones_data -from augur.tasks.gitlab.merge_request_task import collect_gitlab_merge_requests +from augur.tasks.gitlab.merge_request_task import collect_gitlab_merge_requests, collect_merge_request_comments, collect_merge_request_events, collect_merge_request_metadata, collect_merge_request_reviewers, collect_merge_request_commits, collect_merge_request_files from augur.tasks.gitlab.issues_task import collect_gitlab_issues from augur.tasks.git.facade_tasks import * from augur.tasks.db.refresh_materialized_views import * @@ -100,7 +100,14 @@ def primary_repo_collect_phase_gitlab(repo_git): logger = logging.getLogger(primary_repo_collect_phase_gitlab.__name__) jobs = group( - collect_gitlab_merge_requests.si(repo_git), + chain(collect_gitlab_merge_requests.si(repo_git), group( + collect_merge_request_comments.s(repo_git), + collect_merge_request_events.s(repo_git), + collect_merge_request_reviewers.s(repo_git), + collect_merge_request_metadata.s(repo_git), + collect_merge_request_commits.s(repo_git), + collect_merge_request_files.s(repo_git) + )), collect_gitlab_issues.si(repo_git) ) From 5a80cf269667a133c12a1554adb2dcda98ad3fdd Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Fri, 8 Dec 2023 15:14:28 -0600 Subject: [PATCH 05/25] Setup issues tasks to pass ids to next tasks Signed-off-by: Andrew Brain --- augur/tasks/gitlab/issues_task.py | 27 ++++++++++++++++++++++----- augur/tasks/start_tasks.py | 7 +++++-- 2 files changed, 27 insertions(+), 7 deletions(-) diff --git 
a/augur/tasks/gitlab/issues_task.py b/augur/tasks/gitlab/issues_task.py index 8e40e65f13..75635eecd1 100644 --- a/augur/tasks/gitlab/issues_task.py +++ b/augur/tasks/gitlab/issues_task.py @@ -31,13 +31,12 @@ def collect_gitlab_issues(repo_git : str) -> int: issue_data = retrieve_all_gitlab_issue_data(repo_git, logger, manifest.key_auth) if issue_data: - total_issues = len(issue_data) - process_issues(issue_data, f"{owner}/{repo}: Gitlab Issue task", repo_id, logger, augur_db) + issue_ids = process_issues(issue_data, f"{owner}/{repo}: Gitlab Issue task", repo_id, logger, augur_db) - return total_issues + return issue_ids else: logger.info(f"{owner}/{repo} has no issues") - return 0 + return [] except Exception as e: logger.error(f"Could not collect gitlab issues for repo {repo_git}\n Reason: {e} \n Traceback: {''.join(traceback.format_exception(None, e, e.__traceback__))}") return -1 @@ -80,9 +79,12 @@ def process_issues(issues, task_name, repo_id, logger, augur_db) -> None: data_source = "Gitlab API" issue_dicts = [] + issue_ids = [] issue_mapping_data = {} for issue in issues: + issue_ids.append(issue["iid"]) + issue_dicts.append( extract_needed_issue_data_from_gitlab_issue(issue, repo_id, tool_source, tool_version, data_source) ) @@ -142,4 +144,19 @@ def process_issues(issues, task_name, repo_id, logger, augur_db) -> None: # inserting issue assignees # we are using issue_assignee_src_id and issue_id to determine if the label is already in the database. issue_assignee_natural_keys = ['issue_assignee_src_id', 'issue_id'] - augur_db.insert_data(issue_assignee_dicts, IssueAssignee, issue_assignee_natural_keys) \ No newline at end of file + augur_db.insert_data(issue_assignee_dicts, IssueAssignee, issue_assignee_natural_keys) + + return issue_ids + + + +@celery.task(base=AugurCoreRepoCollectionTask) +def collect_issue_comments(issue_ids, repo_git) -> int: + + print(f"Collect issue comments. Repo git: {repo_git}. 
Len ids: {issue_ids}") + + +@celery.task(base=AugurCoreRepoCollectionTask) +def collect_issue_events(issue_ids, repo_git) -> int: + + print(f"Collect issue events. Repo git: {repo_git}. Len ids: {issue_ids}") \ No newline at end of file diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index 251da00f2a..86216c94ee 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -25,7 +25,7 @@ from augur.tasks.git.dependency_tasks.tasks import process_ossf_dependency_metrics from augur.tasks.github.traffic.tasks import collect_github_repo_clones_data from augur.tasks.gitlab.merge_request_task import collect_gitlab_merge_requests, collect_merge_request_comments, collect_merge_request_events, collect_merge_request_metadata, collect_merge_request_reviewers, collect_merge_request_commits, collect_merge_request_files -from augur.tasks.gitlab.issues_task import collect_gitlab_issues +from augur.tasks.gitlab.issues_task import collect_gitlab_issues, collect_issue_comments, collect_issue_events from augur.tasks.git.facade_tasks import * from augur.tasks.db.refresh_materialized_views import * # from augur.tasks.data_analysis import * @@ -108,7 +108,10 @@ def primary_repo_collect_phase_gitlab(repo_git): collect_merge_request_commits.s(repo_git), collect_merge_request_files.s(repo_git) )), - collect_gitlab_issues.si(repo_git) + chain(collect_gitlab_issues.si(repo_git), group( + collect_issue_comments.s(repo_git), + collect_issue_events.s(repo_git) + )) ) return jobs From 57ce5deb422663c1df426adc7af54b77e298468e Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Fri, 8 Dec 2023 16:33:42 -0600 Subject: [PATCH 06/25] Add collection of gitlab events & comments Signed-off-by: Andrew Brain --- augur/tasks/gitlab/issues_task.py | 87 ++++++++++++++++++++++-- augur/tasks/gitlab/merge_request_task.py | 20 +++--- augur/tasks/start_tasks.py | 8 +-- 3 files changed, 97 insertions(+), 18 deletions(-) diff --git a/augur/tasks/gitlab/issues_task.py 
b/augur/tasks/gitlab/issues_task.py index 75635eecd1..5cd16ea4d0 100644 --- a/augur/tasks/gitlab/issues_task.py +++ b/augur/tasks/gitlab/issues_task.py @@ -151,12 +151,91 @@ def process_issues(issues, task_name, repo_id, logger, augur_db) -> None: @celery.task(base=AugurCoreRepoCollectionTask) -def collect_issue_comments(issue_ids, repo_git) -> int: +def collect_gitlab_issue_comments(issue_ids, repo_git) -> int: + + owner, repo = get_owner_repo(repo_git) + + logger = logging.getLogger(collect_gitlab_issues.__name__) + with GitlabTaskManifest(logger) as manifest: + + comments = retrieve_all_gitlab_issue_comments(manifest.key_auth, logger, issue_ids, repo_git) + + if comments: + logger.info(f"Length of comments: {len(comments)}") + #issue_ids = process_issues(issue_data, f"{owner}/{repo}: Gitlab Issue task", repo_id, logger, augur_db) + else: + logger.info(f"{owner}/{repo} has no gitlab issue comments") + + +def retrieve_all_gitlab_issue_comments(key_auth, logger, issue_ids, repo_git): + + owner, repo = get_owner_repo(repo_git) + + all_comments = [] + issue_count = len(issue_ids) + index = 1 + for id in issue_ids: + + print(f"Collecting {owner}/{repo} gitlab issue comments for issue {index} of {issue_count}") + + url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/issues/{id}/notes" + comments = GitlabPaginator(url, key_auth, logger) + + for page_data, page in comments.iter_pages(): + + if page_data is None or len(page_data) == 0: + break + + all_comments += page_data + + index += 1 + + return all_comments - print(f"Collect issue comments. Repo git: {repo_git}. 
Len ids: {issue_ids}") @celery.task(base=AugurCoreRepoCollectionTask) -def collect_issue_events(issue_ids, repo_git) -> int: +def collect_gitlab_issue_events(repo_git) -> int: + + owner, repo = get_owner_repo(repo_git) + + logger = logging.getLogger(collect_gitlab_issues.__name__) + with GitlabTaskManifest(logger) as manifest: + + events = retrieve_all_gitlab_issue_event_data(repo_git, logger, manifest.key_auth) + + if events: + logger.info(f"Length of events: {len(events)}") + #issue_ids = process_issues(issue_data, f"{owner}/{repo}: Gitlab Issue task", repo_id, logger, augur_db) + else: + logger.info(f"{owner}/{repo} has no gitlab issue events") + + + +def retrieve_all_gitlab_issue_event_data(repo_git, logger, key_auth) -> None: + + owner, repo = get_owner_repo(repo_git) + + logger.info(f"Collecting gitlab issue events for {owner}/{repo}") + + url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/events?target_type=issue&per_page=100" + events = GitlabPaginator(url, key_auth, logger) + + all_data = [] + num_pages = events.get_num_pages() + for page_data, page in events.iter_pages(): + + if page_data is None: + return all_data + + if len(page_data) == 0: + logger.debug( + f"{owner}/{repo}: Gitlab Issue Events Page {page} contains no data...returning") + logger.info(f"{owner}/{repo}: Issue Events Page {page} of {num_pages}") + return all_data + + logger.info(f"{owner}/{repo}: Gitlab Issue Events Page {page} of {num_pages}") + + all_data += page_data - print(f"Collect issue events. Repo git: {repo_git}. 
Len ids: {issue_ids}") \ No newline at end of file + return all_data \ No newline at end of file diff --git a/augur/tasks/gitlab/merge_request_task.py b/augur/tasks/gitlab/merge_request_task.py index ae3b95e41f..e0875b4194 100644 --- a/augur/tasks/gitlab/merge_request_task.py +++ b/augur/tasks/gitlab/merge_request_task.py @@ -134,36 +134,36 @@ def collect_merge_request_comments(mr_ids, repo_git) -> int: @celery.task(base=AugurCoreRepoCollectionTask) def collect_merge_request_events(mr_ids, repo_git) -> int: - - print("Collect merge request events") + pass + #print("Collect merge request events") # print(f"Repo git: {repo_git}. Len ids: {mr_ids}") @celery.task(base=AugurCoreRepoCollectionTask) def collect_merge_request_metadata(mr_ids, repo_git) -> int: - - print("Collect merge request metadata") + pass + #print("Collect merge request metadata") # print(f"Repo git: {repo_git}. Len ids: {mr_ids}") @celery.task(base=AugurCoreRepoCollectionTask) def collect_merge_request_reviewers(mr_ids, repo_git) -> int: - - print("Collect merge request reviewers") + pass + #print("Collect merge request reviewers") # print(f"Repo git: {repo_git}. Len ids: {mr_ids}") @celery.task(base=AugurCoreRepoCollectionTask) def collect_merge_request_commits(mr_ids, repo_git) -> int: - - print("Collect merge request commits") + pass + #print("Collect merge request commits") # print(f"Repo git: {repo_git}. Len ids: {mr_ids}") @celery.task(base=AugurCoreRepoCollectionTask) def collect_merge_request_files(mr_ids, repo_git) -> int: - - print("Collect merge request files") + pass + #print("Collect merge request files") # print(f"Repo git: {repo_git}. 
Len ids: {mr_ids}") diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index 86216c94ee..dca7f3518b 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -25,7 +25,7 @@ from augur.tasks.git.dependency_tasks.tasks import process_ossf_dependency_metrics from augur.tasks.github.traffic.tasks import collect_github_repo_clones_data from augur.tasks.gitlab.merge_request_task import collect_gitlab_merge_requests, collect_merge_request_comments, collect_merge_request_events, collect_merge_request_metadata, collect_merge_request_reviewers, collect_merge_request_commits, collect_merge_request_files -from augur.tasks.gitlab.issues_task import collect_gitlab_issues, collect_issue_comments, collect_issue_events +from augur.tasks.gitlab.issues_task import collect_gitlab_issues, collect_gitlab_issue_comments, collect_gitlab_issue_events from augur.tasks.git.facade_tasks import * from augur.tasks.db.refresh_materialized_views import * # from augur.tasks.data_analysis import * @@ -109,9 +109,9 @@ def primary_repo_collect_phase_gitlab(repo_git): collect_merge_request_files.s(repo_git) )), chain(collect_gitlab_issues.si(repo_git), group( - collect_issue_comments.s(repo_git), - collect_issue_events.s(repo_git) - )) + collect_gitlab_issue_comments.s(repo_git) + )), + collect_gitlab_issue_events.si(repo_git) ) return jobs From ca6544d3534bb8ef76e3e62fc0a6faf6f92eaa26 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Fri, 8 Dec 2023 17:01:30 -0600 Subject: [PATCH 07/25] Setup urls for merge request collections Signed-off-by: Andrew Brain --- augur/tasks/gitlab/issues_task.py | 2 +- augur/tasks/gitlab/merge_request_task.py | 36 +++++++++++++++++++----- augur/tasks/start_tasks.py | 4 +-- 3 files changed, 32 insertions(+), 10 deletions(-) diff --git a/augur/tasks/gitlab/issues_task.py b/augur/tasks/gitlab/issues_task.py index 5cd16ea4d0..6504615f8b 100644 --- a/augur/tasks/gitlab/issues_task.py +++ b/augur/tasks/gitlab/issues_task.py @@ -218,7 +218,7 
@@ def retrieve_all_gitlab_issue_event_data(repo_git, logger, key_auth) -> None: logger.info(f"Collecting gitlab issue events for {owner}/{repo}") - url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/events?target_type=issue&per_page=100" + url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/events?target_type=issue" events = GitlabPaginator(url, key_auth, logger) all_data = [] diff --git a/augur/tasks/gitlab/merge_request_task.py b/augur/tasks/gitlab/merge_request_task.py index e0875b4194..b1ecbb481b 100644 --- a/augur/tasks/gitlab/merge_request_task.py +++ b/augur/tasks/gitlab/merge_request_task.py @@ -128,41 +128,63 @@ def process_merge_requests(data, task_name, repo_id, logger, augur_db): @celery.task(base=AugurCoreRepoCollectionTask) def collect_merge_request_comments(mr_ids, repo_git) -> int: - print("Collect merge request comments") + owner, repo = get_owner_repo(repo_git) + id = mr_ids[0] + + url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/notes" # print(f"Repo git: {repo_git}. Len ids: {mr_ids}") @celery.task(base=AugurCoreRepoCollectionTask) -def collect_merge_request_events(mr_ids, repo_git) -> int: - pass +def collect_merge_request_events(repo_git) -> int: + + owner, repo = get_owner_repo(repo_git) + + url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/events?target_type=merge_request" #print("Collect merge request events") # print(f"Repo git: {repo_git}. Len ids: {mr_ids}") @celery.task(base=AugurCoreRepoCollectionTask) def collect_merge_request_metadata(mr_ids, repo_git) -> int: - pass + + owner, repo = get_owner_repo(repo_git) + id = mr_ids[0] + + url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}" #print("Collect merge request metadata") # print(f"Repo git: {repo_git}. 
Len ids: {mr_ids}") @celery.task(base=AugurCoreRepoCollectionTask) def collect_merge_request_reviewers(mr_ids, repo_git) -> int: - pass + + owner, repo = get_owner_repo(repo_git) + id = mr_ids[0] + + url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/approvals" #print("Collect merge request reviewers") # print(f"Repo git: {repo_git}. Len ids: {mr_ids}") @celery.task(base=AugurCoreRepoCollectionTask) def collect_merge_request_commits(mr_ids, repo_git) -> int: - pass + + owner, repo = get_owner_repo(repo_git) + id = mr_ids[0] + + url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/commits" #print("Collect merge request commits") # print(f"Repo git: {repo_git}. Len ids: {mr_ids}") @celery.task(base=AugurCoreRepoCollectionTask) def collect_merge_request_files(mr_ids, repo_git) -> int: - pass + + owner, repo = get_owner_repo(repo_git) + id = mr_ids[0] + + url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/cahnges" #print("Collect merge request files") # print(f"Repo git: {repo_git}. 
Len ids: {mr_ids}") diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index dca7f3518b..ae819f2030 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -102,7 +102,6 @@ def primary_repo_collect_phase_gitlab(repo_git): jobs = group( chain(collect_gitlab_merge_requests.si(repo_git), group( collect_merge_request_comments.s(repo_git), - collect_merge_request_events.s(repo_git), collect_merge_request_reviewers.s(repo_git), collect_merge_request_metadata.s(repo_git), collect_merge_request_commits.s(repo_git), @@ -111,7 +110,8 @@ chain(collect_gitlab_issues.si(repo_git), group( collect_gitlab_issue_comments.s(repo_git) )), - collect_gitlab_issue_events.si(repo_git) + collect_gitlab_issue_events.si(repo_git), + collect_merge_request_events.si(repo_git), ) return jobs From daa4106fdd9dc4b2fc63255d4f210bb8ce728b51 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Sat, 9 Dec 2023 10:13:46 -0600 Subject: [PATCH 08/25] Refactor gitlab paginator Signed-off-by: Andrew Brain --- augur/tasks/github/util/github_paginator.py | 2 + ...lab_paginator.py => gitlab_api_handler.py} | 347 +++++------------- augur/tasks/gitlab/issues_task.py | 18 +- augur/tasks/gitlab/merge_request_task.py | 140 +++++-- 4 files changed, 212 insertions(+), 295 deletions(-) rename augur/tasks/gitlab/{gitlab_paginator.py => gitlab_api_handler.py} (56%) diff --git a/augur/tasks/github/util/github_paginator.py b/augur/tasks/github/util/github_paginator.py index 548d25b0f9..31c14565df 100644 --- a/augur/tasks/github/util/github_paginator.py +++ b/augur/tasks/github/util/github_paginator.py @@ -154,6 +154,8 @@ class GithubApiResult(Enum): SECONDARY_RATE_LIMIT = 4 RATE_LIMIT_EXCEEDED = 5 ABUSE_MECHANISM_TRIGGERED = 6 + # TODO: Add bad credentials detection that removes key + # from redis if bad credentials are detected BAD_CREDENTIALS = 7 HTML = 8 EMPTY_STRING = 9 diff --git a/augur/tasks/gitlab/gitlab_paginator.py 
b/augur/tasks/gitlab/gitlab_api_handler.py similarity index 56% rename from augur/tasks/gitlab/gitlab_paginator.py rename to augur/tasks/gitlab/gitlab_api_handler.py index e7dd36b9e5..97b1690006 100644 --- a/augur/tasks/gitlab/gitlab_paginator.py +++ b/augur/tasks/gitlab/gitlab_api_handler.py @@ -1,25 +1,18 @@ -import collections import httpx import time -import json -import asyncio -import datetime import logging - from typing import List, Optional, Union, Generator, Tuple from urllib.parse import urlencode, urlparse, parse_qs, urlunparse from enum import Enum - from augur.tasks.gitlab.gitlab_random_key_auth import GitlabRandomKeyAuth from augur.tasks.github.util.util import parse_json_response class GitlabApiResult(Enum): """All the different results of querying the Gitlab API.""" - NEW_RESULT = -1 SUCCESS = 0 TIMEOUT = 1 NO_MORE_ATTEMPTS = 2 @@ -27,77 +20,12 @@ class GitlabApiResult(Enum): SECONDARY_RATE_LIMIT = 4 RATE_LIMIT_EXCEEDED = 5 ABUSE_MECHANISM_TRIGGERED = 6 + # TODO: Add bad credentials detection that removes key + # from redis if bad credentials are detected BAD_CREDENTIALS = 7 - HTML = 8 - EMPTY_STRING = 9 - -def hit_api(key_manager, url: str, logger: logging.Logger, timeout: float = 10, method: str = 'GET', ) -> Optional[httpx.Response]: - """Ping the api and get the data back for the page. - - Returns: - A httpx response that contains the data. None if a timeout occurs - """ - # self.logger.info(f"Hitting endpoint with {method} request: {url}...\n") - - with httpx.Client() as client: - - try: - response = client.request( - method=method, url=url, auth=key_manager, timeout=timeout, follow_redirects=True) - - except TimeoutError: - logger.info(f"Request timed out. Sleeping {round(timeout)} seconds and trying again...\n") - time.sleep(round(timeout)) - return None - except httpx.TimeoutException: - logger.info(f"Request timed out. 
Sleeping {round(timeout)} seconds and trying again...\n") - time.sleep(round(timeout)) - return None - except httpx.NetworkError: - logger.info(f"Network Error. Sleeping {round(timeout)} seconds and trying again...\n") - time.sleep(round(timeout)) - return None - except httpx.ProtocolError: - logger.info(f"Protocol Error. Sleeping {round(timeout*1.5)} seconds and trying again...\n") - time.sleep(round(timeout*1.5)) - return None - - return response - - -def process_dict_response(logger: logging.Logger, response: httpx.Response, page_data: dict) -> Optional[str]: - """Process dict response from the api and return the status. - - Args: - logger: handles logging - response: used to access the url of the request and the headers - page_data: dict response from the api - - Returns: - A string explaining what happened is returned if what happened is determined, otherwise None is returned. - """ - - status_code = response.status_code - if status_code == 429: - current_epoch = int(time.time()) - epoch_when_key_resets = int(response.headers["X-RateLimit-Reset"]) - key_reset_time = epoch_when_key_resets - current_epoch - - if key_reset_time < 0: - logger.error(f"Key reset time was less than 0 setting it to 0.\nThe current epoch is {current_epoch} and the epoch that the key resets at is {epoch_when_key_resets}") - key_reset_time = 0 - - logger.info(f"\n\n\nGitlab API rate limit exceeded. Sleeping until the key resets ({key_reset_time} seconds)") - time.sleep(key_reset_time) - - return GitlabApiResult.RATE_LIMIT_EXCEEDED - - - return GitlabApiResult.NEW_RESULT - -class GitlabPaginator(collections.abc.Sequence): - """This class is a sequence that handles paginating through data on the Gitlab API. +class GitlabApiHandler(): + """This class is a sequence that handles retrieving data from the Gitlab API. 
Attributes: url (str): The url that we are collecting data @@ -107,7 +35,7 @@ class GitlabPaginator(collections.abc.Sequence): logger (logging.Logger): Logger that handler printing information to files and stdout """ - def __init__(self, url: str, key_manager: GitlabRandomKeyAuth, logger: logging.Logger, from_datetime=None, to_datetime=None): + def __init__(self, key_manager: GitlabRandomKeyAuth, logger: logging.Logger): """Initialize the class GitlabPaginator. Args: @@ -117,57 +45,10 @@ def __init__(self, url: str, key_manager: GitlabRandomKeyAuth, logger: logging.L from_datetime: collects data after this datatime (not yet implemented) to_datetime: collects data before this datatime (not yet implemented) """ - remove_fields = ["per_page", "page"] - url = clean_url(url, remove_fields) - - # we need to add query params directly to the url, instead of passing the param to the httpx.Client.request - # this is because github will only append specified params to the links in the headers if they are a part - # of the url, and not the params with the request - params = {"per_page": 100} - url = add_query_params(url, params) - - self.url = url self.key_manager = key_manager self.logger = logger - # get the logger from the key manager - # self.logger = key_manager.logger - - self.from_datetime = from_datetime - self.to_datetime = to_datetime - - def __getitem__(self, index: int) -> Optional[dict]: - """Get the value at index of the Gitlab API data returned from the url. 
- - Args: - index: The index of the desired data from the Gitlab API - - Returns: - The value at the index - """ - - # get the page the item is on - items_page = (index // 100) + 1 - - # create url to query - params = {"page": items_page} - url = add_query_params(self.url, params) - - data, _, result = self.retrieve_data(url) - - if result != GitlabApiResult.SUCCESS: - self.logger.debug("Unable to get item from the api") - return None - - # get the position of data on the page - page_index = index % 100 - - try: - return data[page_index] - except KeyError as e: - raise KeyError("Data does not exists for that index") from e - - def __len__(self): + def get_length(self, url): """Get the length of the Gitlab API data. Returns: @@ -185,7 +66,7 @@ def __len__(self): self.logger.info(f"Num pages: {num_pages}") params = {"page": num_pages} - url = add_query_params(self.url, params) + url = add_query_params(url, params) # get the amount of data on last page data, _, result = self.retrieve_data(url) @@ -196,13 +77,16 @@ def __len__(self): self.logger.debug("Unable to retrieve data length from api") return 0 - def __iter__(self) -> Generator[Optional[dict], None, None]: + def iter(self, url) -> Generator[Optional[dict], None, None]: """Provide data from Gitlab API via a generator that yields one dict at a time. 
Yields: A piece of data from the github api as the specified url """ - data_list, response, result = self.retrieve_data(self.url) + + url = self._set_paginaton_query_params(url) + + data_list, response, result = self.retrieve_data(url) if result != GitlabApiResult.SUCCESS: self.logger.debug("Failed to retrieve the data even though 10 attempts were given") @@ -226,14 +110,18 @@ def __iter__(self) -> Generator[Optional[dict], None, None]: for data in data_list: yield data - def iter_pages(self) -> Generator[Tuple[Optional[List[dict]], int], None, None]: + def iter_pages(self, url) -> Generator[Tuple[Optional[List[dict]], int], None, None]: """Provide data from Gitlab API via a generator that yields a page of dicts at a time. Returns: A page of data from the Gitlab API at the specified url """ + + url = self._set_paginaton_query_params(url) + print(f"Iter pages url: {url}") + # retrieves the data for the given url - data_list, response, result = self.retrieve_data(self.url) + data_list, response, result = self.retrieve_data(url) if result != GitlabApiResult.SUCCESS: self.logger.debug("Failed to retrieve the data even though 10 attempts were given") @@ -241,7 +129,8 @@ def iter_pages(self) -> Generator[Tuple[Optional[List[dict]], int], None, None]: return # this retrieves the page for the given url - page_number = get_url_page_number(self.url) + page_number = get_url_page_number(url) + print(f"iter pages first page number: {page_number}") # yields the first page of data and its page number yield data_list, page_number @@ -250,6 +139,7 @@ def iter_pages(self) -> Generator[Tuple[Optional[List[dict]], int], None, None]: # gets the next page from the last responses header next_page = response.links['next']['url'] + print(f"next page url: {next_page}") # Here we don't need to pass in params with the page, or the default params because the url from the headers already has those values data_list, response, result = self.retrieve_data(next_page) @@ -260,6 +150,8 @@ def 
iter_pages(self) -> Generator[Tuple[Optional[List[dict]], int], None, None]: page_number = get_url_page_number(next_page) + print(f"iter pages page number: {page_number}") + # if either the data or response is None then yield None and return if data_list is None or response is None: return @@ -276,6 +168,7 @@ def retrieve_data(self, url: str) -> Tuple[Optional[List[dict]], Optional[httpx. Returns The response object from hitting the url and the data on the page """ + timeout = 30 timeout_count = 0 num_attempts = 1 @@ -292,57 +185,51 @@ def retrieve_data(self, url: str) -> Tuple[Optional[List[dict]], Optional[httpx. num_attempts += 1 continue - # if api returns a status of 204 No Content then return empty list + if response.status_code == 500: + self.logger.error(f"Gitlab returned {response.status_code} error when fetching {url}. Message: {response.json()}") + continue + + if response.status_code == 429: + + current_epoch = int(time.time()) + epoch_when_key_resets = int(response.headers["ratelimit-reset"]) + key_reset_time = epoch_when_key_resets - current_epoch + + if key_reset_time < 0: + self.logger.error(f"Key reset time was less than 0 setting it to 0.\nThe current epoch is {current_epoch} and the epoch that the key resets at is {epoch_when_key_resets}") + key_reset_time = 0 + + self.logger.info(f"\n\n\nGitlab API rate limit exceeded. 
Sleeping until the key resets ({key_reset_time} seconds)") + time.sleep(key_reset_time) + continue + if response.status_code == 204: return [], response, GitlabApiResult.SUCCESS - - page_data = parse_json_response(self.logger, response) - + if response.status_code >= 200 and response.status_code <=299: - # if the data is a list, then return it and the response - if isinstance(page_data, list) is True: + page_data = parse_json_response(self.logger, response) return page_data, response, GitlabApiResult.SUCCESS - - # if the data is a dict then call process_dict_response, and - if isinstance(page_data, dict) is True: - dict_processing_result = process_dict_response(self.logger, response, page_data) - - if dict_processing_result == GitlabApiResult.NEW_RESULT: - print(f"Encountered new dict response from api on url: {url}. Response: {page_data}") - return None, None, GitlabApiResult.NEW_RESULT - - if dict_processing_result == GitlabApiResult.REPO_NOT_FOUND: - return None, response, GitlabApiResult.REPO_NOT_FOUND - - if dict_processing_result in (GitlabApiResult.SECONDARY_RATE_LIMIT, GitlabApiResult.ABUSE_MECHANISM_TRIGGERED): - continue - - if dict_processing_result == GitlabApiResult.RATE_LIMIT_EXCEEDED: - num_attempts = 0 - continue - - if isinstance(page_data, str) is True: - str_processing_result: Union[str, List[dict]] = self.process_str_response(page_data) - - if isinstance(str_processing_result, list): - return str_processing_result, response, GitlabApiResult.SUCCESS - - num_attempts += 1 + + self.logger.warning(f"Unhandled gitlab response. Status code: {response.status_code}. Body: {response.json()}") + self.logger.error("Unable to collect data in 10 attempts") return None, None, GitlabApiResult.NO_MORE_ATTEMPTS - def get_num_pages(self) -> Optional[int]: + def get_num_pages(self, url) -> Optional[int]: """Get the number of pages of data that a url can paginate through. 
Returns: The number of pages a url can access """ + + url = self._set_paginaton_query_params(url) + timeout: float = 5 num_attempts = 0 while num_attempts < 10: - r = hit_api(self.key_manager, self.url, self.logger, timeout=timeout, method="HEAD") + r = self.hit_api(url=url, timeout=timeout, method="HEAD") if r: break @@ -365,39 +252,22 @@ def get_num_pages(self) -> Optional[int]: return num_pages - def hit_api(self, url, timeout): - - return hit_api(self.key_manager, url, self.logger, timeout) - - -################################################### + def hit_api(self, url, timeout, method): - def process_str_response(self, page_data: str) -> Union[str, List[dict]]: - """Process an api response of type string. + return hit_api(self.key_manager, url, self.logger, timeout, method=method) - Args: - page_data: the string response from the api that is being processed - - Returns: - html_response, empty_string, and failed_to_parse_jsonif the data is not processable. - Or a list of dicts if the json was parasable - """ - self.logger.info(f"Warning! 
page_data was string: {page_data}\n") - - if "" in page_data: - self.logger.info("HTML was returned, trying again...\n") - return GitlabApiResult.HTML + def _set_paginaton_query_params(self, url): - if not page_data: - self.logger.info("Empty string, trying again...\n") - return GitlabApiResult.EMPTY_STRING + remove_fields = ["per_page", "page"] + url = clean_url(url, remove_fields) - try: - list_of_dict_page_data = json.loads(page_data) - return list_of_dict_page_data - except TypeError: - return "failed_to_parse_json" + # we need to add query params directly to the url, instead of passing the param to the httpx.Client.request + # this is because github will only append specified params to the links in the headers if they are a part + # of the url, and not the params with the request + params = {"per_page": 100} + url = add_query_params(url, params) + return url ################################################################################ @@ -443,10 +313,6 @@ def add_query_params(url: str, additional_params: dict) -> str: return url_components._replace(query=updated_query).geturl() - -################################################################################ - - def get_url_page_number(url: str) -> int: """Parse the page number from the url. 
@@ -469,68 +335,37 @@ def get_url_page_number(url: str) -> int: return page_number +################################################################################ -def retrieve_dict_from_endpoint(logger, key_auth, url, timeout_wait=10) -> Tuple[Optional[dict], GitlabApiResult]: - timeout = timeout_wait - timeout_count = 0 - num_attempts = 1 - - while num_attempts <= 10: - - response = hit_api(key_auth, url, logger, timeout) - - if response is None: - if timeout_count == 10: - logger.error(f"Request timed out 10 times for {url}") - return None, GitlabApiResult.TIMEOUT - - timeout = timeout * 1.1 - num_attempts += 1 - continue - - - page_data = parse_json_response(logger, response) - - if isinstance(page_data, str): - str_processing_result: Union[str, List[dict]] = process_str_response(logger,page_data) - - if isinstance(str_processing_result, dict): - #return str_processing_result, response, GitlabApiResult.SUCCESS - page_data = str_processing_result - else: - num_attempts += 1 - continue - - # if the data is a list, then return it and the response - if isinstance(page_data, list): - logger.warning("Wrong type returned, trying again...") - logger.info(f"Returned list: {page_data}") - - # if the data is a dict then call process_dict_response, and - elif isinstance(page_data, dict): - dict_processing_result = process_dict_response(logger, response, page_data) - - if dict_processing_result == GitlabApiResult.SUCCESS: - return page_data, dict_processing_result - if dict_processing_result == GitlabApiResult.NEW_RESULT: - logger.info(f"Encountered new dict response from api on url: {url}. 
Response: {page_data}") - return None, GitlabApiResult.NEW_RESULT - - if dict_processing_result == GitlabApiResult.REPO_NOT_FOUND: - return None, GitlabApiResult.REPO_NOT_FOUND - - if dict_processing_result in (GitlabApiResult.SECONDARY_RATE_LIMIT, GitlabApiResult.ABUSE_MECHANISM_TRIGGERED): - continue - - if dict_processing_result == GitlabApiResult.RATE_LIMIT_EXCEEDED: - num_attempts = 0 - continue +def hit_api(key_manager, url: str, logger: logging.Logger, timeout: float = 10, method: str = 'GET', ) -> Optional[httpx.Response]: + """Ping the api and get the data back for the page. - + Returns: + A httpx response that contains the data. None if a timeout occurs + """ + # self.logger.info(f"Hitting endpoint with {method} request: {url}...\n") - num_attempts += 1 + with httpx.Client() as client: - logger.error("Unable to collect data in 10 attempts") - return None, GitlabApiResult.NO_MORE_ATTEMPTS + try: + response = client.request( + method=method, url=url, auth=key_manager, timeout=timeout, follow_redirects=True) + except TimeoutError: + logger.info(f"Request timed out. Sleeping {round(timeout)} seconds and trying again...\n") + time.sleep(round(timeout)) + return None + except httpx.TimeoutException: + logger.info(f"Request timed out. Sleeping {round(timeout)} seconds and trying again...\n") + time.sleep(round(timeout)) + return None + except httpx.NetworkError: + logger.info(f"Network Error. Sleeping {round(timeout)} seconds and trying again...\n") + time.sleep(round(timeout)) + return None + except httpx.ProtocolError: + logger.info(f"Protocol Error. 
Sleeping {round(timeout*1.5)} seconds and trying again...\n") + time.sleep(round(timeout*1.5)) + return None + return response \ No newline at end of file diff --git a/augur/tasks/gitlab/issues_task.py b/augur/tasks/gitlab/issues_task.py index 6504615f8b..fa4427f79c 100644 --- a/augur/tasks/gitlab/issues_task.py +++ b/augur/tasks/gitlab/issues_task.py @@ -3,7 +3,7 @@ from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask -from augur.tasks.gitlab.gitlab_paginator import GitlabPaginator +from augur.tasks.gitlab.gitlab_api_handler import GitlabApiHandler from augur.tasks.gitlab.gitlab_task_session import GitlabTaskManifest from augur.application.db.data_parse import extract_needed_issue_data_from_gitlab_issue, extract_needed_gitlab_issue_label_data, extract_needed_gitlab_issue_assignee_data from augur.tasks.github.util.util import get_owner_repo, add_key_value_pair_to_dicts @@ -50,11 +50,11 @@ def retrieve_all_gitlab_issue_data(repo_git, logger, key_auth) -> None: logger.info(f"Collecting gitlab issues for {owner}/{repo}") url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/issues?with_labels_details=True" - issues = GitlabPaginator(url, key_auth, logger) + issues = GitlabApiHandler(key_auth, logger) all_data = [] - num_pages = issues.get_num_pages() - for page_data, page in issues.iter_pages(): + num_pages = issues.get_num_pages(url) + for page_data, page in issues.iter_pages(url): if page_data is None: return all_data @@ -179,9 +179,9 @@ def retrieve_all_gitlab_issue_comments(key_auth, logger, issue_ids, repo_git): print(f"Collecting {owner}/{repo} gitlab issue comments for issue {index} of {issue_count}") url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/issues/{id}/notes" - comments = GitlabPaginator(url, key_auth, logger) + comments = GitlabApiHandler(key_auth, logger) - for page_data, page in comments.iter_pages(): + for page_data, page in comments.iter_pages(url): if 
page_data is None or len(page_data) == 0: break @@ -219,11 +219,11 @@ def retrieve_all_gitlab_issue_event_data(repo_git, logger, key_auth) -> None: logger.info(f"Collecting gitlab issue events for {owner}/{repo}") url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/events?target_type=issue" - events = GitlabPaginator(url, key_auth, logger) + events = GitlabApiHandler(key_auth, logger) all_data = [] - num_pages = events.get_num_pages() - for page_data, page in events.iter_pages(): + num_pages = events.get_num_pages(url) + for page_data, page in events.iter_pages(url): if page_data is None: return all_data diff --git a/augur/tasks/gitlab/merge_request_task.py b/augur/tasks/gitlab/merge_request_task.py index b1ecbb481b..eda51ea61f 100644 --- a/augur/tasks/gitlab/merge_request_task.py +++ b/augur/tasks/gitlab/merge_request_task.py @@ -2,7 +2,7 @@ from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask -from augur.tasks.gitlab.gitlab_paginator import GitlabPaginator +from augur.tasks.gitlab.gitlab_api_handler import GitlabApiHandler from augur.tasks.gitlab.gitlab_task_session import GitlabTaskManifest from augur.application.db.data_parse import extract_needed_pr_data_from_gitlab_merge_request, extract_needed_merge_request_assignee_data, extract_needed_mr_label_data from augur.tasks.github.util.util import get_owner_repo, add_key_value_pair_to_dicts @@ -41,11 +41,11 @@ def retrieve_all_mr_data(repo_git: str, logger, key_auth) -> None: logger.info(f"Collecting pull requests for {owner}/{repo}") url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests?with_labels_details=True" - mrs = GitlabPaginator(url, key_auth, logger) + mrs = GitlabApiHandler(key_auth, logger) all_data = [] - num_pages = mrs.get_num_pages() - for page_data, page in mrs.iter_pages(): + num_pages = mrs.get_num_pages(url) + for page_data, page in mrs.iter_pages(url): if page_data is None: return all_data @@ 
-53,7 +53,7 @@ def retrieve_all_mr_data(repo_git: str, logger, key_auth) -> None: if len(page_data) == 0: logger.debug( f"{owner}/{repo} Mrs Page {page} contains no data...returning") - logger.info(f"{owner}/{repo} Prs Page {page} of {num_pages}") + logger.info(f"{owner}/{repo} Mrs Page {page} of {num_pages}") return all_data logger.info(f"{owner}/{repo} Mrs Page {page} of {num_pages}") @@ -129,10 +129,20 @@ def process_merge_requests(data, task_name, repo_id, logger, augur_db): def collect_merge_request_comments(mr_ids, repo_git) -> int: owner, repo = get_owner_repo(repo_git) - id = mr_ids[0] - url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/notes" - # print(f"Repo git: {repo_git}. Len ids: {mr_ids}") + logger = logging.getLogger(collect_merge_request_comments.__name__) + with GitlabTaskManifest(logger) as manifest: + + url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/notes".format(owner=owner, repo=repo, id="{id}") + comments = retrieve_merge_request_data(mr_ids, url, "comments", owner, repo, manifest.key_auth, logger) + + if comments: + logger.info(f"Length of merge request comments: {len(comments)}") + logger.info(f"Mr comment: {comments[0]}") + #issue_ids = process_issues(issue_data, f"{owner}/{repo}: Gitlab Issue task", repo_id, logger, augur_db) + else: + logger.info(f"{owner}/{repo} has no gitlab merge request comments") + @celery.task(base=AugurCoreRepoCollectionTask) @@ -147,46 +157,116 @@ def collect_merge_request_events(repo_git) -> int: @celery.task(base=AugurCoreRepoCollectionTask) def collect_merge_request_metadata(mr_ids, repo_git) -> int: - - owner, repo = get_owner_repo(repo_git) - id = mr_ids[0] - url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}" - #print("Collect merge request metadata") - # print(f"Repo git: {repo_git}. 
Len ids: {mr_ids}") + pass + + # TODO: Figure out how to handle new response it gitlab paginator + # owner, repo = get_owner_repo(repo_git) + + # logger = logging.getLogger(collect_merge_request_metadata.__name__) + # with GitlabTaskManifest(logger) as manifest: + + # url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}".format(owner=owner, repo=repo, id="{id}") + # metadata = retrieve_merge_request_data(mr_ids, url, "metadata", owner, repo, manifest.key_auth, logger) + + # if metadata: + # logger.info(f"Length of merge request metadata: {len(metadata)}") + # logger.info(f"Mr metadata: {metadata[0]}") + # #issue_ids = process_issues(issue_data, f"{owner}/{repo}: Gitlab Issue task", repo_id, logger, augur_db) + # else: + # logger.info(f"{owner}/{repo} has no gitlab merge request metadata") + @celery.task(base=AugurCoreRepoCollectionTask) def collect_merge_request_reviewers(mr_ids, repo_git) -> int: - - owner, repo = get_owner_repo(repo_git) - id = mr_ids[0] - url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/approvals" + pass + + # TODO: Figure out how to handle new response it gitlab paginator + # owner, repo = get_owner_repo(repo_git) + + # logger = logging.getLogger(collect_merge_request_reviewers.__name__) + # with GitlabTaskManifest(logger) as manifest: + + # url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/approvals".format(owner=owner, repo=repo, id="{id}") + # reviewers = retrieve_merge_request_data(mr_ids, url, "reviewers", owner, repo, manifest.key_auth, logger) + + # if reviewers: + # logger.info(f"Length of merge request reviewers: {len(reviewers)}") + # logger.info(f"Mr reviewer: {reviewers[0]}") + # #issue_ids = process_issues(issue_data, f"{owner}/{repo}: Gitlab Issue task", repo_id, logger, augur_db) + # else: + # logger.info(f"{owner}/{repo} has no gitlab merge request reviewers") + #print("Collect merge request reviewers") # print(f"Repo git: {repo_git}. 
Len ids: {mr_ids}") @celery.task(base=AugurCoreRepoCollectionTask) def collect_merge_request_commits(mr_ids, repo_git) -> int: - - owner, repo = get_owner_repo(repo_git) - id = mr_ids[0] - url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/commits" - #print("Collect merge request commits") - # print(f"Repo git: {repo_git}. Len ids: {mr_ids}") + pass + + # TODO: Figure out how to handle new response it gitlab paginator + # owner, repo = get_owner_repo(repo_git) + + # logger = logging.getLogger(collect_merge_request_comments.__name__) + # with GitlabTaskManifest(logger) as manifest: + + # url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/commits".format(owner=owner, repo=repo, id="{id}") + # commits = retrieve_merge_request_data(mr_ids, url, "commits", owner, repo, manifest.key_auth, logger) + + # if commits: + # logger.info(f"Length of merge request commits: {len(commits)}") + # logger.info(f"Mr commit: {commits[0]}") + # #issue_ids = process_issues(issue_data, f"{owner}/{repo}: Gitlab Issue task", repo_id, logger, augur_db) + # else: + # logger.info(f"{owner}/{repo} has no gitlab merge request commits") @celery.task(base=AugurCoreRepoCollectionTask) def collect_merge_request_files(mr_ids, repo_git) -> int: + + pass + + # TODO: Figure out how to handle new response it gitlab paginator + # owner, repo = get_owner_repo(repo_git) + + # logger = logging.getLogger(collect_merge_request_comments.__name__) + # with GitlabTaskManifest(logger) as manifest: + + # url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/changes".format(owner=owner, repo=repo, id="{id}") + # files = retrieve_merge_request_data(mr_ids, url, "files", owner, repo, manifest.key_auth, logger) + + # if files: + # logger.info(f"Length of merge request files: {len(files)}") + # logger.info(f"Mr comment: {files[0]}") + # #issue_ids = process_issues(issue_data, f"{owner}/{repo}: Gitlab Issue task", repo_id, logger, 
augur_db) + # else: + # logger.info(f"{owner}/{repo} has no gitlab merge request files") - owner, repo = get_owner_repo(repo_git) - id = mr_ids[0] - url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/cahnges" - #print("Collect merge request files") - # print(f"Repo git: {repo_git}. Len ids: {mr_ids}") +def retrieve_merge_request_data(ids, url, name, owner, repo, key_auth, logger): + + all_data = [] + issue_count = len(ids) + index = 1 + + data = GitlabApiHandler(key_auth, logger) + for id in ids: + if len(all_data) > 40: + return all_data + + print(f"Collecting {owner}/{repo} gitlab merge request {name} for merge request {index} of {issue_count}") + for page_data, _ in data.iter_pages(url): + + if page_data is None or len(page_data) == 0: + break - \ No newline at end of file + all_data += page_data + + index += 1 + + return all_data From 4d5b4b2098e9ab2f6d1e9170543c33602aa3bce0 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Sat, 9 Dec 2023 11:06:06 -0600 Subject: [PATCH 09/25] Add collection for all the mr data Signed-off-by: Andrew Brain --- augur/tasks/gitlab/gitlab_api_handler.py | 13 +-- augur/tasks/gitlab/merge_request_task.py | 133 +++++++++++------------ 2 files changed, 71 insertions(+), 75 deletions(-) diff --git a/augur/tasks/gitlab/gitlab_api_handler.py b/augur/tasks/gitlab/gitlab_api_handler.py index 97b1690006..3a463127bb 100644 --- a/augur/tasks/gitlab/gitlab_api_handler.py +++ b/augur/tasks/gitlab/gitlab_api_handler.py @@ -16,7 +16,7 @@ class GitlabApiResult(Enum): SUCCESS = 0 TIMEOUT = 1 NO_MORE_ATTEMPTS = 2 - REPO_NOT_FOUND = 3 + NOT_FOUND = 3 SECONDARY_RATE_LIMIT = 4 RATE_LIMIT_EXCEEDED = 5 ABUSE_MECHANISM_TRIGGERED = 6 @@ -118,7 +118,6 @@ def iter_pages(self, url) -> Generator[Tuple[Optional[List[dict]], int], None, N """ url = self._set_paginaton_query_params(url) - print(f"Iter pages url: {url}") # retrieves the data for the given url data_list, response, result = self.retrieve_data(url) @@ -130,7 +129,6 @@ def 
iter_pages(self, url) -> Generator[Tuple[Optional[List[dict]], int], None, N # this retrieves the page for the given url page_number = get_url_page_number(url) - print(f"iter pages first page number: {page_number}") # yields the first page of data and its page number yield data_list, page_number @@ -139,7 +137,6 @@ def iter_pages(self, url) -> Generator[Tuple[Optional[List[dict]], int], None, N # gets the next page from the last responses header next_page = response.links['next']['url'] - print(f"next page url: {next_page}") # Here we don't need to pass in params with the page, or the default params because the url from the headers already has those values data_list, response, result = self.retrieve_data(next_page) @@ -150,8 +147,6 @@ def iter_pages(self, url) -> Generator[Tuple[Optional[List[dict]], int], None, N page_number = get_url_page_number(next_page) - print(f"iter pages page number: {page_number}") - # if either the data or response is None then yield None and return if data_list is None or response is None: return @@ -203,9 +198,13 @@ def retrieve_data(self, url: str) -> Tuple[Optional[List[dict]], Optional[httpx. 
time.sleep(key_reset_time) continue + if response.status_code == 404: + self.logger.info(f"ERROR: 404 not found for {url}") + return [], response, GitlabApiResult.NOT_FOUND + if response.status_code == 204: return [], response, GitlabApiResult.SUCCESS - + if response.status_code >= 200 and response.status_code <=299: page_data = parse_json_response(self.logger, response) diff --git a/augur/tasks/gitlab/merge_request_task.py b/augur/tasks/gitlab/merge_request_task.py index eda51ea61f..b3dd5151dd 100644 --- a/augur/tasks/gitlab/merge_request_task.py +++ b/augur/tasks/gitlab/merge_request_task.py @@ -134,7 +134,7 @@ def collect_merge_request_comments(mr_ids, repo_git) -> int: with GitlabTaskManifest(logger) as manifest: url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/notes".format(owner=owner, repo=repo, id="{id}") - comments = retrieve_merge_request_data(mr_ids, url, "comments", owner, repo, manifest.key_auth, logger) + comments = retrieve_merge_request_data(mr_ids, url, "comments", owner, repo, manifest.key_auth, logger, response_type="list") if comments: logger.info(f"Length of merge request comments: {len(comments)}") @@ -158,114 +158,111 @@ def collect_merge_request_events(repo_git) -> int: @celery.task(base=AugurCoreRepoCollectionTask) def collect_merge_request_metadata(mr_ids, repo_git) -> int: - pass - - # TODO: Figure out how to handle new response it gitlab paginator - # owner, repo = get_owner_repo(repo_git) + owner, repo = get_owner_repo(repo_git) - # logger = logging.getLogger(collect_merge_request_metadata.__name__) - # with GitlabTaskManifest(logger) as manifest: + logger = logging.getLogger(collect_merge_request_metadata.__name__) + with GitlabTaskManifest(logger) as manifest: - # url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}".format(owner=owner, repo=repo, id="{id}") - # metadata = retrieve_merge_request_data(mr_ids, url, "metadata", owner, repo, manifest.key_auth, logger) + url = 
"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}".format(owner=owner, repo=repo, id="{id}") + metadata_list = retrieve_merge_request_data(mr_ids, url, "metadata", owner, repo, manifest.key_auth, logger, response_type="dict") - # if metadata: - # logger.info(f"Length of merge request metadata: {len(metadata)}") - # logger.info(f"Mr metadata: {metadata[0]}") - # #issue_ids = process_issues(issue_data, f"{owner}/{repo}: Gitlab Issue task", repo_id, logger, augur_db) - # else: - # logger.info(f"{owner}/{repo} has no gitlab merge request metadata") + if metadata_list: + logger.info(f"Length of merge request metadata: {len(metadata_list)}") + logger.info(f"Mr metadata: {metadata_list[0]}") + #issue_ids = process_issues(issue_data, f"{owner}/{repo}: Gitlab Issue task", repo_id, logger, augur_db) + else: + logger.info(f"{owner}/{repo} has no gitlab merge request metadata") @celery.task(base=AugurCoreRepoCollectionTask) def collect_merge_request_reviewers(mr_ids, repo_git) -> int: - pass - - # TODO: Figure out how to handle new response it gitlab paginator - # owner, repo = get_owner_repo(repo_git) + owner, repo = get_owner_repo(repo_git) - # logger = logging.getLogger(collect_merge_request_reviewers.__name__) - # with GitlabTaskManifest(logger) as manifest: + logger = logging.getLogger(collect_merge_request_reviewers.__name__) + with GitlabTaskManifest(logger) as manifest: - # url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/approvals".format(owner=owner, repo=repo, id="{id}") - # reviewers = retrieve_merge_request_data(mr_ids, url, "reviewers", owner, repo, manifest.key_auth, logger) + url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/approvals".format(owner=owner, repo=repo, id="{id}") + reviewers = retrieve_merge_request_data(mr_ids, url, "reviewers", owner, repo, manifest.key_auth, logger, response_type="dict") - # if reviewers: - # logger.info(f"Length of merge request reviewers: 
{len(reviewers)}") - # logger.info(f"Mr reviewer: {reviewers[0]}") - # #issue_ids = process_issues(issue_data, f"{owner}/{repo}: Gitlab Issue task", repo_id, logger, augur_db) - # else: - # logger.info(f"{owner}/{repo} has no gitlab merge request reviewers") + if reviewers: + logger.info(f"Length of merge request reviewers: {len(reviewers)}") + logger.info(f"Mr reviewer: {reviewers[0]}") + #issue_ids = process_issues(issue_data, f"{owner}/{repo}: Gitlab Issue task", repo_id, logger, augur_db) + else: + logger.info(f"{owner}/{repo} has no gitlab merge request reviewers") - #print("Collect merge request reviewers") - # print(f"Repo git: {repo_git}. Len ids: {mr_ids}") - @celery.task(base=AugurCoreRepoCollectionTask) def collect_merge_request_commits(mr_ids, repo_git) -> int: - pass - - # TODO: Figure out how to handle new response it gitlab paginator - # owner, repo = get_owner_repo(repo_git) + owner, repo = get_owner_repo(repo_git) - # logger = logging.getLogger(collect_merge_request_comments.__name__) - # with GitlabTaskManifest(logger) as manifest: + logger = logging.getLogger(collect_merge_request_comments.__name__) + with GitlabTaskManifest(logger) as manifest: - # url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/commits".format(owner=owner, repo=repo, id="{id}") - # commits = retrieve_merge_request_data(mr_ids, url, "commits", owner, repo, manifest.key_auth, logger) + url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/commits".format(owner=owner, repo=repo, id="{id}") + commits = retrieve_merge_request_data(mr_ids, url, "commits", owner, repo, manifest.key_auth, logger, response_type="list") - # if commits: - # logger.info(f"Length of merge request commits: {len(commits)}") - # logger.info(f"Mr commit: {commits[0]}") - # #issue_ids = process_issues(issue_data, f"{owner}/{repo}: Gitlab Issue task", repo_id, logger, augur_db) - # else: - # logger.info(f"{owner}/{repo} has no gitlab merge request commits") 
+ if commits: + logger.info(f"Length of merge request commits: {len(commits)}") + logger.info(f"Mr commit: {commits[0]}") + #issue_ids = process_issues(issue_data, f"{owner}/{repo}: Gitlab Issue task", repo_id, logger, augur_db) + else: + logger.info(f"{owner}/{repo} has no gitlab merge request commits") @celery.task(base=AugurCoreRepoCollectionTask) def collect_merge_request_files(mr_ids, repo_git) -> int: - pass - - # TODO: Figure out how to handle new response it gitlab paginator - # owner, repo = get_owner_repo(repo_git) + owner, repo = get_owner_repo(repo_git) - # logger = logging.getLogger(collect_merge_request_comments.__name__) - # with GitlabTaskManifest(logger) as manifest: + logger = logging.getLogger(collect_merge_request_comments.__name__) + with GitlabTaskManifest(logger) as manifest: - # url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/changes".format(owner=owner, repo=repo, id="{id}") - # files = retrieve_merge_request_data(mr_ids, url, "files", owner, repo, manifest.key_auth, logger) + url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/changes".format(owner=owner, repo=repo, id="{id}") + files = retrieve_merge_request_data(mr_ids, url, "files", owner, repo, manifest.key_auth, logger, response_type="dict") - # if files: - # logger.info(f"Length of merge request files: {len(files)}") - # logger.info(f"Mr comment: {files[0]}") - # #issue_ids = process_issues(issue_data, f"{owner}/{repo}: Gitlab Issue task", repo_id, logger, augur_db) - # else: - # logger.info(f"{owner}/{repo} has no gitlab merge request files") + if files: + logger.info(f"Length of merge request files: {len(files)}") + logger.info(f"Mr file: {files[0]}") + #issue_ids = process_issues(issue_data, f"{owner}/{repo}: Gitlab Issue task", repo_id, logger, augur_db) + else: + logger.info(f"{owner}/{repo} has no gitlab merge request files") -def retrieve_merge_request_data(ids, url, name, owner, repo, key_auth, logger): +def 
retrieve_merge_request_data(ids, url, name, owner, repo, key_auth, logger, response_type): all_data = [] issue_count = len(ids) index = 1 - data = GitlabApiHandler(key_auth, logger) + api_handler = GitlabApiHandler(key_auth, logger) for id in ids: - if len(all_data) > 40: + if len(all_data) > 10: return all_data - + print(f"Collecting {owner}/{repo} gitlab merge request {name} for merge request {index} of {issue_count}") - for page_data, _ in data.iter_pages(url): + formatted_url = url.format(id=id) + + if response_type == "dict": + page_data, _, _ = api_handler.retrieve_data(formatted_url) + if page_data: + all_data.append(page_data) + + elif response_type == "list": - if page_data is None or len(page_data) == 0: - break + for page_data, _ in api_handler.iter_pages(formatted_url): - all_data += page_data + if page_data is None or len(page_data) == 0: + break + + all_data += page_data + + else: + raise Exception(f"Unexpected reponse type: {response_type}") index += 1 From 9793d4455669b7fcce081a50522048b34d8c8351 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Sat, 9 Dec 2023 11:12:37 -0600 Subject: [PATCH 10/25] Add merge request events collection Signed-off-by: Andrew Brain --- augur/tasks/gitlab/issues_task.py | 45 ------------------------ augur/tasks/gitlab/merge_request_task.py | 8 ----- augur/tasks/init/celery_app.py | 3 +- augur/tasks/start_tasks.py | 3 +- 4 files changed, 4 insertions(+), 55 deletions(-) diff --git a/augur/tasks/gitlab/issues_task.py b/augur/tasks/gitlab/issues_task.py index fa4427f79c..4f035a6024 100644 --- a/augur/tasks/gitlab/issues_task.py +++ b/augur/tasks/gitlab/issues_task.py @@ -194,48 +194,3 @@ def retrieve_all_gitlab_issue_comments(key_auth, logger, issue_ids, repo_git): -@celery.task(base=AugurCoreRepoCollectionTask) -def collect_gitlab_issue_events(repo_git) -> int: - - owner, repo = get_owner_repo(repo_git) - - logger = logging.getLogger(collect_gitlab_issues.__name__) - with GitlabTaskManifest(logger) as manifest: - - events 
= retrieve_all_gitlab_issue_event_data(repo_git, logger, manifest.key_auth) - - if events: - logger.info(f"Length of events: {len(events)}") - #issue_ids = process_issues(issue_data, f"{owner}/{repo}: Gitlab Issue task", repo_id, logger, augur_db) - else: - logger.info(f"{owner}/{repo} has no gitlab issue events") - - - -def retrieve_all_gitlab_issue_event_data(repo_git, logger, key_auth) -> None: - - owner, repo = get_owner_repo(repo_git) - - logger.info(f"Collecting gitlab issue events for {owner}/{repo}") - - url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/events?target_type=issue" - events = GitlabApiHandler(key_auth, logger) - - all_data = [] - num_pages = events.get_num_pages(url) - for page_data, page in events.iter_pages(url): - - if page_data is None: - return all_data - - if len(page_data) == 0: - logger.debug( - f"{owner}/{repo}: Gitlab Issue Events Page {page} contains no data...returning") - logger.info(f"{owner}/{repo}: Issue Events Page {page} of {num_pages}") - return all_data - - logger.info(f"{owner}/{repo}: Gitlab Issue Events Page {page} of {num_pages}") - - all_data += page_data - - return all_data \ No newline at end of file diff --git a/augur/tasks/gitlab/merge_request_task.py b/augur/tasks/gitlab/merge_request_task.py index b3dd5151dd..bf9e55be4d 100644 --- a/augur/tasks/gitlab/merge_request_task.py +++ b/augur/tasks/gitlab/merge_request_task.py @@ -145,14 +145,6 @@ def collect_merge_request_comments(mr_ids, repo_git) -> int: -@celery.task(base=AugurCoreRepoCollectionTask) -def collect_merge_request_events(repo_git) -> int: - - owner, repo = get_owner_repo(repo_git) - - url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/events?target_type=merge_request" - #print("Collect merge request events") - # print(f"Repo git: {repo_git}. 
Len ids: {mr_ids}") @celery.task(base=AugurCoreRepoCollectionTask) diff --git a/augur/tasks/init/celery_app.py b/augur/tasks/init/celery_app.py index a2b06a22a6..ee6eaeccdf 100644 --- a/augur/tasks/init/celery_app.py +++ b/augur/tasks/init/celery_app.py @@ -51,7 +51,8 @@ class CollectionState(Enum): 'augur.tasks.github.traffic.tasks'] gitlab_tasks = ['augur.tasks.gitlab.merge_request_task', - 'augur.tasks.gitlab.issues_task'] + 'augur.tasks.gitlab.issues_task', + 'augur.tasks.gitlab.events_task'] git_tasks = ['augur.tasks.git.facade_tasks', 'augur.tasks.git.dependency_tasks.tasks', diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index ae819f2030..b9cb5dd8b6 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -25,7 +25,8 @@ from augur.tasks.git.dependency_tasks.tasks import process_ossf_dependency_metrics from augur.tasks.github.traffic.tasks import collect_github_repo_clones_data from augur.tasks.gitlab.merge_request_task import collect_gitlab_merge_requests, collect_merge_request_comments, collect_merge_request_events, collect_merge_request_metadata, collect_merge_request_reviewers, collect_merge_request_commits, collect_merge_request_files -from augur.tasks.gitlab.issues_task import collect_gitlab_issues, collect_gitlab_issue_comments, collect_gitlab_issue_events +from augur.tasks.gitlab.issues_task import collect_gitlab_issues, collect_gitlab_issue_comments +from augur.tasks.gitlab.events_task import collect_gitlab_issue_events, collect_gitlab_merge_request_events from augur.tasks.git.facade_tasks import * from augur.tasks.db.refresh_materialized_views import * # from augur.tasks.data_analysis import * From 29c31d5a8012ee50effc35c1719f82c90359e1ff Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Sat, 9 Dec 2023 11:18:49 -0600 Subject: [PATCH 11/25] Add gitlab event data mappers Signed-off-by: Andrew Brain --- augur/application/db/data_parse.py | 44 ++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff 
--git a/augur/application/db/data_parse.py b/augur/application/db/data_parse.py index 6cc94494ae..40733923e8 100644 --- a/augur/application/db/data_parse.py +++ b/augur/application/db/data_parse.py @@ -691,3 +691,47 @@ def extract_needed_issue_data_from_gitlab_issue(issue: dict, repo_id: int, tool_ +def extract_gitlab_mr_event_data(event: dict, pr_id: int, platform_id: int, repo_id: int, tool_source: str, tool_version: str, data_source: str) -> dict: + + mr_event = { + 'pull_request_id': pr_id, + 'cntrb_id': None, + 'action': event['action_name'], + 'action_commit_hash': None, + 'created_at': event['created_at'], + 'issue_event_src_id': event['target_id'], + 'repo_id': repo_id, + 'platform_id': platform_id, + 'node_id': None, + 'node_url': None, + 'tool_source': tool_source, + 'tool_version': tool_version, + 'data_source': data_source + } + + return mr_event + +def extract_gitlab_issue_event_data(event: dict, issue_id: int, platform_id: int, repo_id: int, tool_source: str, tool_version: str, data_source: str) -> dict: + + issue_event = { + "issue_event_src_id": event['target_id'], + "issue_id": issue_id, + "node_id": None, + "node_url": None, + "cntrb_id": None, + "created_at": event['created_at'], + "action": event["action_name"], + "action_commit_hash": None, + "platform_id": platform_id, + "repo_id" : repo_id, + "tool_source": tool_source, + "tool_version": tool_version, + "data_source": data_source + } + + return issue_event + + + + + From 0d0efb410385d3673474abdf4f87c1536f4eb018 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Sat, 9 Dec 2023 11:37:51 -0600 Subject: [PATCH 12/25] Add issue event db inserts Signed-off-by: Andrew Brain --- augur/tasks/gitlab/events_task.py | 117 ++++++++++++++++++++++++++++++ 1 file changed, 117 insertions(+) create mode 100644 augur/tasks/gitlab/events_task.py diff --git a/augur/tasks/gitlab/events_task.py b/augur/tasks/gitlab/events_task.py new file mode 100644 index 0000000000..a93bb4edbc --- /dev/null +++ 
b/augur/tasks/gitlab/events_task.py @@ -0,0 +1,117 @@ +import logging + +from augur.tasks.init.celery_app import celery_app as celery +from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask +from augur.tasks.gitlab.gitlab_api_handler import GitlabApiHandler +from augur.tasks.gitlab.gitlab_task_session import GitlabTaskManifest +from augur.application.db.data_parse import extract_gitlab_mr_event_data, extract_gitlab_issue_event_data +from augur.tasks.github.util.util import get_owner_repo, add_key_value_pair_to_dicts +from augur.application.db.models import Repo, Issue, IssueEvent, PullRequest +from augur.application.db.util import execute_session_query + +platform_id = 2 + +@celery.task(base=AugurCoreRepoCollectionTask) +def collect_gitlab_issue_events(repo_git) -> int: + + owner, repo = get_owner_repo(repo_git) + + logger = logging.getLogger(collect_gitlab_issue_events.__name__) + with GitlabTaskManifest(logger) as manifest: + + augur_db = manifest.augur_db + + query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) + repo_obj = execute_session_query(query, 'one') + repo_id = repo_obj.repo_id + + events = retrieve_all_gitlab_event_data("issue", repo_git, logger, manifest.key_auth) + + if events: + logger.info(f"Length of gitlab issue events: {len(events)}") + process_issue_events(events, f"{owner}/{repo}: Gitlab Issue Events task", repo_id, logger, augur_db) + else: + logger.info(f"{owner}/{repo} has no gitlab issue events") + + +@celery.task(base=AugurCoreRepoCollectionTask) +def collect_gitlab_merge_request_events(repo_git) -> int: + + owner, repo = get_owner_repo(repo_git) + + logger = logging.getLogger(collect_gitlab_issue_events.__name__) + with GitlabTaskManifest(logger) as manifest: + + events = retrieve_all_gitlab_event_data("merge_request", repo_git, logger, manifest.key_auth) + + if events: + logger.info(f"Length of gitlab merge request events: {len(events)}") + #issue_ids = process_issues(issue_data, f"{owner}/{repo}: Gitlab 
Issue task", repo_id, logger, augur_db) + else: + logger.info(f"{owner}/{repo} has no gitlab merge request events") + + +def retrieve_all_gitlab_event_data(type, repo_git, logger, key_auth) -> None: + + owner, repo = get_owner_repo(repo_git) + + logger.info(f"Collecting gitlab issue events for {owner}/{repo}") + + url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/events?target_type={type}" + events = GitlabApiHandler(key_auth, logger) + + all_data = [] + num_pages = events.get_num_pages(url) + for page_data, page in events.iter_pages(url): + + if page_data is None: + return all_data + + if len(page_data) == 0: + logger.debug( + f"{owner}/{repo}: Gitlab {type} Events Page {page} contains no data...returning") + logger.info(f"{owner}/{repo}: {type} Events Page {page} of {num_pages}") + return all_data + + logger.info(f"{owner}/{repo}: Gitlab {type} Events Page {page} of {num_pages}") + + all_data += page_data + + return all_data + +def process_issue_events(events, task_name, repo_id, logger, augur_db): + + tool_source = "Gitlab events task" + tool_version = "2.0" + data_source = "Gitlab API" + + issue_event_dicts = [] + + # create mapping from issue number to issue id of current issues + issue_url_to_id_map = {} + issues = augur_db.session.query(Issue).filter(Issue.repo_id == repo_id).all() + for issue in issues: + issue_url_to_id_map[issue.gh_issue_number] = issue.issue_id + + for event in events: + + issue_number = event["target_iid"] + + try: + issue_id = issue_url_to_id_map[issue_number] + except KeyError: + logger.info(f"{task_name}: Could not find related issue") + logger.info(f"{task_name}: We were searching for an issue with number {issue_number} in repo {repo_id}") + logger.info(f"{task_name}: Skipping") + continue + + issue_event_dicts.append( + extract_gitlab_issue_event_data(event, issue_id, platform_id, repo_id, + tool_source, tool_version, data_source) + ) + + logger.info(f"{task_name}: Inserting {len(issue_event_dicts)} gitlab issue events") 
+ issue_event_natural_keys = ["issue_id", "issue_event_src_id"] + augur_db.insert_data(issue_event_dicts, IssueEvent, issue_event_natural_keys) + + From 0e1dde5956f13abf946193074729d4073f0db19c Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Sat, 9 Dec 2023 11:49:12 -0600 Subject: [PATCH 13/25] Add mr event processing Signed-off-by: Andrew Brain --- augur/tasks/gitlab/events_task.py | 49 +++++++++++++++++++++++++++++-- 1 file changed, 46 insertions(+), 3 deletions(-) diff --git a/augur/tasks/gitlab/events_task.py b/augur/tasks/gitlab/events_task.py index a93bb4edbc..4224988b9f 100644 --- a/augur/tasks/gitlab/events_task.py +++ b/augur/tasks/gitlab/events_task.py @@ -6,7 +6,7 @@ from augur.tasks.gitlab.gitlab_task_session import GitlabTaskManifest from augur.application.db.data_parse import extract_gitlab_mr_event_data, extract_gitlab_issue_event_data from augur.tasks.github.util.util import get_owner_repo, add_key_value_pair_to_dicts -from augur.application.db.models import Repo, Issue, IssueEvent, PullRequest +from augur.application.db.models import Repo, Issue, IssueEvent, PullRequest, PullRequestEvent from augur.application.db.util import execute_session_query platform_id = 2 @@ -42,11 +42,17 @@ def collect_gitlab_merge_request_events(repo_git) -> int: logger = logging.getLogger(collect_gitlab_issue_events.__name__) with GitlabTaskManifest(logger) as manifest: + augur_db = manifest.augur_db + + query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) + repo_obj = execute_session_query(query, 'one') + repo_id = repo_obj.repo_id + events = retrieve_all_gitlab_event_data("merge_request", repo_git, logger, manifest.key_auth) if events: logger.info(f"Length of gitlab merge request events: {len(events)}") - #issue_ids = process_issues(issue_data, f"{owner}/{repo}: Gitlab Issue task", repo_id, logger, augur_db) + process_mr_events(events, f"{owner}/{repo}: Gitlab MR Events task", repo_id, logger, augur_db) else: logger.info(f"{owner}/{repo} has no 
gitlab merge request events") @@ -81,7 +87,7 @@ def retrieve_all_gitlab_event_data(type, repo_git, logger, key_auth) -> None: def process_issue_events(events, task_name, repo_id, logger, augur_db): - tool_source = "Gitlab events task" + tool_source = "Gitlab issue events task" tool_version = "2.0" data_source = "Gitlab API" @@ -115,3 +121,40 @@ def process_issue_events(events, task_name, repo_id, logger, augur_db): augur_db.insert_data(issue_event_dicts, IssueEvent, issue_event_natural_keys) +def process_mr_events(events, task_name, repo_id, logger, augur_db): + + tool_source = "Gitlab mr events task" + tool_version = "2.0" + data_source = "Gitlab API" + + mr_event_dicts = [] + + # create mapping from mr number to pull request id of current mrs + mr_number_to_id_map = {} + mrs = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() + for mr in mrs: + mr_number_to_id_map[mr.pr_src_number] = mr.pull_request_id + + for event in events: + + mr_number = event["target_iid"] + + try: + issue_id = mr_number_to_id_map[mr_number] + except KeyError: + logger.info(f"{task_name}: Could not find related mr") + logger.info(f"{task_name}: We were searching for an mr with number {mr_number} in repo {repo_id}") + logger.info(f"{task_name}: Skipping") + continue + + mr_event_dicts.append( + extract_gitlab_mr_event_data(event, issue_id, platform_id, repo_id, + tool_source, tool_version, data_source) + ) + + # TODO: Add unique key for this + logger.info(f"{task_name}: Inserting {len(mr_event_dicts)} gitlab mr events") + mr_event_natural_keys = ["pull_request_id", "issue_event_src_id"] + augur_db.insert_data(mr_event_dicts, PullRequestEvent, mr_event_natural_keys) + + From dbe280ae985eadd129c98ac389e195c1b34ca7ab Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Sat, 9 Dec 2023 12:26:18 -0600 Subject: [PATCH 14/25] Start on mr reviewers Signed-off-by: Andrew Brain --- augur/application/db/data_parse.py | 23 ++++++++++++++++ 
augur/tasks/gitlab/merge_request_task.py | 35 +++++++++++++++++++----- 2 files changed, 51 insertions(+), 7 deletions(-) diff --git a/augur/application/db/data_parse.py b/augur/application/db/data_parse.py index 40733923e8..9b764770f2 100644 --- a/augur/application/db/data_parse.py +++ b/augur/application/db/data_parse.py @@ -732,6 +732,29 @@ def extract_gitlab_issue_event_data(event: dict, issue_id: int, platform_id: int return issue_event +# retrieve only the needed data for pr reviewers from the api response +def extract_needed_pr_reviewer_data(reviewers: List[dict], pull_request_id, repo_id: int, tool_source: str, tool_version: str, data_source: str) -> List[dict]: + + if len(reviewers) == 0: + return [] + + reviewer_dicts = [] + for reviewer in reviewers: + + reviewer_dict = { + 'pull_request_id': pull_request_id, + 'cntrb_id': None, + 'tool_source': tool_source, + 'tool_version': tool_version, + 'data_source': data_source + } + + reviewer_dicts.append(reviewer_dict) + + return reviewer_dicts + + + diff --git a/augur/tasks/gitlab/merge_request_task.py b/augur/tasks/gitlab/merge_request_task.py index bf9e55be4d..be97382eb9 100644 --- a/augur/tasks/gitlab/merge_request_task.py +++ b/augur/tasks/gitlab/merge_request_task.py @@ -4,7 +4,7 @@ from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask from augur.tasks.gitlab.gitlab_api_handler import GitlabApiHandler from augur.tasks.gitlab.gitlab_task_session import GitlabTaskManifest -from augur.application.db.data_parse import extract_needed_pr_data_from_gitlab_merge_request, extract_needed_merge_request_assignee_data, extract_needed_mr_label_data +from augur.application.db.data_parse import extract_needed_pr_data_from_gitlab_merge_request, extract_needed_merge_request_assignee_data, extract_needed_mr_label_data, extract_needed_pr_reviewer_data from augur.tasks.github.util.util import get_owner_repo, add_key_value_pair_to_dicts from augur.application.db.models import PullRequest, PullRequestAssignee, 
PullRequestLabel, Repo @@ -180,11 +180,30 @@ def collect_merge_request_reviewers(mr_ids, repo_git) -> int: if reviewers: logger.info(f"Length of merge request reviewers: {len(reviewers)}") - logger.info(f"Mr reviewer: {reviewers[0]}") #issue_ids = process_issues(issue_data, f"{owner}/{repo}: Gitlab Issue task", repo_id, logger, augur_db) else: logger.info(f"{owner}/{repo} has no gitlab merge request reviewers") - + +def process_mr_reviewers(data, task_name, repo_id, logger, augur_db): + + tool_source = "Mr Reviewr Task" + tool_version = "2.0" + data_source = "Gitlab API" + + # create mapping from mr number to pull request id of current mrs + mr_number_to_id_map = {} + mrs = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() + for mr in mrs: + mr_number_to_id_map[mr.pr_src_number] = mr.pull_request_id + + for id, values in data.items(): + + pull_request_id = mr_number_to_id_map[id] + + reviewers = extract_needed_pr_reviewer_data(values, pull_request_id, repo_id, tool_source, tool_version, data_source) + + for review in reviewers: + print(review["pull_request_id"]) @celery.task(base=AugurCoreRepoCollectionTask) def collect_merge_request_commits(mr_ids, repo_git) -> int: @@ -226,7 +245,7 @@ def collect_merge_request_files(mr_ids, repo_git) -> int: def retrieve_merge_request_data(ids, url, name, owner, repo, key_auth, logger, response_type): - all_data = [] + all_data = {} issue_count = len(ids) index = 1 @@ -242,7 +261,7 @@ def retrieve_merge_request_data(ids, url, name, owner, repo, key_auth, logger, r if response_type == "dict": page_data, _, _ = api_handler.retrieve_data(formatted_url) if page_data: - all_data.append(page_data) + all_data[id] = page_data elif response_type == "list": @@ -251,8 +270,10 @@ def retrieve_merge_request_data(ids, url, name, owner, repo, key_auth, logger, r if page_data is None or len(page_data) == 0: break - all_data += page_data - + if id in all_data: + all_data[id].extend(page_data) + else: + all_data[id] 
= page_data else: raise Exception(f"Unexpected reponse type: {response_type}") From b1a2ea6d1735b7b013b40fa25c7481fe1ce66d50 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Sat, 9 Dec 2023 12:49:47 -0600 Subject: [PATCH 15/25] Add start of mr reviewer processing Signed-off-by: Andrew Brain --- augur/application/db/data_parse.py | 26 +++++++++++++----------- augur/tasks/gitlab/merge_request_task.py | 21 +++++++++++++++---- 2 files changed, 31 insertions(+), 16 deletions(-) diff --git a/augur/application/db/data_parse.py b/augur/application/db/data_parse.py index 9b764770f2..6e0e2ea840 100644 --- a/augur/application/db/data_parse.py +++ b/augur/application/db/data_parse.py @@ -733,23 +733,25 @@ def extract_gitlab_issue_event_data(event: dict, issue_id: int, platform_id: int # retrieve only the needed data for pr reviewers from the api response -def extract_needed_pr_reviewer_data(reviewers: List[dict], pull_request_id, repo_id: int, tool_source: str, tool_version: str, data_source: str) -> List[dict]: +def extract_needed_pr_reviewer_data(data: List[dict], pull_request_id, repo_id: int, tool_source: str, tool_version: str, data_source: str) -> List[dict]: - if len(reviewers) == 0: + if len(data) == 0: return [] - + reviewer_dicts = [] - for reviewer in reviewers: + for x in data: - reviewer_dict = { - 'pull_request_id': pull_request_id, - 'cntrb_id': None, - 'tool_source': tool_source, - 'tool_version': tool_version, - 'data_source': data_source - } + for reviewer in x["suggested_approvers"]: - reviewer_dicts.append(reviewer_dict) + reviewer_dict = { + 'pull_request_id': pull_request_id, + 'cntrb_id': None, + 'tool_source': tool_source, + 'tool_version': tool_version, + 'data_source': data_source + } + + reviewer_dicts.append(reviewer_dict) return reviewer_dicts diff --git a/augur/tasks/gitlab/merge_request_task.py b/augur/tasks/gitlab/merge_request_task.py index be97382eb9..c8116b33a3 100644 --- a/augur/tasks/gitlab/merge_request_task.py +++ 
b/augur/tasks/gitlab/merge_request_task.py @@ -6,7 +6,8 @@ from augur.tasks.gitlab.gitlab_task_session import GitlabTaskManifest from augur.application.db.data_parse import extract_needed_pr_data_from_gitlab_merge_request, extract_needed_merge_request_assignee_data, extract_needed_mr_label_data, extract_needed_pr_reviewer_data from augur.tasks.github.util.util import get_owner_repo, add_key_value_pair_to_dicts -from augur.application.db.models import PullRequest, PullRequestAssignee, PullRequestLabel, Repo +from augur.application.db.models import PullRequest, PullRequestAssignee, PullRequestLabel, PullRequestReviewer, Repo +from augur.application.db.util import execute_session_query @celery.task(base=AugurCoreRepoCollectionTask) @@ -175,12 +176,18 @@ def collect_merge_request_reviewers(mr_ids, repo_git) -> int: logger = logging.getLogger(collect_merge_request_reviewers.__name__) with GitlabTaskManifest(logger) as manifest: + augur_db = manifest.augur_db + + query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) + repo_obj = execute_session_query(query, 'one') + repo_id = repo_obj.repo_id + url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/approvals".format(owner=owner, repo=repo, id="{id}") reviewers = retrieve_merge_request_data(mr_ids, url, "reviewers", owner, repo, manifest.key_auth, logger, response_type="dict") if reviewers: logger.info(f"Length of merge request reviewers: {len(reviewers)}") - #issue_ids = process_issues(issue_data, f"{owner}/{repo}: Gitlab Issue task", repo_id, logger, augur_db) + process_mr_reviewers(reviewers, f"{owner}/{repo}: Mr reviewer task", repo_id, logger, augur_db) else: logger.info(f"{owner}/{repo} has no gitlab merge request reviewers") @@ -196,14 +203,20 @@ def process_mr_reviewers(data, task_name, repo_id, logger, augur_db): for mr in mrs: mr_number_to_id_map[mr.pr_src_number] = mr.pull_request_id + all_reviewers = [] for id, values in data.items(): pull_request_id = 
mr_number_to_id_map[id] reviewers = extract_needed_pr_reviewer_data(values, pull_request_id, repo_id, tool_source, tool_version, data_source) - for review in reviewers: - print(review["pull_request_id"]) + all_reviewers += reviewers + + # TODO: Need to add unique key with pull_request_id and cntrb_id to insert gitlab reviewers + # pr_reviewer_natural_keys = ["pull_request_id", "cntrb_id"] + # augur_db.insert_data(all_reviewers, PullRequestReviewer, pr_reviewer_natural_keys) + + @celery.task(base=AugurCoreRepoCollectionTask) def collect_merge_request_commits(mr_ids, repo_git) -> int: From a7c319567849545c6bffcb3218ac5790594af81e Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Sat, 9 Dec 2023 13:05:59 -0600 Subject: [PATCH 16/25] Add processing for mr commits Signed-off-by: Andrew Brain --- augur/application/db/data_parse.py | 16 ++++++++++ augur/tasks/gitlab/merge_request_task.py | 40 +++++++++++++++++++++--- augur/tasks/start_tasks.py | 18 +++++------ 3 files changed, 61 insertions(+), 13 deletions(-) diff --git a/augur/application/db/data_parse.py b/augur/application/db/data_parse.py index 6e0e2ea840..ff7befeafa 100644 --- a/augur/application/db/data_parse.py +++ b/augur/application/db/data_parse.py @@ -756,6 +756,22 @@ def extract_needed_pr_reviewer_data(data: List[dict], pull_request_id, repo_id: return reviewer_dicts +def extract_needed_pr_commit_data(commit, repo_id, pull_request_id, tool_source, tool_version, data_source): + + commit = { + 'pull_request_id': pull_request_id, + 'pr_cmt_sha': commit['id'], + 'pr_cmt_node_id': None, + 'pr_cmt_message': commit['message'], + 'repo_id': repo_id, + 'tool_source': tool_source, + 'tool_version': tool_version, + 'data_source': data_source, + } + + return commit + + diff --git a/augur/tasks/gitlab/merge_request_task.py b/augur/tasks/gitlab/merge_request_task.py index c8116b33a3..fa590eeab0 100644 --- a/augur/tasks/gitlab/merge_request_task.py +++ b/augur/tasks/gitlab/merge_request_task.py @@ -4,9 +4,9 @@ from 
augur.tasks.init.celery_app import AugurCoreRepoCollectionTask from augur.tasks.gitlab.gitlab_api_handler import GitlabApiHandler from augur.tasks.gitlab.gitlab_task_session import GitlabTaskManifest -from augur.application.db.data_parse import extract_needed_pr_data_from_gitlab_merge_request, extract_needed_merge_request_assignee_data, extract_needed_mr_label_data, extract_needed_pr_reviewer_data +from augur.application.db.data_parse import extract_needed_pr_data_from_gitlab_merge_request, extract_needed_merge_request_assignee_data, extract_needed_mr_label_data, extract_needed_pr_reviewer_data, extract_needed_pr_commit_data from augur.tasks.github.util.util import get_owner_repo, add_key_value_pair_to_dicts -from augur.application.db.models import PullRequest, PullRequestAssignee, PullRequestLabel, PullRequestReviewer, Repo +from augur.application.db.models import PullRequest, PullRequestAssignee, PullRequestLabel, PullRequestReviewer, PullRequestCommit, Repo from augur.application.db.util import execute_session_query @@ -226,17 +226,49 @@ def collect_merge_request_commits(mr_ids, repo_git) -> int: logger = logging.getLogger(collect_merge_request_comments.__name__) with GitlabTaskManifest(logger) as manifest: + augur_db = manifest.augur_db + + query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) + repo_obj = execute_session_query(query, 'one') + repo_id = repo_obj.repo_id + url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/commits".format(owner=owner, repo=repo, id="{id}") commits = retrieve_merge_request_data(mr_ids, url, "commits", owner, repo, manifest.key_auth, logger, response_type="list") if commits: logger.info(f"Length of merge request commits: {len(commits)}") - logger.info(f"Mr commit: {commits[0]}") - #issue_ids = process_issues(issue_data, f"{owner}/{repo}: Gitlab Issue task", repo_id, logger, augur_db) + process_mr_commits(commits, f"{owner}/{repo}: Mr commit task", repo_id, logger, augur_db) else: 
logger.info(f"{owner}/{repo} has no gitlab merge request commits") +def process_mr_commits(data, task_name, repo_id, logger, augur_db): + + tool_source = "Mr Commit Task" + tool_version = "2.0" + data_source = "Gitlab API" + + # create mapping from mr number to pull request id of current mrs + mr_number_to_id_map = {} + mrs = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() + for mr in mrs: + mr_number_to_id_map[mr.pr_src_number] = mr.pull_request_id + + all_commits = [] + for id, values in data.items(): + + pull_request_id = mr_number_to_id_map[id] + + for commit in values: + + all_commits.append(extract_needed_pr_commit_data(commit, repo_id, pull_request_id, tool_source, tool_version, data_source)) + + + pr_commits_natural_keys = ["pull_request_id", "repo_id", "pr_cmt_sha"] + augur_db.insert_data(all_commits,PullRequestCommit,pr_commits_natural_keys) + + + @celery.task(base=AugurCoreRepoCollectionTask) def collect_merge_request_files(mr_ids, repo_git) -> int: diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index b9cb5dd8b6..1ad4769d90 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -102,17 +102,17 @@ def primary_repo_collect_phase_gitlab(repo_git): jobs = group( chain(collect_gitlab_merge_requests.si(repo_git), group( - collect_merge_request_comments.s(repo_git), - collect_merge_request_reviewers.s(repo_git), - collect_merge_request_metadata.s(repo_git), + #collect_merge_request_comments.s(repo_git), + #collect_merge_request_reviewers.s(repo_git), + #collect_merge_request_metadata.s(repo_git), collect_merge_request_commits.s(repo_git), - collect_merge_request_files.s(repo_git) + #collect_merge_request_files.s(repo_git), + # collect_merge_request_events.si(repo_git), )), - chain(collect_gitlab_issues.si(repo_git), group( - collect_gitlab_issue_comments.s(repo_git) - )), - collect_gitlab_issue_events.si(repo_git), - collect_merge_request_events.si(repo_git), + # 
chain(collect_gitlab_issues.si(repo_git), group( + # collect_gitlab_issue_comments.s(repo_git), + # collect_gitlab_issue_events.si(repo_git), + # )), ) return jobs From 1ca0855be599d31336bfb83c0586cd74462abd06 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Sat, 9 Dec 2023 13:20:27 -0600 Subject: [PATCH 17/25] Add processing for mr files Signed-off-by: Andrew Brain --- augur/application/db/data_parse.py | 29 +++++++++++++++++++ augur/tasks/gitlab/merge_request_task.py | 37 +++++++++++++++++++++--- 2 files changed, 62 insertions(+), 4 deletions(-) diff --git a/augur/application/db/data_parse.py b/augur/application/db/data_parse.py index ff7befeafa..ba58eeed3e 100644 --- a/augur/application/db/data_parse.py +++ b/augur/application/db/data_parse.py @@ -772,6 +772,35 @@ def extract_needed_pr_commit_data(commit, repo_id, pull_request_id, tool_source, return commit +def extract_needed_mr_file_data(gitlab_file_data, repo_id, pull_request_id, tool_source, tool_version, data_source): + + files = [] + + changes = gitlab_file_data["changes"] + for file_changes in changes: + try: + deletes = int(file_changes['diff'].split('@@')[1].strip().split(' ')[0].split(',')[1]) + adds = int(file_changes['diff'].split('@@')[1].strip().split(' ')[1].split(',')[1]) + except: + deletes = 0 + adds = 0 + + file_dict = { + 'pull_request_id': pull_request_id, + 'repo_id': repo_id, + 'pr_file_additions': adds, + 'pr_file_deletions': deletes, + 'pr_file_path': file_changes['old_path'], + 'tool_source': tool_source, + 'tool_version': tool_version, + 'data_source': data_source, + } + + files.append(file_dict) + + return files + + diff --git a/augur/tasks/gitlab/merge_request_task.py b/augur/tasks/gitlab/merge_request_task.py index fa590eeab0..759e82d083 100644 --- a/augur/tasks/gitlab/merge_request_task.py +++ b/augur/tasks/gitlab/merge_request_task.py @@ -4,9 +4,9 @@ from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask from augur.tasks.gitlab.gitlab_api_handler import 
GitlabApiHandler from augur.tasks.gitlab.gitlab_task_session import GitlabTaskManifest -from augur.application.db.data_parse import extract_needed_pr_data_from_gitlab_merge_request, extract_needed_merge_request_assignee_data, extract_needed_mr_label_data, extract_needed_pr_reviewer_data, extract_needed_pr_commit_data +from augur.application.db.data_parse import extract_needed_pr_data_from_gitlab_merge_request, extract_needed_merge_request_assignee_data, extract_needed_mr_label_data, extract_needed_pr_reviewer_data, extract_needed_pr_commit_data, extract_needed_mr_file_data from augur.tasks.github.util.util import get_owner_repo, add_key_value_pair_to_dicts -from augur.application.db.models import PullRequest, PullRequestAssignee, PullRequestLabel, PullRequestReviewer, PullRequestCommit, Repo +from augur.application.db.models import PullRequest, PullRequestAssignee, PullRequestLabel, PullRequestReviewer, PullRequestCommit, PullRequestFile, Repo from augur.application.db.util import execute_session_query @@ -264,6 +264,7 @@ def process_mr_commits(data, task_name, repo_id, logger, augur_db): all_commits.append(extract_needed_pr_commit_data(commit, repo_id, pull_request_id, tool_source, tool_version, data_source)) + logger.info(f"{task_name}: Inserting {len(all_commits)} merge request commits") pr_commits_natural_keys = ["pull_request_id", "repo_id", "pr_cmt_sha"] augur_db.insert_data(all_commits,PullRequestCommit,pr_commits_natural_keys) @@ -277,15 +278,43 @@ def collect_merge_request_files(mr_ids, repo_git) -> int: logger = logging.getLogger(collect_merge_request_comments.__name__) with GitlabTaskManifest(logger) as manifest: + augur_db = manifest.augur_db + + query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) + repo_obj = execute_session_query(query, 'one') + repo_id = repo_obj.repo_id + url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/changes".format(owner=owner, repo=repo, id="{id}") files = 
retrieve_merge_request_data(mr_ids, url, "files", owner, repo, manifest.key_auth, logger, response_type="dict") if files: logger.info(f"Length of merge request files: {len(files)}") - logger.info(f"Mr file: {files[0]}") - #issue_ids = process_issues(issue_data, f"{owner}/{repo}: Gitlab Issue task", repo_id, logger, augur_db) + process_mr_files(files, f"{owner}/{repo}: Mr files task", repo_id, logger, augur_db) else: logger.info(f"{owner}/{repo} has no gitlab merge request files") + +def process_mr_files(data, task_name, repo_id, logger, augur_db): + + tool_source = "Mr files Task" + tool_version = "2.0" + data_source = "Gitlab API" + + # create mapping from mr number to pull request id of current mrs + mr_number_to_id_map = {} + mrs = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() + for mr in mrs: + mr_number_to_id_map[mr.pr_src_number] = mr.pull_request_id + + all_files = [] + for id, gitlab_file_data in data.items(): + + pull_request_id = mr_number_to_id_map[id] + + all_files.extend(extract_needed_mr_file_data(gitlab_file_data, repo_id, pull_request_id, tool_source, tool_version, data_source)) + + logger.info(f"{task_name}: Inserting {len(all_files)} merge request files") + pr_file_natural_keys = ["pull_request_id", "repo_id", "pr_file_path"] + augur_db.insert_data(all_files, PullRequestFile, pr_file_natural_keys) def retrieve_merge_request_data(ids, url, name, owner, repo, key_auth, logger, response_type): From b45db1ef56cd90a8dc2fbe8966a7d07bb4ee4bbd Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Sat, 9 Dec 2023 13:37:47 -0600 Subject: [PATCH 18/25] Add processing for mr metadata Signed-off-by: Andrew Brain --- augur/application/db/data_parse.py | 42 +++++++++++++++++++++++- augur/tasks/gitlab/merge_request_task.py | 39 ++++++++++++++++++---- augur/tasks/start_tasks.py | 2 +- 3 files changed, 75 insertions(+), 8 deletions(-) diff --git a/augur/application/db/data_parse.py b/augur/application/db/data_parse.py index 
ba58eeed3e..6fa1bce16e 100644 --- a/augur/application/db/data_parse.py +++ b/augur/application/db/data_parse.py @@ -756,7 +756,7 @@ def extract_needed_pr_reviewer_data(data: List[dict], pull_request_id, repo_id: return reviewer_dicts -def extract_needed_pr_commit_data(commit, repo_id, pull_request_id, tool_source, tool_version, data_source): +def extract_needed_mr_commit_data(commit, repo_id, pull_request_id, tool_source, tool_version, data_source): commit = { 'pull_request_id': pull_request_id, @@ -801,6 +801,46 @@ def extract_needed_mr_file_data(gitlab_file_data, repo_id, pull_request_id, tool return files +def extract_needed_mr_metadata(mr_dict, repo_id, pull_request_id, tool_source, tool_version, data_source): + + head = {'sha': mr_dict['diff_refs']['head_sha'], + 'ref': mr_dict['target_branch'], + 'label': str(mr_dict['target_project_id']) + ':' + mr_dict['target_branch'], + 'author': mr_dict['author']['username'], + 'repo': str(mr_dict['target_project_id']) + } + + base = {'sha': mr_dict['diff_refs']['base_sha'], + 'ref': mr_dict['source_branch'], + 'label': str(mr_dict['source_project_id']) + ':' + mr_dict['source_branch'], + 'author': mr_dict['author']['username'], + 'repo': str(mr_dict['source_project_id']) + } + + pr_meta_dict = { + 'head': head, + 'base': base + } + all_meta = [] + for pr_side, pr_meta_data in pr_meta_dict.items(): + pr_meta = { + 'pull_request_id': pull_request_id, + 'repo_id': repo_id, + 'pr_head_or_base': pr_side, + 'pr_src_meta_label': pr_meta_data['label'], + 'pr_src_meta_ref': pr_meta_data['ref'], + 'pr_sha': pr_meta_data['sha'], + 'cntrb_id': None, + 'tool_source': tool_source, + 'tool_version': tool_version, + 'data_source': data_source + } + all_meta.append(pr_meta) + + return all_meta + + + diff --git a/augur/tasks/gitlab/merge_request_task.py b/augur/tasks/gitlab/merge_request_task.py index 759e82d083..4f92385871 100644 --- a/augur/tasks/gitlab/merge_request_task.py +++ b/augur/tasks/gitlab/merge_request_task.py @@ -4,9 +4,9 
@@ from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask from augur.tasks.gitlab.gitlab_api_handler import GitlabApiHandler from augur.tasks.gitlab.gitlab_task_session import GitlabTaskManifest -from augur.application.db.data_parse import extract_needed_pr_data_from_gitlab_merge_request, extract_needed_merge_request_assignee_data, extract_needed_mr_label_data, extract_needed_pr_reviewer_data, extract_needed_pr_commit_data, extract_needed_mr_file_data +from augur.application.db.data_parse import extract_needed_pr_data_from_gitlab_merge_request, extract_needed_merge_request_assignee_data, extract_needed_mr_label_data, extract_needed_pr_reviewer_data, extract_needed_mr_commit_data, extract_needed_mr_file_data, extract_needed_mr_metadata from augur.tasks.github.util.util import get_owner_repo, add_key_value_pair_to_dicts -from augur.application.db.models import PullRequest, PullRequestAssignee, PullRequestLabel, PullRequestReviewer, PullRequestCommit, PullRequestFile, Repo +from augur.application.db.models import PullRequest, PullRequestAssignee, PullRequestLabel, PullRequestReviewer, PullRequestMeta, PullRequestCommit, PullRequestFile, Repo from augur.application.db.util import execute_session_query @@ -156,17 +156,44 @@ def collect_merge_request_metadata(mr_ids, repo_git) -> int: logger = logging.getLogger(collect_merge_request_metadata.__name__) with GitlabTaskManifest(logger) as manifest: + augur_db = manifest.augur_db + + query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) + repo_obj = execute_session_query(query, 'one') + repo_id = repo_obj.repo_id + url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}".format(owner=owner, repo=repo, id="{id}") metadata_list = retrieve_merge_request_data(mr_ids, url, "metadata", owner, repo, manifest.key_auth, logger, response_type="dict") if metadata_list: logger.info(f"Length of merge request metadata: {len(metadata_list)}") - logger.info(f"Mr metadata: 
{metadata_list[0]}") - #issue_ids = process_issues(issue_data, f"{owner}/{repo}: Gitlab Issue task", repo_id, logger, augur_db) + process_mr_metadata(metadata_list, f"{owner}/{repo}: Mr metadata task", repo_id, logger, augur_db) else: logger.info(f"{owner}/{repo} has no gitlab merge request metadata") - +def process_mr_metadata(data, task_name, repo_id, logger, augur_db): + + tool_source = "Mr Metadata Task" + tool_version = "2.0" + data_source = "Gitlab API" + + # create mapping from mr number to pull request id of current mrs + mr_number_to_id_map = {} + mrs = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() + for mr in mrs: + mr_number_to_id_map[mr.pr_src_number] = mr.pull_request_id + + all_metadata = [] + for id, metadata in data.items(): + + pull_request_id = mr_number_to_id_map[id] + + all_metadata.extend(extract_needed_mr_metadata(metadata, repo_id, pull_request_id, tool_source, tool_version, data_source)) + + logger.info(f"{task_name}: Inserting {len(all_metadata)} merge request metadata") + pr_metadata_natural_keys = ['pull_request_id', 'pr_head_or_base', 'pr_sha'] + augur_db.insert_data(all_metadata, PullRequestMeta, pr_metadata_natural_keys) + @celery.task(base=AugurCoreRepoCollectionTask) def collect_merge_request_reviewers(mr_ids, repo_git) -> int: @@ -261,7 +288,7 @@ def process_mr_commits(data, task_name, repo_id, logger, augur_db): for commit in values: - all_commits.append(extract_needed_pr_commit_data(commit, repo_id, pull_request_id, tool_source, tool_version, data_source)) + all_commits.append(extract_needed_mr_commit_data(commit, repo_id, pull_request_id, tool_source, tool_version, data_source)) logger.info(f"{task_name}: Inserting {len(all_commits)} merge request commits") diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index 1ad4769d90..6c01e42b98 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -24,7 +24,7 @@ from 
augur.tasks.github.pull_requests.commits_model.tasks import process_pull_request_commits from augur.tasks.git.dependency_tasks.tasks import process_ossf_dependency_metrics from augur.tasks.github.traffic.tasks import collect_github_repo_clones_data -from augur.tasks.gitlab.merge_request_task import collect_gitlab_merge_requests, collect_merge_request_comments, collect_merge_request_events, collect_merge_request_metadata, collect_merge_request_reviewers, collect_merge_request_commits, collect_merge_request_files +from augur.tasks.gitlab.merge_request_task import collect_gitlab_merge_requests, collect_merge_request_comments, collect_merge_request_metadata, collect_merge_request_reviewers, collect_merge_request_commits, collect_merge_request_files from augur.tasks.gitlab.issues_task import collect_gitlab_issues, collect_gitlab_issue_comments from augur.tasks.gitlab.events_task import collect_gitlab_issue_events, collect_gitlab_merge_request_events from augur.tasks.git.facade_tasks import * From f43a9fd144c39f3260136b45a6802370fd2ec236 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Sat, 9 Dec 2023 14:14:15 -0600 Subject: [PATCH 19/25] Add processing for issues messages Signed-off-by: Andrew Brain --- augur/application/db/data_parse.py | 24 ++++++++ augur/tasks/gitlab/issues_task.py | 89 +++++++++++++++++++++++++++--- 2 files changed, 106 insertions(+), 7 deletions(-) diff --git a/augur/application/db/data_parse.py b/augur/application/db/data_parse.py index 6fa1bce16e..503fd58b60 100644 --- a/augur/application/db/data_parse.py +++ b/augur/application/db/data_parse.py @@ -840,8 +840,32 @@ def extract_needed_mr_metadata(mr_dict, repo_id, pull_request_id, tool_source, t return all_meta +def extract_needed_gitlab_issue_message_ref_data(message: dict, issue_id: int, repo_id: int, tool_source: str, tool_version: str, data_source: str) -> List[dict]: + message_ref_dict = { + 'issue_id': issue_id, + 'tool_source': tool_source, + 'tool_version': tool_version, + 'data_source': 
data_source, + 'issue_msg_ref_src_comment_id': int(message['id']), + 'issue_msg_ref_src_node_id': None, + 'repo_id': repo_id + } + + return message_ref_dict +def extract_needed_gitlab_message_data(comment: dict, platform_id: int, repo_id: int, tool_source: str, tool_version: str, data_source: str): + comment_dict = { + "pltfrm_id": platform_id, + "msg_text": comment['body'], + "msg_timestamp": comment['created_at'], + "cntrb_id": None, + "platform_msg_id": int(comment['id']), + "tool_source": tool_source, + "tool_version": tool_version, + "data_source": data_source + } + return comment_dict diff --git a/augur/tasks/gitlab/issues_task.py b/augur/tasks/gitlab/issues_task.py index 4f035a6024..eeb0f4ac34 100644 --- a/augur/tasks/gitlab/issues_task.py +++ b/augur/tasks/gitlab/issues_task.py @@ -5,11 +5,12 @@ from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask from augur.tasks.gitlab.gitlab_api_handler import GitlabApiHandler from augur.tasks.gitlab.gitlab_task_session import GitlabTaskManifest -from augur.application.db.data_parse import extract_needed_issue_data_from_gitlab_issue, extract_needed_gitlab_issue_label_data, extract_needed_gitlab_issue_assignee_data +from augur.application.db.data_parse import extract_needed_issue_data_from_gitlab_issue, extract_needed_gitlab_issue_label_data, extract_needed_gitlab_issue_assignee_data, extract_needed_gitlab_issue_message_ref_data, extract_needed_gitlab_message_data from augur.tasks.github.util.util import get_owner_repo, add_key_value_pair_to_dicts -from augur.application.db.models import Issue, IssueLabel, IssueAssignee, Repo +from augur.application.db.models import Issue, IssueLabel, IssueAssignee, IssueMessageRef, Message, Repo from augur.application.db.util import execute_session_query +platform_id = 2 @celery.task(base=AugurCoreRepoCollectionTask) def collect_gitlab_issues(repo_git : str) -> int: @@ -158,11 +159,17 @@ def collect_gitlab_issue_comments(issue_ids, repo_git) -> int: logger = 
logging.getLogger(collect_gitlab_issues.__name__) with GitlabTaskManifest(logger) as manifest: + augur_db = manifest.augur_db + + query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) + repo_obj = execute_session_query(query, 'one') + repo_id = repo_obj.repo_id + comments = retrieve_all_gitlab_issue_comments(manifest.key_auth, logger, issue_ids, repo_git) if comments: logger.info(f"Length of comments: {len(comments)}") - #issue_ids = process_issues(issue_data, f"{owner}/{repo}: Gitlab Issue task", repo_id, logger, augur_db) + process_gitlab_issue_messages(comments, f"{owner}/{repo}: Gitlab issue messages task", repo_id, logger, augur_db) else: logger.info(f"{owner}/{repo} has no gitlab issue comments") @@ -171,26 +178,94 @@ def retrieve_all_gitlab_issue_comments(key_auth, logger, issue_ids, repo_git): owner, repo = get_owner_repo(repo_git) - all_comments = [] + all_comments = {} issue_count = len(issue_ids) index = 1 + + comments = GitlabApiHandler(key_auth, logger) + for id in issue_ids: + if len(all_comments) > 10: + return all_comments + print(f"Collecting {owner}/{repo} gitlab issue comments for issue {index} of {issue_count}") url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/issues/{id}/notes" - comments = GitlabApiHandler(key_auth, logger) - + for page_data, page in comments.iter_pages(url): if page_data is None or len(page_data) == 0: break - all_comments += page_data + if id in all_comments: + all_comments[id].extend(page_data) + else: + all_comments[id] = page_data index += 1 return all_comments +def process_gitlab_issue_messages(data, task_name, repo_id, logger, augur_db): + + tool_source = "Gitlab issue comments" + tool_version = "2.0" + data_source = "Gitlab API" + + # create mapping from mr number to pull request id of current mrs + issue_number_to_id_map = {} + issues = augur_db.session.query(Issue).filter(Issue.repo_id == repo_id).all() + for issue in issues: + issue_number_to_id_map[issue.gh_issue_number] = issue.issue_id 
+ + message_dicts = [] + message_ref_mapping_data = {} + for id, messages in data.items(): + + try: + issue_id = issue_number_to_id_map[id] + except KeyError: + logger.info(f"{task_name}: Could not find related issue") + logger.info(f"{task_name}: We were searching for issue number {id} in repo {repo_id}") + logger.info(f"{task_name}: Skipping") + continue + + for message in messages: + + issue_message_ref_data = extract_needed_gitlab_issue_message_ref_data(message, issue_id, repo_id, tool_source, tool_version, data_source) + + message_ref_mapping_data[message["id"]] = { + "msg_ref_data": issue_message_ref_data + } + + message_dicts.append( + extract_needed_gitlab_message_data(message, platform_id, repo_id, tool_source, tool_version, data_source) + ) + + + logger.info(f"{task_name}: Inserting {len(message_dicts)} messages") + message_natural_keys = ["platform_msg_id"] + message_return_columns = ["msg_id", "platform_msg_id"] + message_string_fields = ["msg_text"] + message_return_data = augur_db.insert_data(message_dicts, Message, message_natural_keys, + return_columns=message_return_columns, string_fields=message_string_fields) + + issue_message_ref_dicts = [] + for data in message_return_data: + + augur_msg_id = data["msg_id"] + platform_message_id = data["platform_msg_id"] + + ref = message_ref_mapping_data[platform_message_id] + message_ref_data = ref["msg_ref_data"] + message_ref_data["msg_id"] = augur_msg_id + + issue_message_ref_dicts.append(message_ref_data) + + logger.info(f"{task_name}: Inserting {len(issue_message_ref_dicts)} gitlab issue messages ref rows") + issue_message_ref_natural_keys = ["issue_id", "issue_msg_ref_src_comment_id"] + augur_db.insert_data(issue_message_ref_dicts, IssueMessageRef, issue_message_ref_natural_keys) + From 3c096d72538dc8a663626872bca171a00e9ff938 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Sat, 9 Dec 2023 14:26:44 -0600 Subject: [PATCH 20/25] Add processing for mr messages Signed-off-by: Andrew Brain --- 
augur/application/db/data_parse.py | 16 +++++ augur/tasks/gitlab/merge_request_task.py | 76 ++++++++++++++++++++++-- 2 files changed, 86 insertions(+), 6 deletions(-) diff --git a/augur/application/db/data_parse.py b/augur/application/db/data_parse.py index 503fd58b60..3127086d4f 100644 --- a/augur/application/db/data_parse.py +++ b/augur/application/db/data_parse.py @@ -869,3 +869,19 @@ def extract_needed_gitlab_message_data(comment: dict, platform_id: int, repo_id: } return comment_dict + +# retrieve only the needed data for pr labels from the api response +def extract_needed_gitlab_mr_message_ref_data(comment: dict, pull_request_id: int, repo_id: int, tool_source: str, tool_version: str, data_source: str) -> List[dict]: + + pr_msg_ref = { + 'pull_request_id': pull_request_id, + 'pr_message_ref_src_comment_id': comment['id'], + 'repo_id': repo_id, + 'pr_message_ref_src_node_id': None, + 'tool_source': tool_source, + 'tool_version': tool_version, + 'data_source': data_source + } + + return pr_msg_ref + diff --git a/augur/tasks/gitlab/merge_request_task.py b/augur/tasks/gitlab/merge_request_task.py index 4f92385871..cab648fdb3 100644 --- a/augur/tasks/gitlab/merge_request_task.py +++ b/augur/tasks/gitlab/merge_request_task.py @@ -4,11 +4,12 @@ from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask from augur.tasks.gitlab.gitlab_api_handler import GitlabApiHandler from augur.tasks.gitlab.gitlab_task_session import GitlabTaskManifest -from augur.application.db.data_parse import extract_needed_pr_data_from_gitlab_merge_request, extract_needed_merge_request_assignee_data, extract_needed_mr_label_data, extract_needed_pr_reviewer_data, extract_needed_mr_commit_data, extract_needed_mr_file_data, extract_needed_mr_metadata +from augur.application.db.data_parse import extract_needed_pr_data_from_gitlab_merge_request, extract_needed_merge_request_assignee_data, extract_needed_mr_label_data, extract_needed_pr_reviewer_data, extract_needed_mr_commit_data, 
extract_needed_mr_file_data, extract_needed_mr_metadata, extract_needed_gitlab_mr_message_ref_data, extract_needed_gitlab_message_data from augur.tasks.github.util.util import get_owner_repo, add_key_value_pair_to_dicts -from augur.application.db.models import PullRequest, PullRequestAssignee, PullRequestLabel, PullRequestReviewer, PullRequestMeta, PullRequestCommit, PullRequestFile, Repo +from augur.application.db.models import PullRequest, PullRequestAssignee, PullRequestLabel, PullRequestReviewer, PullRequestMeta, PullRequestCommit, PullRequestFile, PullRequestMessageRef, Repo, Message from augur.application.db.util import execute_session_query +platform_id = 2 @celery.task(base=AugurCoreRepoCollectionTask) def collect_gitlab_merge_requests(repo_git: str) -> int: @@ -134,18 +135,81 @@ def collect_merge_request_comments(mr_ids, repo_git) -> int: logger = logging.getLogger(collect_merge_request_comments.__name__) with GitlabTaskManifest(logger) as manifest: + augur_db = manifest.augur_db + + query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) + repo_obj = execute_session_query(query, 'one') + repo_id = repo_obj.repo_id + url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/notes".format(owner=owner, repo=repo, id="{id}") comments = retrieve_merge_request_data(mr_ids, url, "comments", owner, repo, manifest.key_auth, logger, response_type="list") if comments: logger.info(f"Length of merge request comments: {len(comments)}") - logger.info(f"Mr comment: {comments[0]}") - #issue_ids = process_issues(issue_data, f"{owner}/{repo}: Gitlab Issue task", repo_id, logger, augur_db) + process_gitlab_mr_messages(comments, f"{owner}/{repo}: Gitlab mr messages task", repo_id, logger, augur_db) else: logger.info(f"{owner}/{repo} has no gitlab merge request comments") +def process_gitlab_mr_messages(data, task_name, repo_id, logger, augur_db): + + tool_source = "Gitlab mr comments" + tool_version = "2.0" + data_source = "Gitlab API" + + 
# create mapping from mr number to pull request id of current mrs + mr_number_to_id_map = {} + mrs = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() + for mr in mrs: + mr_number_to_id_map[mr.pr_src_number] = mr.pull_request_id + + message_dicts = [] + message_ref_mapping_data = {} + for id, messages in data.items(): + + try: + pull_request_id = mr_number_to_id_map[id] + except KeyError: + logger.info(f"{task_name}: Could not find related mr") + logger.info(f"{task_name}: We were searching for mr number {id} in repo {repo_id}") + logger.info(f"{task_name}: Skipping") + continue + + for message in messages: + + mr_message_ref_data = extract_needed_gitlab_mr_message_ref_data(message, pull_request_id, repo_id, tool_source, tool_version, data_source) + + message_ref_mapping_data[message["id"]] = { + "msg_ref_data": mr_message_ref_data + } + + message_dicts.append( + extract_needed_gitlab_message_data(message, platform_id, repo_id, tool_source, tool_version, data_source) + ) + + + logger.info(f"{task_name}: Inserting {len(message_dicts)} messages") + message_natural_keys = ["platform_msg_id"] + message_return_columns = ["msg_id", "platform_msg_id"] + message_string_fields = ["msg_text"] + message_return_data = augur_db.insert_data(message_dicts, Message, message_natural_keys, + return_columns=message_return_columns, string_fields=message_string_fields) + + mr_message_ref_dicts = [] + for data in message_return_data: + + augur_msg_id = data["msg_id"] + platform_message_id = data["platform_msg_id"] + + ref = message_ref_mapping_data[platform_message_id] + message_ref_data = ref["msg_ref_data"] + message_ref_data["msg_id"] = augur_msg_id + + mr_message_ref_dicts.append(message_ref_data) + logger.info(f"{task_name}: Inserting {len(mr_message_ref_dicts)} mr messages ref rows") + mr_message_ref_natural_keys = ["pull_request_id", "pr_message_ref_src_comment_id"] + augur_db.insert_data(mr_message_ref_dicts, PullRequestMessageRef, 
mr_message_ref_natural_keys) @celery.task(base=AugurCoreRepoCollectionTask) @@ -347,7 +411,7 @@ def process_mr_files(data, task_name, repo_id, logger, augur_db): def retrieve_merge_request_data(ids, url, name, owner, repo, key_auth, logger, response_type): all_data = {} - issue_count = len(ids) + mr_count = len(ids) index = 1 api_handler = GitlabApiHandler(key_auth, logger) @@ -356,7 +420,7 @@ def retrieve_merge_request_data(ids, url, name, owner, repo, key_auth, logger, r if len(all_data) > 10: return all_data - print(f"Collecting {owner}/{repo} gitlab merge request {name} for merge request {index} of {issue_count}") + print(f"Collecting {owner}/{repo} gitlab merge request {name} for merge request {index} of {mr_count}") formatted_url = url.format(id=id) if response_type == "dict": From 99e94c91622d803d6053d5b3495a0e859ae8d6a6 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Sat, 9 Dec 2023 14:41:08 -0600 Subject: [PATCH 21/25] Comment out assignee inserts and update tasks that are running Signed-off-by: Andrew Brain --- augur/tasks/gitlab/issues_task.py | 4 ++-- augur/tasks/gitlab/merge_request_task.py | 4 ++-- augur/tasks/start_tasks.py | 16 ++++++++-------- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/augur/tasks/gitlab/issues_task.py b/augur/tasks/gitlab/issues_task.py index eeb0f4ac34..5ffc633d0e 100644 --- a/augur/tasks/gitlab/issues_task.py +++ b/augur/tasks/gitlab/issues_task.py @@ -144,8 +144,8 @@ def process_issues(issues, task_name, repo_id, logger, augur_db) -> None: # inserting issue assignees # we are using issue_assignee_src_id and issue_id to determine if the label is already in the database. 
- issue_assignee_natural_keys = ['issue_assignee_src_id', 'issue_id'] - augur_db.insert_data(issue_assignee_dicts, IssueAssignee, issue_assignee_natural_keys) + # issue_assignee_natural_keys = ['issue_assignee_src_id', 'issue_id'] + # augur_db.insert_data(issue_assignee_dicts, IssueAssignee, issue_assignee_natural_keys) return issue_ids diff --git a/augur/tasks/gitlab/merge_request_task.py b/augur/tasks/gitlab/merge_request_task.py index cab648fdb3..adce8f60c7 100644 --- a/augur/tasks/gitlab/merge_request_task.py +++ b/augur/tasks/gitlab/merge_request_task.py @@ -116,8 +116,8 @@ def process_merge_requests(data, task_name, repo_id, logger, augur_db): logger.info(f"{task_name}: Inserting other pr data of lengths: Labels: {len(mr_label_dicts)} - Assignees: {len(mr_assignee_dicts)}") # TODO: Setup unique key on asignees with a value of ('cntrb_id', 'pull_request_id') and add 'cntrb_id' to assingee data - mr_assignee_natural_keys = ['pr_assignee_src_id', 'pull_request_id'] - augur_db.insert_data(mr_assignee_dicts, PullRequestAssignee, mr_assignee_natural_keys) + # mr_assignee_natural_keys = ['pr_assignee_src_id', 'pull_request_id'] + # augur_db.insert_data(mr_assignee_dicts, PullRequestAssignee, mr_assignee_natural_keys) pr_label_natural_keys = ['pr_src_id', 'pull_request_id'] pr_label_string_fields = ["pr_src_description"] diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index 6c01e42b98..c11ea76d53 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -102,17 +102,17 @@ def primary_repo_collect_phase_gitlab(repo_git): jobs = group( chain(collect_gitlab_merge_requests.si(repo_git), group( - #collect_merge_request_comments.s(repo_git), + collect_merge_request_comments.s(repo_git), #collect_merge_request_reviewers.s(repo_git), - #collect_merge_request_metadata.s(repo_git), + collect_merge_request_metadata.s(repo_git), collect_merge_request_commits.s(repo_git), - #collect_merge_request_files.s(repo_git), - # 
collect_merge_request_events.si(repo_git), + collect_merge_request_files.s(repo_git), + collect_gitlab_merge_request_events.si(repo_git), )), - # chain(collect_gitlab_issues.si(repo_git), group( - # collect_gitlab_issue_comments.s(repo_git), - # collect_gitlab_issue_events.si(repo_git), - # )), + chain(collect_gitlab_issues.si(repo_git), group( + #collect_gitlab_issue_comments.s(repo_git), + collect_gitlab_issue_events.si(repo_git), + # )), ) return jobs From a0234084745cfbf44336c9838e1d919fc24e4d72 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Sat, 9 Dec 2023 14:42:07 -0600 Subject: [PATCH 22/25] comment out message collection Signed-off-by: Andrew Brain --- augur/tasks/start_tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index c11ea76d53..c09d89dbba 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -102,7 +102,7 @@ def primary_repo_collect_phase_gitlab(repo_git): jobs = group( chain(collect_gitlab_merge_requests.si(repo_git), group( - collect_merge_request_comments.s(repo_git), + #collect_merge_request_comments.s(repo_git), #collect_merge_request_reviewers.s(repo_git), collect_merge_request_metadata.s(repo_git), collect_merge_request_commits.s(repo_git), From ba24d20dc362ced5698ea48d282d56c1e2664d38 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Sat, 9 Dec 2023 14:45:06 -0600 Subject: [PATCH 23/25] Remove logic that stopped the collection early Signed-off-by: Andrew Brain --- augur/tasks/gitlab/issues_task.py | 3 --- augur/tasks/gitlab/merge_request_task.py | 3 --- 2 files changed, 6 deletions(-) diff --git a/augur/tasks/gitlab/issues_task.py b/augur/tasks/gitlab/issues_task.py index 5ffc633d0e..7f0c7787ee 100644 --- a/augur/tasks/gitlab/issues_task.py +++ b/augur/tasks/gitlab/issues_task.py @@ -186,9 +186,6 @@ def retrieve_all_gitlab_issue_comments(key_auth, logger, issue_ids, repo_git): for id in issue_ids: - if len(all_comments) > 10: - return 
all_comments - print(f"Collecting {owner}/{repo} gitlab issue comments for issue {index} of {issue_count}") url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/issues/{id}/notes" diff --git a/augur/tasks/gitlab/merge_request_task.py b/augur/tasks/gitlab/merge_request_task.py index adce8f60c7..6d69def093 100644 --- a/augur/tasks/gitlab/merge_request_task.py +++ b/augur/tasks/gitlab/merge_request_task.py @@ -416,9 +416,6 @@ def retrieve_merge_request_data(ids, url, name, owner, repo, key_auth, logger, r api_handler = GitlabApiHandler(key_auth, logger) for id in ids: - - if len(all_data) > 10: - return all_data print(f"Collecting {owner}/{repo} gitlab merge request {name} for merge request {index} of {mr_count}") formatted_url = url.format(id=id) From 13eb4498b31f386754133c2f3718231e1fec3582 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Sat, 9 Dec 2023 15:21:42 -0600 Subject: [PATCH 24/25] Fix small bugs Signed-off-by: Andrew Brain --- augur/tasks/gitlab/merge_request_task.py | 4 ++-- augur/tasks/start_tasks.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/augur/tasks/gitlab/merge_request_task.py b/augur/tasks/gitlab/merge_request_task.py index 6d69def093..bb27ecdafb 100644 --- a/augur/tasks/gitlab/merge_request_task.py +++ b/augur/tasks/gitlab/merge_request_task.py @@ -314,7 +314,7 @@ def collect_merge_request_commits(mr_ids, repo_git) -> int: owner, repo = get_owner_repo(repo_git) - logger = logging.getLogger(collect_merge_request_comments.__name__) + logger = logging.getLogger(collect_merge_request_commits.__name__) with GitlabTaskManifest(logger) as manifest: augur_db = manifest.augur_db @@ -366,7 +366,7 @@ def collect_merge_request_files(mr_ids, repo_git) -> int: owner, repo = get_owner_repo(repo_git) - logger = logging.getLogger(collect_merge_request_comments.__name__) + logger = logging.getLogger(collect_merge_request_files.__name__) with GitlabTaskManifest(logger) as manifest: augur_db = manifest.augur_db diff --git 
a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index c09d89dbba..e13a2a9b83 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -106,13 +106,13 @@ def primary_repo_collect_phase_gitlab(repo_git): #collect_merge_request_reviewers.s(repo_git), collect_merge_request_metadata.s(repo_git), collect_merge_request_commits.s(repo_git), - collect_merge_request_files.s(repo_git), + #collect_merge_request_files.s(repo_git), collect_gitlab_merge_request_events.si(repo_git), )), chain(collect_gitlab_issues.si(repo_git), group( #collect_gitlab_issue_comments.s(repo_git), collect_gitlab_issue_events.si(repo_git), - # )), + )), ) return jobs From 05b53ecc1ac78efe88a7e8c586f3fb5c158168c4 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Sat, 9 Dec 2023 15:40:07 -0600 Subject: [PATCH 25/25] Fix bug in gitlab api handler Signed-off-by: Andrew Brain --- augur/tasks/gitlab/gitlab_api_handler.py | 3 +++ augur/tasks/start_tasks.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/augur/tasks/gitlab/gitlab_api_handler.py b/augur/tasks/gitlab/gitlab_api_handler.py index 3a463127bb..8f111d3c48 100644 --- a/augur/tasks/gitlab/gitlab_api_handler.py +++ b/augur/tasks/gitlab/gitlab_api_handler.py @@ -171,6 +171,8 @@ def retrieve_data(self, url: str) -> Tuple[Optional[List[dict]], Optional[httpx. response = hit_api(self.key_manager, url, self.logger, timeout) + num_attempts += 1 + if response is None: if timeout_count == 10: self.logger.error(f"Request timed out 10 times for {url}") @@ -211,6 +213,7 @@ def retrieve_data(self, url: str) -> Tuple[Optional[List[dict]], Optional[httpx. return page_data, response, GitlabApiResult.SUCCESS self.logger.warning(f"Unhandled gitlab response. Status code: {response.status_code}. 
Body: {response.json()}") + self.logger.error("Unable to collect data in 10 attempts") diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index e13a2a9b83..10f04e40b7 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -106,7 +106,7 @@ def primary_repo_collect_phase_gitlab(repo_git): #collect_merge_request_reviewers.s(repo_git), collect_merge_request_metadata.s(repo_git), collect_merge_request_commits.s(repo_git), - #collect_merge_request_files.s(repo_git), + collect_merge_request_files.s(repo_git), collect_gitlab_merge_request_events.si(repo_git), )), chain(collect_gitlab_issues.si(repo_git), group(