From a34bbff873a957d38c751a16ce71c5986b001444 Mon Sep 17 00:00:00 2001 From: Miranda Mundt Date: Wed, 31 Jan 2024 17:53:07 -0700 Subject: [PATCH 1/4] Create contributors data gathering script --- scripts/admin/contributors.py | 191 ++++++++++++++++++++++++++++++++++ 1 file changed, 191 insertions(+) create mode 100644 scripts/admin/contributors.py diff --git a/scripts/admin/contributors.py b/scripts/admin/contributors.py new file mode 100644 index 00000000000..2a23c86ed88 --- /dev/null +++ b/scripts/admin/contributors.py @@ -0,0 +1,191 @@ +""" +This script is intended to query the GitHub REST API and get contributor +information for a given time period. +""" + +import sys +import pprint + +from datetime import datetime +from os import environ +from time import perf_counter +from github import Github, Auth + + +def collect_contributors(repository, start_date, end_date): + """ + Return contributor information for a repository in a given timeframe + + Parameters + ---------- + repository : String + The org/repo combination for target repository (GitHub). E.g., + IDAES/idaes-pse + start_date : String + Start date in YYYY-MM-DD. + end_date : String + End date in YYYY-MM-DD. + + Returns + ------- + contributor_information : Dict + A dictionary with contributor information including Authors, Reviewers, + Committers, and Pull Requests. 
+ + """ + # Create data structure + contributor_information = {} + contributor_information['Pull Requests'] = {} + contributor_information['Authors'] = {} + contributor_information['Reviewers'] = {} + contributor_information['Commits'] = {} + # Collect the authorization token from the user's environment + token = environ.get('GH_TOKEN') + auth_token = Auth.Token(token) + # Create a connection to GitHub + gh = Github(auth=auth_token) + # Create a repository object for the requested repository + repo = gh.get_repo(repository) + commits = repo.get_commits(since=start_date, until=end_date) + # Search the commits between the two dates for those that match the string; + # this is the default pull request merge message. If a team uses a custom + # message, this will not work. + merged_prs = [ + int( + commit.commit.message.replace('Merge pull request #', '').split(' from ')[0] + ) + for commit in commits + if commit.commit.message.startswith("Merge pull request") + ] + # Count the number of commits from each person within the two dates + for commit in commits: + try: + if commit.author.login in contributor_information['Commits'].keys(): + contributor_information['Commits'][commit.author.login] += 1 + else: + contributor_information['Commits'][commit.author.login] = 1 + except AttributeError: + # Sometimes GitHub returns an author who doesn't have a handle, + # which seems impossible but happens. In that case, we just record + # their "human-readable" name + if commit.commit.author.name in contributor_information['Commits'].keys(): + contributor_information['Commits'][commit.commit.author.name] += 1 + else: + contributor_information['Commits'][commit.commit.author.name] = 1 + + author_tags = set() + reviewer_tags = set() + for num in merged_prs: + try: + # sometimes the commit messages can lie and give a PR number + # for a different repository fork/branch. + # We try to query it, and if it doesn't work, whatever, move on. 
+ pr = repo.get_pull(num) + except: + continue + # Sometimes the user does not have a handle recorded by GitHub. + # In this case, we replace it with "NOTFOUND" so the person running + # the code knows to go inspect it manually. + author_tag = pr.user.login + if author_tag is None: + author_tag = "NOTFOUND" + # Count the number of PRs authored by each person + if author_tag in author_tags: + contributor_information['Authors'][author_tag] += 1 + else: + contributor_information['Authors'][author_tag] = 1 + author_tags.add(author_tag) + + # Now we inspect all of the reviews to see who engaged in reviewing + # this specific PR + reviews = pr.get_reviews() + review_tags = set(review.user.login for review in reviews) + # Count how many PRs this person has reviewed + for tag in review_tags: + if tag in reviewer_tags: + contributor_information['Reviewers'][tag] += 1 + else: + contributor_information['Reviewers'][tag] = 1 + reviewer_tags.update(review_tags) + contributor_information['Pull Requests'][num] = { + 'author': author_tag, + 'reviewers': review_tags, + } + # This portion replaces tags with human-readable names, if they are present, + # so as to remove the step of "Who does that handle belong to?" 
+ all_tags = author_tags.union(reviewer_tags) + tag_name_map = {} + for tag in all_tags: + if tag in tag_name_map.keys(): + continue + name = gh.search_users(tag + ' in:login')[0].name + # If they don't have a name listed, just keep the tag + if name is not None: + tag_name_map[tag] = name + for key in tag_name_map.keys(): + if key in contributor_information['Authors'].keys(): + contributor_information['Authors'][tag_name_map[key]] = ( + contributor_information['Authors'].pop(key) + ) + if key in contributor_information['Reviewers'].keys(): + contributor_information['Reviewers'][tag_name_map[key]] = ( + contributor_information['Reviewers'].pop(key) + ) + return contributor_information + + +if __name__ == '__main__': + if len(sys.argv) != 4: + print(f"Usage: {sys.argv[0]} ") + print( + " : the GitHub organization/repository combo (e.g., Pyomo/pyomo)" + ) + print( + " : date from which to start exploring contributors in YYYY-MM-DD" + ) + print( + " : date at which to stop exploring contributors in YYYY-MM-DD" + ) + print("") + print( + "ALSO REQUIRED: Please generate a GitHub token (with repo permissions) and export to the environment variable GH_TOKEN." 
+ ) + print(" Visit GitHub's official documentation for more details.") + sys.exit(1) + repository = sys.argv[1] + try: + start = sys.argv[2].split('-') + year = int(start[0]) + try: + month = int(start[1]) + except SyntaxError: + month = int(start[1][1]) + try: + day = int(start[2]) + except SyntaxError: + day = int(start[2][1]) + start_date = datetime(year, month, day) + except: + print("Ensure that the start date is in YYYY-MM-DD format.") + sys.exit(1) + try: + end = sys.argv[3].split('-') + year = int(end[0]) + try: + month = int(end[1]) + except SyntaxError: + month = int(end[1][1]) + try: + day = int(end[2]) + except SyntaxError: + day = int(end[2][1]) + end_date = datetime(year, month, day) + except: + print("Ensure that the end date is in YYYY-MM-DD format.") + sys.exit(1) + tic = perf_counter() + contrib_info = collect_contributors(repository, start_date, end_date) + toc = perf_counter() + print(f"\nCOLLECTION COMPLETE. Time to completion: {toc - tic:0.4f} seconds") + print(f"\nContributors between {sys.argv[2]} and {sys.argv[3]}:") + pprint.pprint(contrib_info) From d61a5cb82e64052f77ef046f609c82625caf89f0 Mon Sep 17 00:00:00 2001 From: Miranda Mundt Date: Thu, 1 Feb 2024 11:28:10 -0700 Subject: [PATCH 2/4] Add README and copyright; update commit search for squashed commit regex --- scripts/admin/README.md | 28 ++++++++++++++++++++ scripts/admin/contributors.py | 50 ++++++++++++++++++++++++++++++++--- 2 files changed, 74 insertions(+), 4 deletions(-) create mode 100644 scripts/admin/README.md diff --git a/scripts/admin/README.md b/scripts/admin/README.md new file mode 100644 index 00000000000..50ad2020b94 --- /dev/null +++ b/scripts/admin/README.md @@ -0,0 +1,28 @@ +# Contributors Script + +The `contributors.py` script is intended to be used to determine contributors +to a public GitHub repository within a given time frame. + +## Requirements + +1. Python 3.7+ +1. [PyGithub](https://pypi.org/project/PyGithub/) +1. 
A [GitHub Access Token](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens) with `repo` access, exported to the environment variable `GH_TOKEN` + +## Usage + +``` +Usage: contributors.py <repository> <start_date> <end_date> + <repository> : the GitHub organization/repository combo (e.g., Pyomo/pyomo) + <start_date> : date from which to start exploring contributors in YYYY-MM-DD + <end_date> : date at which to stop exploring contributors in YYYY-MM-DD + +ALSO REQUIRED: Please generate a GitHub token (with repo permissions) and export to the environment variable GH_TOKEN. + Visit GitHub's official documentation for more details. +``` + +## Results + +A list of contributors will print to the terminal upon completion. More detailed +information, including authors, committers, reviewers, and pull requests, can +be found in the `contributors-start_date-end_date.json` generated file. diff --git a/scripts/admin/contributors.py b/scripts/admin/contributors.py index 2a23c86ed88..3b416b632cd 100644 --- a/scripts/admin/contributors.py +++ b/scripts/admin/contributors.py @@ -1,10 +1,22 @@ +# ___________________________________________________________________________ +# +# Pyomo: Python Optimization Modeling Objects +# Copyright (c) 2008-2022 +# National Technology and Engineering Solutions of Sandia, LLC +# Under the terms of Contract DE-NA0003525 with National Technology and +# Engineering Solutions of Sandia, LLC, the U.S. Government retains certain +# rights in this software. +# This software is distributed under the 3-clause BSD License. +# ___________________________________________________________________________ + """ This script is intended to query the GitHub REST API and get contributor information for a given time period.
""" import sys -import pprint +import re +import json from datetime import datetime from os import environ @@ -57,6 +69,14 @@ def collect_contributors(repository, start_date, end_date): for commit in commits if commit.commit.message.startswith("Merge pull request") ] + if not merged_prs: + regex_pattern = '\(#.*\)' + for commit in commits: + results = re.search(regex_pattern, commit.commit.message) + try: + merged_prs.append(int(results.group().replace('(#', '').split(')')[0])) + except AttributeError: + continue # Count the number of commits from each person within the two dates for commit in commits: try: @@ -115,6 +135,7 @@ def collect_contributors(repository, start_date, end_date): # so as to remove the step of "Who does that handle belong to?" all_tags = author_tags.union(reviewer_tags) tag_name_map = {} + only_tag_available = [] for tag in all_tags: if tag in tag_name_map.keys(): continue @@ -122,6 +143,8 @@ def collect_contributors(repository, start_date, end_date): # If they don't have a name listed, just keep the tag if name is not None: tag_name_map[tag] = name + else: + only_tag_available.append(tag) for key in tag_name_map.keys(): if key in contributor_information['Authors'].keys(): contributor_information['Authors'][tag_name_map[key]] = ( @@ -131,7 +154,16 @@ def collect_contributors(repository, start_date, end_date): contributor_information['Reviewers'][tag_name_map[key]] = ( contributor_information['Reviewers'].pop(key) ) - return contributor_information + return contributor_information, tag_name_map, only_tag_available + + +def set_default(obj): + """ + Converts sets to list for JSON dump + """ + if isinstance(obj, set): + return list(obj) + raise TypeError if __name__ == '__main__': @@ -183,9 +215,19 @@ def collect_contributors(repository, start_date, end_date): except: print("Ensure that the end date is in YYYY-MM-DD format.") sys.exit(1) + print('BEGIN DATA COLLECTION... 
(this can take some time)') tic = perf_counter() - contrib_info = collect_contributors(repository, start_date, end_date) + contrib_info, author_name_map, tags_only = collect_contributors( + repository, start_date, end_date + ) toc = perf_counter() print(f"\nCOLLECTION COMPLETE. Time to completion: {toc - tic:0.4f} seconds") print(f"\nContributors between {sys.argv[2]} and {sys.argv[3]}:") - pprint.pprint(contrib_info) + for item in author_name_map.values(): + print(item) + print("\nOnly GitHub handles are available for the following contributors:") + for tag in tags_only: + print(tag) + json_filename = f"contributors-{sys.argv[2]}-{sys.argv[3]}.json" + with open(json_filename, 'w') as file: + json.dump(contrib_info, file, default=set_default) From 6a4b14b0cd934e88a5d413ffd3efa3ac9cf52371 Mon Sep 17 00:00:00 2001 From: Miranda Mundt Date: Thu, 1 Feb 2024 11:34:20 -0700 Subject: [PATCH 3/4] Add one more helpful print message --- scripts/admin/contributors.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/admin/contributors.py b/scripts/admin/contributors.py index 3b416b632cd..3ea20f61bb3 100644 --- a/scripts/admin/contributors.py +++ b/scripts/admin/contributors.py @@ -231,3 +231,4 @@ def set_default(obj): json_filename = f"contributors-{sys.argv[2]}-{sys.argv[3]}.json" with open(json_filename, 'w') as file: json.dump(contrib_info, file, default=set_default) + print(f"\nDetailed information can be found in {json_filename}.") From e7b9cf495eed29da24ab7eec3c265567261a3422 Mon Sep 17 00:00:00 2001 From: Miranda Mundt Date: Thu, 1 Feb 2024 12:14:34 -0700 Subject: [PATCH 4/4] Add reponame to filename; check if file exists; update comments --- scripts/admin/contributors.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/scripts/admin/contributors.py b/scripts/admin/contributors.py index 3ea20f61bb3..fe5d483f16d 100644 --- a/scripts/admin/contributors.py +++ b/scripts/admin/contributors.py @@ -17,6 +17,7 @@ import sys import 
re import json +import os from datetime import datetime from os import environ @@ -43,6 +44,12 @@ def collect_contributors(repository, start_date, end_date): contributor_information : Dict A dictionary with contributor information including Authors, Reviewers, Committers, and Pull Requests. + tag_name_map : Dict + A dictionary that maps GitHub handles to GitHub display names (if they + exist). + only_tag_available : List + A list of the handles for contributors who do not have GitHub display names + available. """ # Create data structure @@ -60,8 +67,8 @@ def collect_contributors(repository, start_date, end_date): repo = gh.get_repo(repository) commits = repo.get_commits(since=start_date, until=end_date) # Search the commits between the two dates for those that match the string; - # this is the default pull request merge message. If a team uses a custom - # message, this will not work. + # this is the default pull request merge message. This works assuming that + # a repo does not squash commits merged_prs = [ int( commit.commit.message.replace('Merge pull request #', '').split(' from ')[0] @@ -69,6 +76,8 @@ def collect_contributors(repository, start_date, end_date): for commit in commits if commit.commit.message.startswith("Merge pull request") ] + # If the search above returned nothing, it's likely that the repo squashes + # commits when merging PRs. This is a different regex for that case. 
if not merged_prs: regex_pattern = '\(#.*\)' for commit in commits: @@ -185,6 +194,7 @@ def set_default(obj): print(" Visit GitHub's official documentation for more details.") sys.exit(1) repository = sys.argv[1] + repository_name = sys.argv[1].split('/')[1] try: start = sys.argv[2].split('-') year = int(start[0]) @@ -215,6 +225,9 @@ def set_default(obj): except: print("Ensure that the end date is in YYYY-MM-DD format.") sys.exit(1) + json_filename = f"contributors-{repository_name}-{sys.argv[2]}-{sys.argv[3]}.json" + if os.path.isfile(json_filename): + raise FileExistsError(f'ERROR: The file {json_filename} already exists!') print('BEGIN DATA COLLECTION... (this can take some time)') tic = perf_counter() contrib_info, author_name_map, tags_only = collect_contributors( @@ -228,7 +241,6 @@ def set_default(obj): print("\nOnly GitHub handles are available for the following contributors:") for tag in tags_only: print(tag) - json_filename = f"contributors-{sys.argv[2]}-{sys.argv[3]}.json" with open(json_filename, 'w') as file: json.dump(contrib_info, file, default=set_default) print(f"\nDetailed information can be found in {json_filename}.")