Skip to content

Commit

Permalink
download manager improved to handle and cache all kinds of requests
Browse files Browse the repository at this point in the history
  • Loading branch information
pseusys committed Feb 13, 2023
1 parent c2f1fdc commit 887ff2b
Show file tree
Hide file tree
Showing 6 changed files with 361 additions and 362 deletions.
1 change: 1 addition & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ ADD main.py /main.py
ADD loc.py /loc.py
ADD make_bar_graph.py /make_bar_graph.py
ADD translation.json /translation.json
ADD download_manager.py /download_manager.py

ENV PATH "$PATH:/home/root/.npm-global/bin"

Expand Down
192 changes: 176 additions & 16 deletions download_manager.py
Original file line number Diff line number Diff line change
@@ -1,42 +1,202 @@
from typing import Awaitable, Dict, Callable, Optional, Tuple
from hashlib import md5
from json import dumps
from string import Template
from typing import Awaitable, Dict, Callable, Optional

from http3 import AsyncClient
from httpx import AsyncClient
from yaml import safe_load
from github import AuthenticatedUser


async def init_download_manager():
GITHUB_API_QUERIES = {
"repositories_contributed_to": """
{
user(login: "$username") {
repositoriesContributedTo(last: 100, includeUserRepositories: true) {
nodes {
isFork
name
owner {
login
}
}
}
}
}""",
"repository_committed_dates": """
{
repository(owner: "$owner", name: "$name") {
defaultBranchRef {
target {
... on Commit {
history(first: 100, author: { id: "$id" }) {
edges {
node {
committedDate
}
}
}
}
}
}
}
}""",
"user_repository_list": """
{
user(login: "$username") {
repositories(orderBy: {field: CREATED_AT, direction: ASC}, last: 100, affiliations: [OWNER, COLLABORATOR], isFork: false) {
edges {
node {
primaryLanguage {
name
}
name
owner {
login
}
}
}
}
}
}
""",
"repository_commit_list": """
{
repository(owner: "$owner", name: "$name") {
refs(refPrefix: "refs/heads/", orderBy: {direction: DESC, field: TAG_COMMIT_DATE}, first: 100) {
edges {
node {
... on Ref {
target {
... on Commit {
history(first: 100, author: { id: "$id" }) {
edges {
node {
... on Commit {
additions
deletions
committedDate
}
}
}
}
}
}
}
}
}
}
}
}
"""
}


async def init_download_manager(waka_key: str, github_key: str, user: AuthenticatedUser):
"""
Initialize download manager:
- Setup headers for GitHub GraphQL requests.
- Launch static queries in background.
:param waka_key: WakaTime API token.
:param github_key: GitHub API token.
:param user: GitHub current user info.
"""
await DownloadManager.load_remote_resources({
"linguist": ("https://cdn.jsdelivr.net/gh/github/linguist@master/lib/linguist/languages.yml", {})
"linguist": "https://cdn.jsdelivr.net/gh/github/linguist@master/lib/linguist/languages.yml",
"waka_latest": f"https://wakatime.com/api/v1/users/current/stats/last_7_days?api_key={waka_key}",
"waka_all": f"https://wakatime.com/api/v1/users/current/all_time_since_today?api_key={waka_key}",
"github_stats": f"https://github-contributions.vercel.app/api/v1/{user.login}"
}, {
"Authorization": f"Bearer {github_key}"
})


class DownloadManager:
_client = AsyncClient()
_REMOTE_RESOURCES = dict()
"""
Class for handling and caching all kinds of requests.
There considered to be two types of queries:
- Static queries: queries that don't require many arguments that should be executed once
Example: queries to WakaTime API or to GitHub linguist
- Dynamic queries: queries that require many arguments and should be executed multiple times
Example: GraphQL queries to GitHub API
DownloadManager launches all static queries asynchronously upon initialization and caches their results.
It also executes dynamic queries upon request and caches result.
"""
_client = AsyncClient(timeout=60.0)
_REMOTE_RESOURCES_CACHE = dict()

@staticmethod
async def load_remote_resources(resources: Dict[str, Tuple[str, Dict]]):
for resource, (url, params) in resources.items():
DownloadManager._REMOTE_RESOURCES[resource] = DownloadManager._client.get(url, **params)
async def load_remote_resources(resources: Dict[str, str], github_headers: Dict[str, str]):
"""
Prepare DownloadManager to launch GitHub API queries and launch all static queries.
:param resources: Dictionary of static queries, "IDENTIFIER": "URL".
:param github_headers: Dictionary of headers for GitHub API queries.
"""
for resource, url in resources.items():
DownloadManager._REMOTE_RESOURCES_CACHE[resource] = DownloadManager._client.get(url)
DownloadManager._client.headers = github_headers

This comment has been minimized.

Copy link
@anmol098

anmol098 Feb 25, 2023

Owner

Hi @pseusys, you are setting the header here with the GitHub token, and I can see this header is also being passed to APIs other than GitHub, which I think is a security concern.

This comment has been minimized.

Copy link
@pseusys

pseusys Feb 25, 2023

Author Collaborator

No, it's not: as you can see, all the other resources are queried before the headers parameter is being set, after this line only GitHub API calls are executed.

This comment has been minimized.

Copy link
@anmol098

anmol098 Feb 25, 2023

Owner

I have verified that the GitHub token was sent to WakaTime and linguist; I have fixed this in the last commit at #384.

This comment has been minimized.

Copy link
@pseusys

pseusys Feb 25, 2023

Author Collaborator

Please, revert the #381 changes from the branch, I'm unable to review the PR


@staticmethod
async def _get_remote_resource(resource: str, convertor: Optional[Callable[[bytes], str]]) -> Dict:
if isinstance(DownloadManager._REMOTE_RESOURCES[resource], Awaitable):
res = await DownloadManager._REMOTE_RESOURCES[resource]
async def _get_remote_resource(resource: str, convertor: Optional[Callable[[bytes], Dict]]) -> Dict:
"""
Receive execution result of static query, wait for it if necessary.
If the query wasn't cached previously, cache it.
NB! Caching is done before response parsing - to throw exception on accessing cached erroneous response.
:param resource: Static query identifier.
:param convertor: Optional function to convert `response.contents` to dict.
By default `response.json()` is used.
:return: Response dictionary.
"""
if isinstance(DownloadManager._REMOTE_RESOURCES_CACHE[resource], Awaitable):
res = await DownloadManager._REMOTE_RESOURCES_CACHE[resource]
DownloadManager._REMOTE_RESOURCES_CACHE[resource] = res
if res.status_code == 200:
if convertor is None:
DownloadManager._REMOTE_RESOURCES[resource] = res.json()
print(res.json())
return res.json()
else:
DownloadManager._REMOTE_RESOURCES[resource] = convertor(res.content)
return convertor(res.content)
else:
raise Exception(f"Query '{res.url}' failed to run by returning code of {res.status_code}: {res.json()}")
return DownloadManager._REMOTE_RESOURCES[resource]

@staticmethod
async def get_remote_json(resource: str) -> Dict:
"""
Shortcut for `_get_remote_resource` to return JSON response data.
:param resource: Static query identifier.
:return: Response JSON dictionary.
"""
return await DownloadManager._get_remote_resource(resource, None)

@staticmethod
async def get_remote_yaml(resource: str) -> Dict:
"""
Shortcut for `_get_remote_resource` to return YAML response data.
:param resource: Static query identifier.
:return: Response YAML dictionary.
"""
return await DownloadManager._get_remote_resource(resource, safe_load)

@staticmethod
async def get_remote_graphql(query: str, **kwargs) -> Dict:
"""
Execute GitHub GraphQL API query.
The queries are defined in `GITHUB_API_QUERIES`, all parameters should be passed as kwargs.
If the query wasn't cached previously, cache it. Cache query by its identifier + parameters hash.
NB! Caching is done before response parsing - to throw exception on accessing cached erroneous response.
Parse and return response as JSON.
:param query: Dynamic query identifier.
:param kwargs: Parameters for substitution of variables in dynamic query.
:return: Response JSON dictionary.
"""
key = f"{query}_{md5(dumps(kwargs, sort_keys=True).encode('utf-8')).digest()}"
if key not in DownloadManager._REMOTE_RESOURCES_CACHE:
res = await DownloadManager._client.post("https://api.github.com/graphql", json={
"query": Template(GITHUB_API_QUERIES[query]).substitute(kwargs)
})
DownloadManager._REMOTE_RESOURCES_CACHE[key] = res
else:
res = DownloadManager._REMOTE_RESOURCES_CACHE[key]
if res.status_code == 200:
return res.json()
else:
raise Exception(f"Query '{query}' failed to run by returning code of {res.status_code}: {res.json()}")
72 changes: 24 additions & 48 deletions loc.py
Original file line number Diff line number Diff line change
@@ -1,53 +1,44 @@
import re
import os
import base64
import requests
from github import Github, InputGitAuthor
from asyncio import sleep

from github import Github, InputGitAuthor, AuthenticatedUser
import datetime
from string import Template
import matplotlib.pyplot as plt
from io import StringIO, BytesIO
from dotenv import load_dotenv
import time

from download_manager import DownloadManager
from make_bar_graph import BarGraph


class LinesOfCode:

def __init__(self, id, username, ghtoken, repositoryData, ignored_repos):
self.id = id
self.username = username

def __init__(self, user: AuthenticatedUser, ghtoken, repositoryData, ignored_repos):
self.g = Github(ghtoken)
self.headers = {"Authorization": "Bearer " + ghtoken}
self.user = user
self.repositoryData = repositoryData
self.ignored_repos = ignored_repos

def calculateLoc(self):
async def calculateLoc(self):
result = self.repositoryData
yearly_data = {}
for repo in result['data']['user']['repositories']['edges']:
total = len(result['data']['user']['repositories']['edges'])
for ind, repo in enumerate(result['data']['user']['repositories']['edges']):
if repo['node']['name'] not in self.ignored_repos:
self.getCommitStat(repo['node'], yearly_data)
time.sleep(0.7)
print(f"{ind}/{total}", "Retrieving repo:", repo['node']["owner"]["login"], repo['node']['name'])
await self.getCommitStat(repo['node'], yearly_data)
await sleep(0.7)
return yearly_data

def plotLoc(self, yearly_data):
async def plotLoc(self, yearly_data):
graph = BarGraph(yearly_data)
graph.build_graph()
await graph.build_graph()
self.pushChart()

def run_query_v3(self, endPoint):
# print(endPoint)
request = requests.get(endPoint, headers=self.headers)
if request.status_code == 401:
raise Exception("Invalid token {}.".format(request.status_code))
elif request.status_code == 204:
return []
else:
return request.json()

def getQuarter(self, timeStamp):
month = datetime.datetime.fromisoformat(timeStamp).month
if month >= 1 and month <= 3:
Expand All @@ -59,45 +50,30 @@ def getQuarter(self, timeStamp):
elif month >= 10 and month <= 12:
return 4

def getCommitStat(self, repoDetails, yearly_data):
commitsURL = 'https://api.github.com/repos/' + repoDetails['nameWithOwner'] + '/commits'
filteredCommitsEndPoint = commitsURL + '?author=' + self.username
filteredCommitsResult = self.run_query_v3(filteredCommitsEndPoint)
# This ignores the error message you get when you try to list commits for an empty repository
if not type(filteredCommitsResult) == list:
async def getCommitStat(self, repoDetails, yearly_data):
commit_data = await DownloadManager.get_remote_graphql("repository_commit_list", owner=repoDetails["owner"]["login"], name=repoDetails['name'], id=self.user.node_id)

if commit_data["data"]["repository"] is None:
print("\tSkipping:", repoDetails['name'])
return
this_year = datetime.datetime.utcnow().year

for i in range(len(filteredCommitsResult)):
iso_date = filteredCommitsResult[i]["commit"]["author"]["date"]
date = re.search(r'\d+-\d+-\d+', iso_date).group(0)
for commit in [commit["node"] for branch in commit_data["data"]["repository"]["refs"]["edges"] for commit in branch["node"]["target"]["history"]["edges"]]:
date = re.search(r'\d+-\d+-\d+', commit["committedDate"]).group(0)
curr_year = datetime.datetime.fromisoformat(date).year
# if curr_year != this_year:

individualCommitEndPoint = commitsURL + '/' + filteredCommitsResult[i]["sha"]
individualCommitResult = self.run_query_v3(individualCommitEndPoint)

quarter = self.getQuarter(date)
if repoDetails['primaryLanguage'] is not None:

if repoDetails['primaryLanguage'] is not None:
if curr_year not in yearly_data:
yearly_data[curr_year] = {}
if quarter not in yearly_data[curr_year]:
yearly_data[curr_year][quarter] = {}
if repoDetails['primaryLanguage']['name'] not in yearly_data[curr_year][quarter]:
yearly_data[curr_year][quarter][repoDetails['primaryLanguage']['name']] = 0
yearly_data[curr_year][quarter][repoDetails['primaryLanguage']['name']] += (individualCommitResult["stats"]["additions"] - individualCommitResult["stats"]['deletions'])

# to find total
yearly_data[curr_year][quarter][repoDetails['primaryLanguage']['name']] += (commit["additions"] - commit["deletions"])

# if 'total' not in yearly_data[curr_year]:
# yearly_data[curr_year]['total']={}
# if repoDetails['primaryLanguage']['name'] not in yearly_data[curr_year]['total']:
# yearly_data[curr_year]['total'][repoDetails['primaryLanguage']['name']]=0
# yearly_data[curr_year]['total'][repoDetails['primaryLanguage']['name']]+=(result[i][1]+result[i][2])

def pushChart(self):
repo = self.g.get_repo(f"{self.username}/{self.username}")
repo = self.g.get_repo(f"{self.user.login}/{self.user.login}")
committer = InputGitAuthor('readme-bot', '41898282+github-actions[bot]@users.noreply.github.com')
with open('bar_graph.png', 'rb') as input_file:
data = input_file.read()
Expand Down
Loading

0 comments on commit 887ff2b

Please sign in to comment.