This repository was archived by the owner on Aug 4, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape.py
111 lines (95 loc) · 3.63 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
from github import Github, RateLimitExceededException, GithubException
import private
from base64 import b64decode
from time import sleep, time
import re
from db_utilities import add_data, read_data, check_db, read_last_insert
target_language = input("The language that you'd like to look for: ")
gcontext = Github(private.GITHUB_API_KEY)
sort_list = ['stars', 'forks']
sort_index = 0
repos = gcontext.search_repositories(query='language:' + target_language)
current_repo = 0
debug = 0
# Searches for a repo (constricted by the query)
def poll_repo():
global current_repo, repos, sort_index
if debug:
print("Searching for a new repo...")
try:
if(current_repo == 999):
sort_index += 1
repos = gcontext.search_repositories(query='language:' + target_language, sort=sort_list[sort_index])
current_repo = 0
print("Searched all... restarting.")
except RateLimitExceededException:
if debug:
print("sleeping: " + str(seconds_till_reset))
sleep(seconds_till_reset)
repos = gcontext.search_repositories(query='language:' + target_language)
current_repo = 0
print("Searched all... restarting.")
except IndexError:
print("Done.")
exit()
try: # Try to get a repo
print(current_repo)
target_repo = repos[current_repo]
if debug:
print("Repo found: " + target_repo.full_name)
except RateLimitExceededException:
seconds_till_reset = gcontext.rate_limiting_resettime - time()
if debug:
print("sleeping: " + str(seconds_till_reset))
sleep(seconds_till_reset)
target_repo = repos[current_repo]
if debug:
print("Repo found: " + target_repo.full_name)
current_repo += 1
return target_repo
# Uses regex to divide the full name into a username and a repo name
def divide_full_name(full_name):
# Uses regex to seperate the username and repo name
phrase = re.match('(.*)(\/)(.*)', full_name)
user = phrase.group(1)
proj_name = phrase.group(3)
if debug:
print("User: " + user)
print("Name: " + proj_name)
print("")
return user, proj_name
# Gets some info from the repo
def scrape_repo(repo):
try:
stars = repo.watchers # This is misleading, it is actually the number of stars
contrib = repo.get_contributors().totalCount
language = repo.language
forks = repo.forks_count
if debug:
print("Stars: " + str(stars))
print("Contrib: " + str(contrib))
print("lanuage: " + str(language))
print("Forks: " + str(forks))
except RateLimitExceededException:
seconds_till_reset = gcontext.rate_limiting_resettime - time()
if debug:
print("sleeping: " + str(seconds_till_reset))
sleep(seconds_till_reset)
stars = repo.watchers # This is misleading, it is actually the number of stars
contrib = repo.get_contributors().totalCount
language = repo.language
forks = repo.forks_count
return stars, contrib, language, forks
# Continually scrape repos and add them to the database
if __name__ == "__main__":
user_id = 1
while(True):
target_repo = poll_repo()
stars, contrib, language, forks = scrape_repo(target_repo)
user, proj_name = divide_full_name(target_repo.full_name)
if not check_db(proj_name, user):
add_data(proj_name, user, language, stars, contrib, forks)
read_last_insert()
user_id += 1
else:
print(proj_name + " by: " + user + " is already in the DB")