Skip to content

Commit

Permalink
Merge pull request #33 from OSLL/pyGitAPI
Browse files Browse the repository at this point in the history
Python github API.
  • Loading branch information
BurnedScarecrow authored Apr 7, 2021
2 parents 43bacb3 + 7b2599f commit c12b505
Show file tree
Hide file tree
Showing 6 changed files with 195 additions and 40 deletions.
16 changes: 11 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,6 @@

- Python PEP8 style ([guide](https://www.python.org/dev/peps/pep-0008/))

### 1.2. Taskboard

- Trello [invite](https://trello.com/invite/b/sovrr5dJ/afd614ed4dc319c14986e1792b53d896/identifying-plagiarism-in-source-code)

## 2. Install requirements

- python version 3.6+ or even 3.8+
Expand All @@ -32,7 +28,7 @@
```
$ python3 test/cplag -v
```
- Testing for python analyzer
- Testing for python analyzer (Temporarily not working)
> Test of pyplag functions
```
$ python3 test/pyplag
Expand All @@ -46,10 +42,20 @@
## 4. Work with analyzers

- python analyzer
> Compare all files in folder
```
$ python3 src/pyplag <path/to/folder/with/py/files>
```
> Compare file in folder with files in github repositories
```
$ python3 src/pyplag <path/to/file/which/compare> <reg_exp>
```
> Compare a file given by a GitHub link (starting with https://) with files in GitHub repositories
```
$ python3 src/pyplag <link/to/file/which/compare> <reg_exp>
```
- C++/C analyzer
> Compare all files in folder
```
$ python3 src/cplag <path/to/folder/with/cpp/or/cc/files>
```
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@ ccsyspath==1.1.0
clang==11.0
libclang==10.0.1.0
numba==0.52.0
colorama==0.3.9
colorama==0.3.9
termcolor==1.1.0
2 changes: 1 addition & 1 deletion src/github_helper/const.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# Token sent in the Authorization header of GitHub API requests
ACCESS_TOKEN = ''
# Account name substituted into the GitHub API repository-listing URL
OWNER = ''
# Headers passed with every request made by the github_helper utilities
HEADERS = {'Authorization': f"Token {ACCESS_TOKEN}"}
83 changes: 73 additions & 10 deletions src/github_helper/utils.py
Original file line number Diff line number Diff line change
@@ -1,68 +1,102 @@
import requests
import numpy as np
import base64
import re

from src.github_helper.const import HEADERS, OWNER
from termcolor import colored


def get_list_of_repos():
    '''
    Function return list of repos which belongs to organization
    defined in const.py in field OWNER and list of urls to this repos
    Also required to define ACCESS_TOKEN
    If the API answers with an error message, the message is printed
    in red and the program exits
    '''
    repos = []
    repos_url = []

    page_num = 1
    while True:
        # Results are paginated; request up to 100 repositories per page
        url = (f"https://api.github.com/orgs/{OWNER}" +
               f"/repos?page={page_num}&per_page=100")
        page = requests.get(url, headers=HEADERS).json()

        # An error is reported as a JSON object carrying a 'message' field
        if isinstance(page, dict) and 'message' in page:
            print()
            print(colored(page['message'], 'red'))
            print()
            exit()

        # An empty page means every repository has been listed
        if not page:
            break

        for repo in page:
            repos.append(repo['name'])
            repos_url.append(repo['url'])

        page_num += 1

    return repos, repos_url


def get_python_files_links(start_link):
    '''
    Function return list of urls to all python files in repository
    start_link: str - link to repository contents in api.github.com
    If the API answers with an error message, the link and the message
    are printed and the program exits
    '''
    url_links = []
    req = requests.get(start_link, headers=HEADERS).json()

    if isinstance(req, dict):
        if 'message' in req:
            print()
            print(start_link)
            print(colored(req['message'], 'red'))
            print()
            exit()
    elif isinstance(req, list):
        for el in req:
            # Non-empty entry named "<something>.py" is collected as is
            if (el['size'] != 0 and el['name'].endswith('.py') and
                    len(el['name']) > 3):
                url_links.append(el['url'])
                continue
            # Entries reported with size 0 are descended into recursively
            # NOTE(review): presumably these are directories - confirm
            if el['size'] == 0:
                url_links.extend(get_python_files_links(el['url']))

    return url_links


def get_code(link):
    '''
    Function return code of python file which located at the link
    link: str - link to python file in api.github.com
    If the API answers with an error message, the link and the message
    are printed and the program exits
    '''
    req = requests.get(link, headers=HEADERS).json()

    # An error is reported as a JSON object carrying a 'message' field
    if isinstance(req, dict) and 'message' in req:
        print()
        print(link)
        print(colored(req['message'], 'red'))
        print()
        exit()

    # The 'content' field of the response is base64-decoded to get the text
    file_bytes = base64.b64decode(req['content'])
    file_str = file_bytes.decode('utf-8')

    return file_str


def select_repos(repos, repos_url, reg_exp):
'''
Function helps to choose needed repositories using regular expression
repos: list - names of repositories
repos_url: list -links to repositories
reg_exp: str - regular expression
'''

upprooved_repos = []
upprooved_repos_links = []
for repo, repo_url in zip(repos, repos_url):
Expand All @@ -71,3 +105,32 @@ def select_repos(repos, repos_url, reg_exp):
upprooved_repos_links.append(repo_url)

return upprooved_repos, upprooved_repos_links


def get_github_api_link(link):
    '''
    Function converting github link, which pointing on python
    file in repository
    (.../github.com/<owner>/<repo>/blob/<branch>/<path/to/file>),
    to github api "contents" link
    Branch names which contain "/" are not supported: only the first
    segment after "blob" is taken as the branch name
    link: str - link to file
    Returns the api link, or None if the link can not be converted
    '''

    parts = link.split('/')

    if len(parts) < 8:
        return None

    # Owner and repository name follow right after the host segment
    try:
        start_ind = parts.index("github.com") + 1
    except ValueError:
        return None

    # Owner, repo, "blob", branch and at least one path segment are needed
    if len(parts) < start_ind + 5:
        return None

    owner_and_repo = parts[start_ind:start_ind + 2]
    branch = parts[start_ind + 3]
    path_in_repo = parts[start_ind + 4:]

    complete_link = 'https://api.github.com/repos/'
    complete_link += '/'.join(owner_and_repo + ['contents'] + path_in_repo)
    complete_link += '?ref={}'.format(branch)

    return complete_link
92 changes: 71 additions & 21 deletions src/pyplag/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,23 @@
import ast
import os
import sys
import datetime
import numpy as np
import pandas as pd
pd.options.display.float_format = '{:,.2%}'.format

from time import perf_counter
# from src.pyplag.tree import *
from src.pyplag.tree import ASTFeatures, get_AST
from src.pyplag.metric import nodes_metric, run_compare
from src.pyplag.metric import op_shift_metric, get_children_ind
from src.pyplag.metric import run_compare, get_children_ind
from src.github_helper.utils import get_list_of_repos, select_repos
from src.github_helper.utils import get_python_files_links, get_code
from src.github_helper.utils import get_github_api_link
from termcolor import colored
# from src.pyplag.metric import *

pd.options.display.float_format = '{:,.2%}'.format


def print_compare_res(metrics, total_similarity, best_shift,
def print_compare_res(metrics, total_similarity, best_shift,
matrix, struct1, struct2, to_names1, to_names2,
filename1, filename2):
ch_inds1, count_of_children1 = get_children_ind(struct1, len(struct1))
Expand Down Expand Up @@ -55,6 +56,7 @@ def print_compare_res(metrics, total_similarity, best_shift,
# 0 mode works with GitHub repositories
# 1 mode works with directory in user computer


directory = 'py/'
if len(sys.argv) > 2:
file_path = sys.argv[1]
Expand All @@ -69,17 +71,26 @@ def print_compare_res(metrics, total_similarity, best_shift,
elif len(sys.argv) == 1:
exit()

tree1 = None
start_eval = perf_counter()
weights = np.array([1.5, 0.8, 0.9, 0.5, 0.3], dtype=np.float32)
if mode == 0:
try:
with open(file_path) as f:
tree1 = ast.parse(f.read())
except PermissionError:
print("File denied.")
exit()
except FileNotFoundError:
print("File not found")
if file_path.startswith('https://'):
file_link = get_github_api_link(file_path)
try:
tree1 = ast.parse(get_code(file_link))
except Exception as e:
print('-' * 40)
print(colored('Not compiled: ' + file_link, 'red'))
print(colored(e.__class__.__name__, 'red'))
for el in e.args:
print(colored(el, 'red'))
print('-' * 40)
exit()
else:
tree1 = get_AST(file_path)

if tree1 is None:
exit()

features1 = ASTFeatures()
Expand All @@ -90,15 +101,44 @@ def print_compare_res(metrics, total_similarity, best_shift,
repos, repos_url = select_repos(repos, repos_url, reg_exp)
count_iter = len(repos)
for repo_url in repos_url:
print(repo_url)
url_files_in_repo = get_python_files_links(repo_url + '/contents')
inner_iter = 0
inner_iters = len(url_files_in_repo)
for url_file in url_files_in_repo:
try:
tree2 = ast.parse(get_code(url_file))
except:
print('Not compiled: ', url_file)
except IndentationError as err:
print('-' * 40)
print(colored('Not compiled: ' + url_file, 'red'))
print(colored('IdentationError: ' + err.args[0], 'red'))
print(colored('In line ' + str(err.args[1][1]), 'red'))
print('-' * 40)
continue
except SyntaxError as err:
print('-' * 40)
print(colored('Not compiled: ' + url_file, 'red'))
print(colored('SyntaxError: ' + err.args[0], 'red'))
print(colored('In line ' + str(err.args[1][1]), 'red'))
print(colored('In column ' + str(err.args[1][2]), 'red'))
print('-' * 40)
continue
except TabError as err:
print('-' * 40)
print(colored('Not compiled: ' + url_file, 'red'))
print(colored('TabError: ' + err.args[0], 'red'))
print(colored('In line ' + str(err.args[1][1]), 'red'))
print('-' * 40)
continue
except Exception as e:
print('-' * 40)
print(colored('Not compiled: ' + url_file, 'red'))
print(colored(e.__class__.__name__, 'red'))
for el in e.args:
print(colored(el, 'red'))
print('-' * 40)
continue

features2 = ASTFeatures()
features2.visit(tree2)
metrics, best_shift, matrix = run_compare(features1.structure,
Expand All @@ -115,15 +155,22 @@ def print_compare_res(metrics, total_similarity, best_shift,

if total_similarity > 0.72:
print_compare_res(metrics, total_similarity, best_shift,
matrix, features1.structure,
features2.structure, features1.from_num,
features2.from_num, file_path.split('\\')[-1],
matrix,
features1.structure,
features2.structure,
features1.from_num,
features2.from_num,
file_path.split('\\')[-1],
url_file)

inner_iter += 1
print('In repo {:.2%}, In repos {:.2%}'.format(inner_iter / inner_iters,
iteration / count_iter), end="\r")
print('In repo {:.2%}, In repos {:.2%}'.format((inner_iter /
inner_iters),
(iteration /
count_iter)),
end="\r")
iteration += 1
print(repo_url, " ... OK")
print(" " * 40, end="\r")
print('In repos {:.2%}'.format(iteration / count_iter), end="\r")
elif mode == 1:
Expand All @@ -150,7 +197,10 @@ def print_compare_res(metrics, total_similarity, best_shift,

tree1 = get_AST(filename)
tree2 = get_AST(filename2)
if tree1 is None or tree2 is None:

if tree1 is None:
break
if tree2 is None:
continue

features1 = ASTFeatures()
Expand Down
Loading

0 comments on commit c12b505

Please sign in to comment.