Skip to content

Commit

Permalink
feat: unite reports in the csv file
Browse files Browse the repository at this point in the history
- Save check reports in a CSV file;
- Now uses ruff instead of flake8;
- Bumped library versions for the time-survey script;
- More typing added.

Refs: #176, #158.
  • Loading branch information
Artanias authored Sep 21, 2023
1 parent 8db7da1 commit 612b5ab
Show file tree
Hide file tree
Showing 27 changed files with 559 additions and 192 deletions.
9 changes: 0 additions & 9 deletions .flake8

This file was deleted.

6 changes: 3 additions & 3 deletions .github/workflows/check_n_push_image.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,12 @@ jobs:
with:
python-version: 3.8

- name: Lint with flake8, isort and pyright
- name: Lint with ruff and pyright
run: |
make substitute-sources
pip install $(python3 setup.py --install-requirements)
pip install -r docs/notebooks/requirements.txt
pip install pre-commit==2.20.0
pip install --requirement docs/notebooks/requirements.txt
pip install pre-commit==3.4.0
make pre-commit
docker-build-test-autotest:
Expand Down
17 changes: 4 additions & 13 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,19 +1,10 @@
default_language_version:
python: python3.8
repos:
- repo: https://github.com/PyCQA/isort
rev: 5.12.0
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.0.287
hooks:
- id: isort
- repo: https://github.com/PyCQA/flake8
rev: 5.0.4
hooks:
- id: flake8
additional_dependencies:
- flake8-bugbear==22.8.23
- flake8-comprehensions==3.10.0
- flake8-simplify==0.19.3
- mccabe==0.7.0
- id: ruff
- repo: local
hooks:
- id: pyright
Expand All @@ -22,4 +13,4 @@ repos:
language: node
pass_filenames: false
types: [ python ]
additional_dependencies: [ 'pyright@1.1.274' ]
additional_dependencies: [ 'pyright@1.1.305' ]
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
UTIL_VERSION := 0.3.5
UTIL_VERSION := 0.3.6
UTIL_NAME := codeplag
PWD := $(shell pwd)

Expand Down
10 changes: 5 additions & 5 deletions docs/notebooks/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
matplotlib~=3.6.1
numpy~=1.23.1
pandas~=1.4.3
python-decouple~=3.6
scipy~=1.9.3
matplotlib~=3.7.3
numpy~=1.23.5
pandas~=2.0.3
python-decouple~=3.8
scipy~=1.10.1
118 changes: 90 additions & 28 deletions docs/notebooks/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,38 +2,41 @@
import re
from datetime import datetime
from time import perf_counter
from typing import Literal, Optional

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from decouple import Config, RepositoryEnv
from scipy.optimize import curve_fit

from codeplag.algorithms.featurebased import counter_metric, struct_compare
from codeplag.algorithms.stringbased import gst
from codeplag.algorithms.tokenbased import value_jakkar_coef
from codeplag.pyplag.utils import get_ast_from_content, get_features_from_ast
from decouple import Config, RepositoryEnv
from scipy.optimize import curve_fit
from webparsers.github_parser import GitHubParser


def square_func(x, a, b, c):
def square_func(x: float, a: float, b: float, c: float) -> float:
return a * x**2 + b * x + c


def cube_func(x, a, b, c, d):
def cube_func(x: float, a: float, b: float, c: float, d: float) -> float:
return a * x**3 + b * x**2 + c * x + d


def quart_func(x, a, b, c, d, e):
def quart_func(x: float, a: float, b: float, c: float, d: float, e: float) -> float:
return a * x**4 + b * x**3 + c * x**2 + d * x + e


def remove_unnecessary_blank_lines(source_code):
def remove_unnecessary_blank_lines(source_code: str) -> str:
pattern = r"\n+"
return re.sub(pattern, "\n", source_code)


def get_data_from_dir(path='./data', max_count_lines=None):
def get_data_from_dir(
path: str = './data',
max_count_lines: Optional[int] = None
) -> pd.DataFrame:
df = pd.DataFrame()
for filename in os.listdir(path):
if not re.search(r'.csv$', filename):
Expand All @@ -48,7 +51,7 @@ def get_data_from_dir(path='./data', max_count_lines=None):
return df


def save_works_from_repo_url(url, check_policy=True):
def save_works_from_repo_url(url: str, check_policy: bool = True) -> None:
current_repo_name = url.split('/')[-1]
env_config = Config(RepositoryEnv('../../.env'))
gh = GitHubParser(
Expand All @@ -57,7 +60,7 @@ def save_works_from_repo_url(url, check_policy=True):
access_token=env_config.get('ACCESS_TOKEN')
)
files = list(gh.get_files_generator_from_repo_url(url))
files = [(remove_unnecessary_blank_lines(file[0]), file[1]) for file in files]
files = [(remove_unnecessary_blank_lines(file.code), file.link) for file in files]

df = pd.DataFrame(
{
Expand All @@ -66,18 +69,25 @@ def save_works_from_repo_url(url, check_policy=True):
'extension': ['py'] * (len(files) - 1),
'repo_name': [current_repo_name] * (len(files) - 1),
'content_len': [len(file_[0]) for file_ in files[:-1]],
'content_len_without_blank': [len(file_[0].replace(' ', '').replace('\n', '').replace('\t', '')) for file_ in files[:-1]],
'count_lines_without_blank_lines': [len(file_[0].splitlines()) for file_ in files[:-1]]
'content_len_without_blank': [
len(file_[0].replace(' ', '').replace('\n', '').replace('\t', ''))
for file_ in files[:-1]
],
'count_lines_without_blank_lines': [
len(file_[0].splitlines()) for file_ in files[:-1]
]
}
)
df = df[df['count_lines_without_blank_lines'] > 5]
df.to_csv(os.path.join('./data/', current_repo_name + '.csv'), sep=';')


def get_time_to_meta(df, iterations=10):
def get_time_to_meta(df: pd.DataFrame, iterations: int = 10) -> pd.DataFrame:
count_lines = []
to_meta_time = []
for (index, content) in df[['content', 'link', 'count_lines_without_blank_lines']].iterrows():
for (index, content) in df[
['content', 'link', 'count_lines_without_blank_lines']
].iterrows():
print(index, " " * 20, end='\r')
for _ in range(iterations):
tree = get_ast_from_content(content[0], content[1])
Expand All @@ -102,8 +112,14 @@ def get_time_to_meta(df, iterations=10):
return output


def plot_and_save_result(df, xlabel, ylabel, title, what,
trend='linear'):
def plot_and_save_result(
df: pd.DataFrame,
xlabel: str,
ylabel: str,
title: str,
what: str,
trend: Literal['linear', 'n^2', 'n^3', 'n^4'] = 'linear'
) -> None:
# Simple Moving average
unique_count_lines = np.unique(df.count_lines)
mean_times = []
Expand All @@ -122,19 +138,50 @@ def plot_and_save_result(df, xlabel, ylabel, title, what,
if trend == 'linear':
z = np.polyfit(unique_count_lines, mean_times, 1)
p = np.poly1d(z)
plt.plot(unique_count_lines, p(unique_count_lines), "r--", label='Линейный тренд.')
plt.plot(
unique_count_lines, p(unique_count_lines), "r--", label='Линейный тренд.'
)
elif trend == 'n^2':
popt_cons, _ = curve_fit(square_func, unique_count_lines, mean_times, bounds=([-np.inf, 0., 0.], [np.inf, 0.1 ** 100, 0.1 ** 100]))
popt_cons, _ = curve_fit(
square_func,
unique_count_lines,
mean_times,
bounds=([-np.inf, 0., 0.], [np.inf, 0.1 ** 100, 0.1 ** 100])
)
p = np.poly1d(popt_cons)
plt.plot(unique_count_lines, p(unique_count_lines), "r--", label='Квадратичный тренд.')
plt.plot(
unique_count_lines,
p(unique_count_lines),
"r--", label='Квадратичный тренд.'
)
elif trend == 'n^3':
popt_cons, _ = curve_fit(cube_func, unique_count_lines, mean_times, bounds=([-np.inf, 0., 0., 0.], [np.inf, 0.1 ** 100, 0.1 ** 100, 0.1 ** 100]))
popt_cons, _ = curve_fit(
cube_func,
unique_count_lines,
mean_times,
bounds=([-np.inf, 0., 0., 0.], [np.inf, 0.1 ** 100, 0.1 ** 100, 0.1 ** 100])
)
p = np.poly1d(popt_cons)
plt.plot(unique_count_lines, p(unique_count_lines), "r--", label='Кубический тренд.')
plt.plot(
unique_count_lines,
p(unique_count_lines),
"r--",
label='Кубический тренд.'
)
elif trend == 'n^4':
popt_cons, _ = curve_fit(quart_func, unique_count_lines, mean_times, bounds=([-np.inf, 0., 0., 0., 0.], [np.inf, 0.1 ** 100, 0.1 ** 100, 0.1 ** 100, 0.1 ** 100]))
popt_cons, _ = curve_fit(
quart_func,
unique_count_lines,
mean_times,
bounds=(
[-np.inf, 0., 0., 0., 0.],
[np.inf, 0.1 ** 100, 0.1 ** 100, 0.1 ** 100, 0.1 ** 100]
)
)
p = np.poly1d(popt_cons)
plt.plot(unique_count_lines, p(unique_count_lines), "r--", label='n^4.')
else:
raise Exception(f"Incorrect tred '{trend}'.")

rolling = pd.DataFrame(
{
Expand All @@ -143,24 +190,40 @@ def plot_and_save_result(df, xlabel, ylabel, title, what,
}
)
num_window = 20
plt.plot(rolling.unique_count_lines, rolling.mean_times.rolling(window=num_window).mean(), label=f'Скользящее среднее по {num_window}ти замерам.')
plt.plot(
rolling.unique_count_lines,
rolling.mean_times.rolling(window=num_window).mean(),
label=f'Скользящее среднее по {num_window}ти замерам.'
)

plt.ylabel(ylabel, fontsize=15)
plt.xlabel(xlabel, fontsize=15)
plt.title(title, fontsize=17)
plt.legend(loc='upper left')
plt.savefig('./graphics/need_time_{}_{}.png'.format(what, datetime.now().strftime("%d%m%Y_%H%M%S")))
plt.savefig(
'./graphics/need_time_{}_{}.png'.format(
what,
datetime.now().strftime("%d%m%Y_%H%M%S")
)
)


def get_time_algorithms(df, work, iterations=5, metric='fast'):
def get_time_algorithms(
df: pd.DataFrame,
work,
iterations: int = 5,
metric: Literal['fast', 'gst', 'structure'] = 'fast'
) -> pd.DataFrame:
count_lines = []
times = []
tree1 = get_ast_from_content(work.content, work.link)
if tree1 is None:
raise Exception("Unexpected error when parsing first work.")

features1 = get_features_from_ast(tree1, work.link)
for (index, content) in df[['content', 'link', 'count_lines_without_blank_lines']].iterrows():
for (index, content) in df[
['content', 'link', 'count_lines_without_blank_lines']
].iterrows():
for _ in range(iterations):
print(index, " " * 20, end='\r')
tree2 = get_ast_from_content(content[0], content[1])
Expand Down Expand Up @@ -190,8 +253,7 @@ def get_time_algorithms(df, work, iterations=5, metric='fast'):
end = perf_counter() - start
times.append(end)
else:
print('Incorrect metric!')
return 1
raise Exception('Incorrect metric!')

count_lines.append(content[2])

Expand Down
22 changes: 18 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,8 +1,22 @@
[tool.isort]
profile = "black"
[project]
requires-python = ">=3.8"

[tool.black]
target-version = ["py38"]
[tool.ruff]
line-length = 97
select = [
"F", # pyflakes
"E", # pycodestyle Error
"W", # pycodestyle Warning
"I", # isort
"B", # flake8-bugbear
"C4", # flake8-comprehensions
"SIM", # flake8-simplify
# "ERA", # eradicate
"C90", # mccabe
]

[tool.ruff.mccabe]
max-complexity = 13

[tool.pyright]
pythonVersion = "3.8"
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

INSTALL_REQUIREMENTS = [
'argcomplete~=2.0.0',
'numpy~=1.23.1',
'numpy~=1.23.5',
'pandas~=1.4.3',
'ccsyspath~=1.1.0',
'clang~=16.0.1.1',
Expand Down
4 changes: 2 additions & 2 deletions src/codeplag/algorithms/featurebased.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,15 @@ def counter_metric(counter1: dict, counter2: dict) -> float:
return 1.0

percent_of_same = [0, 0]
for key in counter1.keys():
for key in counter1:
if key not in counter2:
percent_of_same[1] += counter1[key]
continue
percent_of_same[0] += min(counter1[key],
counter2[key])
percent_of_same[1] += max(counter1[key],
counter2[key])
for key in counter2.keys():
for key in counter2:
if key not in counter1:
percent_of_same[1] += counter2[key]
continue
Expand Down
16 changes: 15 additions & 1 deletion src/codeplag/codeplagcli.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,16 @@
from pathlib import Path
from typing import List, Optional

from codeplag.consts import EXTENSION_CHOICE, MODE_CHOICE, UTIL_NAME, UTIL_VERSION
from webparsers.types import GitHubContentUrl

from codeplag.consts import (
EXTENSION_CHOICE,
MODE_CHOICE,
REPORTS_EXTENSION_CHOICE,
UTIL_NAME,
UTIL_VERSION,
)


class CheckUniqueStore(argparse.Action):
"""Checks that the list of arguments contains no duplicates, then stores"""
Expand Down Expand Up @@ -116,6 +123,13 @@ def __init__(self):
metavar="DIRECTORY",
type=DirPath,
)
settings_modify.add_argument(
"-re",
"--reports_extension",
help="Extension of saved report files.",
type=str,
choices=REPORTS_EXTENSION_CHOICE
)
settings_modify.add_argument(
"-sp",
"--show_progress",
Expand Down
Loading

0 comments on commit 612b5ab

Please sign in to comment.