feat: unite reports in the csv file

- Save check reports in a CSV file; - Use ruff instead of flake8; - Bump library versions for the time-survey script; - Add more type annotations. Refs: #176, #158.
OSLL · Sep 21, 2023 · 612b5ab · 612b5ab
1 parent 8db7da1
commit 612b5ab
Show file tree

Hide file tree

Showing 27 changed files with 559 additions and 192 deletions.
diff --git a/.flake8 b/.flake8
diff --git a/.github/workflows/check_n_push_image.yml b/.github/workflows/check_n_push_image.yml
@@ -28,12 +28,12 @@ jobs:
       with:
         python-version: 3.8
 
-    - name: Lint with flake8, isort and pyright
+    - name: Lint with ruff and pyright
       run: |
         make substitute-sources
         pip install $(python3 setup.py --install-requirements)
-        pip install -r docs/notebooks/requirements.txt
-        pip install pre-commit==2.20.0
+        pip install --requirement docs/notebooks/requirements.txt
+        pip install pre-commit==3.4.0
         make pre-commit
 
   docker-build-test-autotest:

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,19 +1,10 @@
 default_language_version:
     python: python3.8
 repos:
-  - repo: https://github.com/PyCQA/isort
-    rev: 5.12.0
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.0.287
     hooks:
-      - id: isort
-  - repo: https://github.com/PyCQA/flake8
-    rev: 5.0.4
-    hooks:
-      - id: flake8
-        additional_dependencies:
-          - flake8-bugbear==22.8.23
-          - flake8-comprehensions==3.10.0
-          - flake8-simplify==0.19.3
-          - mccabe==0.7.0
+      - id: ruff
   - repo: local
     hooks:
       - id: pyright
@@ -22,4 +13,4 @@ repos:
         language: node
         pass_filenames: false
         types: [ python ]
-        additional_dependencies: [ 'pyright@1.1.274' ]
+        additional_dependencies: [ 'pyright@1.1.305' ]
diff --git a/Makefile b/Makefile
@@ -1,4 +1,4 @@
-UTIL_VERSION            := 0.3.5
+UTIL_VERSION            := 0.3.6
 UTIL_NAME               := codeplag
 PWD                     := $(shell pwd)
 

diff --git a/docs/notebooks/requirements.txt b/docs/notebooks/requirements.txt
@@ -1,5 +1,5 @@
-matplotlib~=3.6.1
-numpy~=1.23.1
-pandas~=1.4.3
-python-decouple~=3.6
-scipy~=1.9.3
+matplotlib~=3.7.3
+numpy~=1.23.5
+pandas~=2.0.3
+python-decouple~=3.8
+scipy~=1.10.1
diff --git a/docs/notebooks/utils.py b/docs/notebooks/utils.py
@@ -2,38 +2,41 @@
 import re
 from datetime import datetime
 from time import perf_counter
+from typing import Literal, Optional
 
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
-from decouple import Config, RepositoryEnv
-from scipy.optimize import curve_fit
-
 from codeplag.algorithms.featurebased import counter_metric, struct_compare
 from codeplag.algorithms.stringbased import gst
 from codeplag.algorithms.tokenbased import value_jakkar_coef
 from codeplag.pyplag.utils import get_ast_from_content, get_features_from_ast
+from decouple import Config, RepositoryEnv
+from scipy.optimize import curve_fit
 from webparsers.github_parser import GitHubParser
 
 
-def square_func(x, a, b, c):
+def square_func(x: float, a: float, b: float, c: float) -> float:
     return a * x**2 + b * x + c
 
 
-def cube_func(x, a, b, c, d):
+def cube_func(x: float, a: float, b: float, c: float, d: float) -> float:
     return a * x**3 + b * x**2 + c * x + d
 
 
-def quart_func(x, a, b, c, d, e):
+def quart_func(x: float, a: float, b: float, c: float, d: float, e: float) -> float:
     return a * x**4 + b * x**3 + c * x**2 + d * x + e
 
 
-def remove_unnecessary_blank_lines(source_code):
+def remove_unnecessary_blank_lines(source_code: str) -> str:
     pattern = r"\n+"
     return re.sub(pattern, "\n", source_code)
 
 
-def get_data_from_dir(path='./data', max_count_lines=None):
+def get_data_from_dir(
+    path: str = './data',
+    max_count_lines: Optional[int] = None
+) -> pd.DataFrame:
     df = pd.DataFrame()
     for filename in os.listdir(path):
         if not re.search(r'.csv$', filename):
@@ -48,7 +51,7 @@ def get_data_from_dir(path='./data', max_count_lines=None):
     return df
 
 
-def save_works_from_repo_url(url, check_policy=True):
+def save_works_from_repo_url(url: str, check_policy: bool = True) -> None:
     current_repo_name = url.split('/')[-1]
     env_config = Config(RepositoryEnv('../../.env'))
     gh = GitHubParser(
@@ -57,7 +60,7 @@ def save_works_from_repo_url(url, check_policy=True):
         access_token=env_config.get('ACCESS_TOKEN')
     )
     files = list(gh.get_files_generator_from_repo_url(url))
-    files = [(remove_unnecessary_blank_lines(file[0]), file[1]) for file in files]
+    files = [(remove_unnecessary_blank_lines(file.code), file.link) for file in files]
 
     df = pd.DataFrame(
         {
@@ -66,18 +69,25 @@ def save_works_from_repo_url(url, check_policy=True):
             'extension': ['py'] * (len(files) - 1),
             'repo_name': [current_repo_name] * (len(files) - 1),
             'content_len': [len(file_[0]) for file_ in files[:-1]],
-            'content_len_without_blank': [len(file_[0].replace(' ', '').replace('\n', '').replace('\t', '')) for file_ in files[:-1]],
-            'count_lines_without_blank_lines': [len(file_[0].splitlines()) for file_ in files[:-1]]
+            'content_len_without_blank': [
+                len(file_[0].replace(' ', '').replace('\n', '').replace('\t', ''))
+                for file_ in files[:-1]
+            ],
+            'count_lines_without_blank_lines': [
+                len(file_[0].splitlines()) for file_ in files[:-1]
+            ]
         }
     )
     df = df[df['count_lines_without_blank_lines'] > 5]
     df.to_csv(os.path.join('./data/', current_repo_name + '.csv'), sep=';')
 
 
-def get_time_to_meta(df, iterations=10):
+def get_time_to_meta(df: pd.DataFrame, iterations: int = 10) -> pd.DataFrame:
     count_lines = []
     to_meta_time = []
-    for (index, content) in df[['content', 'link', 'count_lines_without_blank_lines']].iterrows():
+    for (index, content) in df[
+        ['content', 'link', 'count_lines_without_blank_lines']
+    ].iterrows():
         print(index, " " * 20, end='\r')
         for _ in range(iterations):
             tree = get_ast_from_content(content[0], content[1])
@@ -102,8 +112,14 @@ def get_time_to_meta(df, iterations=10):
     return output
 
 
-def plot_and_save_result(df, xlabel, ylabel, title, what,
-                         trend='linear'):
+def plot_and_save_result(
+    df: pd.DataFrame,
+    xlabel: str,
+    ylabel: str,
+    title: str,
+    what: str,
+    trend: Literal['linear', 'n^2', 'n^3', 'n^4'] = 'linear'
+) -> None:
     # Simple Moving average
     unique_count_lines = np.unique(df.count_lines)
     mean_times = []
@@ -122,19 +138,50 @@ def plot_and_save_result(df, xlabel, ylabel, title, what,
     if trend == 'linear':
         z = np.polyfit(unique_count_lines, mean_times, 1)
         p = np.poly1d(z)
-        plt.plot(unique_count_lines, p(unique_count_lines), "r--", label='Линейный тренд.')
+        plt.plot(
+            unique_count_lines, p(unique_count_lines), "r--", label='Линейный тренд.'
+        )
     elif trend == 'n^2':
-        popt_cons, _ = curve_fit(square_func, unique_count_lines, mean_times, bounds=([-np.inf, 0., 0.], [np.inf, 0.1 ** 100, 0.1 ** 100]))
+        popt_cons, _ = curve_fit(
+            square_func,
+            unique_count_lines,
+            mean_times,
+            bounds=([-np.inf, 0., 0.], [np.inf, 0.1 ** 100, 0.1 ** 100])
+        )
         p = np.poly1d(popt_cons)
-        plt.plot(unique_count_lines, p(unique_count_lines), "r--", label='Квадратичный тренд.')
+        plt.plot(
+            unique_count_lines,
+            p(unique_count_lines),
+            "r--", label='Квадратичный тренд.'
+        )
     elif trend == 'n^3':
-        popt_cons, _ = curve_fit(cube_func, unique_count_lines, mean_times, bounds=([-np.inf, 0., 0., 0.], [np.inf, 0.1 ** 100, 0.1 ** 100, 0.1 ** 100]))
+        popt_cons, _ = curve_fit(
+            cube_func,
+            unique_count_lines,
+            mean_times,
+            bounds=([-np.inf, 0., 0., 0.], [np.inf, 0.1 ** 100, 0.1 ** 100, 0.1 ** 100])
+        )
         p = np.poly1d(popt_cons)
-        plt.plot(unique_count_lines, p(unique_count_lines), "r--", label='Кубический тренд.')
+        plt.plot(
+            unique_count_lines,
+            p(unique_count_lines),
+            "r--",
+            label='Кубический тренд.'
+        )
     elif trend == 'n^4':
-        popt_cons, _ = curve_fit(quart_func, unique_count_lines, mean_times, bounds=([-np.inf, 0., 0., 0., 0.], [np.inf, 0.1 ** 100, 0.1 ** 100, 0.1 ** 100, 0.1 ** 100]))
+        popt_cons, _ = curve_fit(
+            quart_func,
+            unique_count_lines,
+            mean_times,
+            bounds=(
+                [-np.inf, 0., 0., 0., 0.],
+                [np.inf, 0.1 ** 100, 0.1 ** 100, 0.1 ** 100, 0.1 ** 100]
+            )
+        )
         p = np.poly1d(popt_cons)
         plt.plot(unique_count_lines, p(unique_count_lines), "r--", label='n^4.')
+    else:
+        raise Exception(f"Incorrect tred '{trend}'.")
 
     rolling = pd.DataFrame(
         {
@@ -143,24 +190,40 @@ def plot_and_save_result(df, xlabel, ylabel, title, what,
         }
     )
     num_window = 20
-    plt.plot(rolling.unique_count_lines, rolling.mean_times.rolling(window=num_window).mean(), label=f'Скользящее среднее по {num_window}ти замерам.')
+    plt.plot(
+        rolling.unique_count_lines,
+        rolling.mean_times.rolling(window=num_window).mean(),
+        label=f'Скользящее среднее по {num_window}ти замерам.'
+    )
 
     plt.ylabel(ylabel, fontsize=15)
     plt.xlabel(xlabel, fontsize=15)
     plt.title(title, fontsize=17)
     plt.legend(loc='upper left')
-    plt.savefig('./graphics/need_time_{}_{}.png'.format(what, datetime.now().strftime("%d%m%Y_%H%M%S")))
+    plt.savefig(
+        './graphics/need_time_{}_{}.png'.format(
+            what,
+            datetime.now().strftime("%d%m%Y_%H%M%S")
+        )
+    )
 
 
-def get_time_algorithms(df, work, iterations=5, metric='fast'):
+def get_time_algorithms(
+    df: pd.DataFrame,
+    work,
+    iterations: int = 5,
+    metric: Literal['fast', 'gst', 'structure'] = 'fast'
+) -> pd.DataFrame:
     count_lines = []
     times = []
     tree1 = get_ast_from_content(work.content, work.link)
     if tree1 is None:
         raise Exception("Unexpected error when parsing first work.")
 
     features1 = get_features_from_ast(tree1, work.link)
-    for (index, content) in df[['content', 'link', 'count_lines_without_blank_lines']].iterrows():
+    for (index, content) in df[
+        ['content', 'link', 'count_lines_without_blank_lines']
+    ].iterrows():
         for _ in range(iterations):
             print(index, " " * 20, end='\r')
             tree2 = get_ast_from_content(content[0], content[1])
@@ -190,8 +253,7 @@ def get_time_algorithms(df, work, iterations=5, metric='fast'):
                 end = perf_counter() - start
                 times.append(end)
             else:
-                print('Incorrect metric!')
-                return 1
+                raise Exception('Incorrect metric!')
 
             count_lines.append(content[2])
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,8 +1,22 @@
-[tool.isort]
-profile = "black"
+[project]
+requires-python = ">=3.8"
 
-[tool.black]
-target-version = ["py38"]
+[tool.ruff]
+line-length = 97
+select = [
+    "F",    # pyflakes
+    "E",    # pycodestyle Error
+    "W",    # pycodestyle Warning
+    "I",    # isort
+    "B",    # flake8-bugbear
+    "C4",   # flake8-comprehensions
+    "SIM",  # flake8-simplify
+    # "ERA",  # eradicate
+    "C90",  # mccabe
+]
+
+[tool.ruff.mccabe]
+max-complexity = 13
 
 [tool.pyright]
 pythonVersion = "3.8"

diff --git a/setup.py b/setup.py
@@ -6,7 +6,7 @@
 
 INSTALL_REQUIREMENTS = [
     'argcomplete~=2.0.0',
-    'numpy~=1.23.1',
+    'numpy~=1.23.5',
     'pandas~=1.4.3',
     'ccsyspath~=1.1.0',
     'clang~=16.0.1.1',

diff --git a/src/codeplag/algorithms/featurebased.py b/src/codeplag/algorithms/featurebased.py
@@ -17,15 +17,15 @@ def counter_metric(counter1: dict, counter2: dict) -> float:
         return 1.0
 
     percent_of_same = [0, 0]
-    for key in counter1.keys():
+    for key in counter1:
         if key not in counter2:
             percent_of_same[1] += counter1[key]
             continue
         percent_of_same[0] += min(counter1[key],
                                   counter2[key])
         percent_of_same[1] += max(counter1[key],
                                   counter2[key])
-    for key in counter2.keys():
+    for key in counter2:
         if key not in counter1:
             percent_of_same[1] += counter2[key]
             continue

diff --git a/src/codeplag/codeplagcli.py b/src/codeplag/codeplagcli.py
@@ -6,9 +6,16 @@
 from pathlib import Path
 from typing import List, Optional
 
-from codeplag.consts import EXTENSION_CHOICE, MODE_CHOICE, UTIL_NAME, UTIL_VERSION
 from webparsers.types import GitHubContentUrl
 
+from codeplag.consts import (
+    EXTENSION_CHOICE,
+    MODE_CHOICE,
+    REPORTS_EXTENSION_CHOICE,
+    UTIL_NAME,
+    UTIL_VERSION,
+)
+
 
 class CheckUniqueStore(argparse.Action):
     """Checks that the list of arguments contains no duplicates, then stores"""
@@ -116,6 +123,13 @@ def __init__(self):
             metavar="DIRECTORY",
             type=DirPath,
         )
+        settings_modify.add_argument(
+            "-re",
+            "--reports_extension",
+            help="Extension of saved report files.",
+            type=str,
+            choices=REPORTS_EXTENSION_CHOICE
+        )
         settings_modify.add_argument(
             "-sp",
             "--show_progress",