feat: Show time spent, predicted time left, and worker count in the progress output

Also adds the ability to change the number of workers.
OSLL · Mar 31, 2024 · e802258 · e802258
1 parent de098f6
commit e802258
Show file tree

Hide file tree

Showing 12 changed files with 80 additions and 44 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -6,7 +6,7 @@ repos:
     hooks:
       - id: black
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.0.287
+    rev: v0.3.4
     hooks:
       - id: ruff
   - repo: local

diff --git a/src/codeplag/algorithms/tokenbased.py b/src/codeplag/algorithms/tokenbased.py
@@ -6,7 +6,6 @@
 of two token sequences.
 """
 
-
 import math
 from typing import List, Literal, Sequence, Set, Tuple, Union, overload
 

diff --git a/src/codeplag/codeplagcli.py b/src/codeplag/codeplagcli.py
@@ -2,6 +2,7 @@
 This module consist the CLI of the codeplag util and
 necessary internal classes for it.
 """
+
 import argparse
 from pathlib import Path
 from typing import List, Optional
@@ -16,6 +17,7 @@
     REPORTS_EXTENSION_CHOICE,
     UTIL_NAME,
     UTIL_VERSION,
+    WORKERS_CHOICE,
 )
 
 
@@ -134,6 +136,13 @@ def __add_settings_path(self, subparsers: argparse._SubParsersAction) -> None:
             type=str,
             choices=LANGUAGE_CHOICE,
         )
+        settings_modify.add_argument(
+            "-w",
+            "--workers",
+            help="The maximum number of processes that can be used to compare works.",
+            type=int,
+            choices=WORKERS_CHOICE,
+        )
 
         # settings show
         settings_commands.add_parser(

diff --git a/src/codeplag/config.py b/src/codeplag/config.py
@@ -9,6 +9,7 @@
     DEFAULT_LANGUAGE,
     DEFAULT_REPORT_EXTENSION,
     DEFAULT_THRESHOLD,
+    DEFAULT_WORKERS,
 )
 from codeplag.logger import codeplag_logger as logger
 from codeplag.types import Settings
@@ -85,4 +86,5 @@ def write_settings_conf(settings: Settings) -> None:
     show_progress=0,
     reports_extension=DEFAULT_REPORT_EXTENSION,
     language=DEFAULT_LANGUAGE,
+    workers=DEFAULT_WORKERS,
 )
diff --git a/src/codeplag/consts.tmp.py b/src/codeplag/consts.tmp.py
@@ -1,6 +1,7 @@
+import os
 import re
 from pathlib import Path
-from typing import Dict, Final, List, Tuple
+from typing import Dict, Final, List, Tuple, get_args
 
 from codeplag.types import (
     Extension,
@@ -22,13 +23,16 @@
     "ru": Path("@LIB_PATH@/report_ru.templ"),
     "en": Path("@LIB_PATH@/report_en.templ"),
 }
+# =====
 
 # Default values
 DEFAULT_THRESHOLD: Final[Threshold] = 65
 DEFAULT_WEIGHTS: Final[Tuple[float, float, float, float]] = (1.0, 0.4, 0.4, 0.4)
 DEFAULT_LANGUAGE: Final[Language] = "en"
 DEFAULT_REPORT_EXTENSION: Final[ReportsExtension] = "csv"
 DEFAULT_GENERAL_REPORT_NAME: Final[str] = "report.html"
+DEFAULT_WORKERS: Final[int] = os.cpu_count() or 1
+# =============
 
 GET_FRAZE: Final[str] = "Getting works features from"
 
@@ -52,10 +56,16 @@
     "compliance_matrix",
 )
 
-MODE_CHOICE: Final[List[Mode]] = ["many_to_many", "one_to_one"]
-REPORTS_EXTENSION_CHOICE: Final[List[ReportsExtension]] = ["csv", "json"]
-EXTENSION_CHOICE: Final[List[Extension]] = ["py", "cpp"]
-LANGUAGE_CHOICE: Final[List[Language]] = ["en", "ru"]
+# Choices
+MODE_CHOICE: Final[Tuple[Mode, ...]] = get_args(Mode)
+REPORTS_EXTENSION_CHOICE: Final[Tuple[ReportsExtension, ...]] = get_args(
+    ReportsExtension
+)
+EXTENSION_CHOICE: Final[Tuple[Extension, ...]] = get_args(Extension)
+LANGUAGE_CHOICE: Final[Tuple[Language, ...]] = get_args(Language)
+WORKERS_CHOICE: Final[List[int]] = list(range(1, DEFAULT_WORKERS + 1))
+# =======
+
 ALL_EXTENSIONS: Final[Tuple[re.Pattern]] = (re.compile(r"\..*$"),)
 # Don't  checks changing values by key
 SUPPORTED_EXTENSIONS: Final[Dict[Extension, Extensions]] = {

diff --git a/src/codeplag/display.py b/src/codeplag/display.py
@@ -167,7 +167,7 @@ def __str__(self) -> str:
 class ComplexProgress(Progress):
     def __init__(self, iterations: int) -> None:
         super(ComplexProgress, self).__init__(iterations)
-        self.__internal_progresses: list[Progress] = []
+        self.__internal_progresses: List[Progress] = []
 
     def add_internal_progress(self, internal_iterations: int) -> None:
         if len(self.__internal_progresses) == self.iterations:
@@ -195,8 +195,3 @@ def __next__(self) -> float:
                 continue
             break
         return self.progress
-
-
-def print_progress_and_increase(progress: Progress) -> None:
-    print(f"{progress}.", end="\r")
-    next(progress)
diff --git a/src/codeplag/types.py b/src/codeplag/types.py
@@ -143,6 +143,7 @@ class Settings(TypedDict):
     reports_extension: ReportsExtension
     show_progress: Flag
     threshold: Threshold
+    workers: int
 
 
 class SameHead(NamedTuple):

diff --git a/src/codeplag/utils.py b/src/codeplag/utils.py
@@ -4,7 +4,7 @@
 import os
 import uuid
 from concurrent.futures import Future, ProcessPoolExecutor
-from datetime import datetime
+from datetime import datetime, timedelta
 from itertools import combinations
 from pathlib import Path
 from time import monotonic, perf_counter
@@ -37,7 +37,6 @@
     ComplexProgress,
     Progress,
     print_compare_result,
-    print_progress_and_increase,
 )
 from codeplag.getfeatures import AbstractGetter
 from codeplag.pyplag.utils import PyFeaturesGetter
@@ -321,6 +320,7 @@ def read_df(path: Path) -> pd.DataFrame:
     return pd.read_csv(path, sep=";", index_col=0, dtype=object)
 
 
+# TODO: Split this disaster class into smaller classes and functions
 class CodeplagEngine:
     def __init__(self, logger: logging.Logger, parsed_args: Dict[str, Any]) -> None:
         self.root: str = parsed_args.pop("root")
@@ -341,15 +341,18 @@ def __init__(self, logger: logging.Logger, parsed_args: Dict[str, Any]) -> None:
                 "reports_extension"
             )
             self.language: Language = parsed_args.pop("language")
+            self.workers: int = parsed_args.pop("workers")
         elif self.root == "report":
             self.path: Path = parsed_args.pop("path")
         else:
             settings_conf = read_settings_conf()
             self._set_features_getter(parsed_args)
 
+            self.progress: Optional[Progress] = None
             self.mode: Mode = parsed_args.pop("mode", "many_to_many")
             self.show_progress: Flag = settings_conf["show_progress"]
             self.threshold: int = settings_conf["threshold"]
+            self.workers: int = settings_conf["workers"]
 
             self.reports: Optional[Path] = settings_conf.get("reports")
             self.__reports_extension: ReportsExtension = settings_conf[
@@ -531,17 +534,37 @@ def _create_future_compare(
     ) -> Future:
         return executor.submit(compare_works, work1, work2, self.threshold)
 
+    def __print_pretty_progress_and_increase(self) -> None:
+        if self.progress is None:
+            return
+        time_spent_seconds = monotonic() - self.begin_time
+        time_spent = timedelta(seconds=int(time_spent_seconds))
+        current_progress = self.progress.progress
+        if current_progress != 0.0:
+            predicated_time_left = timedelta(
+                seconds=int(
+                    (1.0 - current_progress) / current_progress * time_spent_seconds
+                )
+            )
+        else:
+            predicated_time_left = "N/A"
+        print(
+            f"{self.progress}, "
+            f"{time_spent} time spent [predicted time left {predicated_time_left}], "
+            f"{self.workers} workers",
+            end="\r",
+        )
+        next(self.progress)
+
     def _do_step(
         self,
         executor: ProcessPoolExecutor,
         processing: List[ProcessingWorksInfo],
         work1: ASTFeatures,
         work2: ASTFeatures,
-        progress: Optional[Progress],
     ) -> None:
         if work1.filepath == work2.filepath:
-            if progress is not None:
-                print_progress_and_increase(progress)
+            self.__print_pretty_progress_and_increase()
             return
 
         work1, work2 = sorted([work1, work2])
@@ -554,8 +577,7 @@ def _do_step(
             )
             return
         self._handle_compare_result(work1, work2, metrics)
-        if progress is not None:
-            print_progress_and_increase(progress)
+        self.__print_pretty_progress_and_increase()
 
     def _handle_compare_result(
         self,
@@ -586,15 +608,13 @@ def _handle_compare_result(
     def _handle_completed_futures(
         self,
         processing: List[ProcessingWorksInfo],
-        progress: Optional[Progress],
     ):
         for proc_works_info in processing:
             metrics: CompareInfo = proc_works_info.compare_future.result()
             self._handle_compare_result(
                 proc_works_info.work1, proc_works_info.work2, metrics, save=True
             )
-            if progress is not None:
-                print_progress_and_increase(progress)
+            self.__print_pretty_progress_and_increase()
 
     def _settings_show(self) -> None:
         settings_config = read_settings_conf()
@@ -633,19 +653,17 @@ def __many_to_many_check(
         )
         works.extend(self.features_getter.get_from_users_repos(self.github_user))
 
-        progress = None
         if self.show_progress:
             count_works = len(works)
-            progress = Progress(calc_iterations(count_works))
-
-        with ProcessPoolExecutor() as executor:
+            self.progress = Progress(calc_iterations(count_works))
+        with ProcessPoolExecutor(max_workers=self.workers) as executor:
             processed: List[ProcessingWorksInfo] = []
             for i, work1 in enumerate(works):
                 for j, work2 in enumerate(works):
                     if i <= j:
                         continue
-                    self._do_step(executor, processed, work1, work2, progress)
-            self._handle_completed_futures(processed, progress)
+                    self._do_step(executor, processed, work1, work2)
+            self._handle_completed_futures(processed)
 
     def __one_to_one_check(
         self,
@@ -666,34 +684,30 @@ def __one_to_one_check(
                 ),
             ),
         )
-        complex_progress = None
         if self.show_progress:
             combined_elements = list(combined_elements)
             count_sequences = len(combined_elements)
-            complex_progress = ComplexProgress(
-                calc_iterations(count_sequences, self.mode)
-            )
+            self.progress = ComplexProgress(calc_iterations(count_sequences, self.mode))
         cases = combinations(combined_elements, r=2)
-        with ProcessPoolExecutor() as executor:
+        with ProcessPoolExecutor(max_workers=self.workers) as executor:
             processed: List[ProcessingWorksInfo] = []
             for case in cases:
                 first_sequence, second_sequence = case
-                if complex_progress is not None:
-                    complex_progress.add_internal_progress(
+                if self.progress is not None:
+                    assert isinstance(self.progress, ComplexProgress)
+                    self.progress.add_internal_progress(
                         len(first_sequence) * len(second_sequence)
                     )
                 for work1 in first_sequence:
                     for work2 in second_sequence:
-                        self._do_step(
-                            executor, processed, work1, work2, complex_progress
-                        )
-            self._handle_completed_futures(processed, complex_progress)
+                        self._do_step(executor, processed, work1, work2)
+            self._handle_completed_futures(processed)
 
     def __check(self) -> None:
         self.logger.debug(
             f"Mode: {self.mode}; Extension: {self.features_getter.extension}."
         )
-        begin_time = monotonic()
+        self.begin_time = monotonic()
         features_from_files = self.features_getter.get_from_files(self.files)
         features_from_gh_files = self.features_getter.get_from_github_files(
             self.github_files
@@ -704,7 +718,7 @@ def __check(self) -> None:
             self.__many_to_many_check(features_from_files, features_from_gh_files)
         elif self.mode == "one_to_one":
             self.__one_to_one_check(features_from_files, features_from_gh_files)
-        self.logger.debug(f"Time for all {monotonic() - begin_time:.2f}s")
+        self.logger.debug(f"Time for all {monotonic() - self.begin_time:.2f}s")
         self.logger.info("Ending searching for plagiarism ...")
 
     def _report_create(self) -> Literal[0, 1]:

diff --git a/src/webparsers/async_github_parser.py b/src/webparsers/async_github_parser.py
@@ -253,7 +253,7 @@ async def get_files_generator_from_sha_commit(
         response: Dict[str, Any] = await self.send_get_request(
             self.GIT_TREE, {"username": owner, "repo": repo, "sha": sha}
         )
-        tree: list[Dict[str, Any]] = response["tree"]
+        tree: List[Dict[str, Any]] = response["tree"]
         for node in tree:
             current_path = f"{path}/{node['path']}"
             full_link = f"{_GH_URL}{owner}/{repo}/blob/{branch.name}{current_path}"

diff --git a/src/webparsers/github_parser.py b/src/webparsers/github_parser.py
@@ -201,7 +201,7 @@ def get_files_generator_from_sha_commit(
     ) -> Iterator[WorkInfo]:
         api_url = f"/repos/{owner}/{repo}/git/trees/{sha}"
         jresponse: Dict[str, Any] = self.send_get_request(api_url).json()
-        tree: list[Dict[str, Any]] = jresponse["tree"]
+        tree: List[Dict[str, Any]] = jresponse["tree"]
         for node in tree:
             current_path = f"{path}/{node['path']}"
             full_link = f"{_GH_URL}{owner}/{repo}/blob/{branch.name}{current_path}"

diff --git a/test/unit/codeplag/test_config.py b/test/unit/codeplag/test_config.py
@@ -1,5 +1,6 @@
 import json
 import logging
+import os
 from pathlib import Path
 from typing import Optional
 from unittest.mock import MagicMock
@@ -115,6 +116,7 @@ def test_read_default_settings_conf(settings_config: Optional[Settings]):
                 "show_progress": 0,
                 "reports_extension": "csv",
                 "language": "en",
+                "workers": os.cpu_count() or 1,
             },
         ],
         [
@@ -124,13 +126,15 @@ def test_read_default_settings_conf(settings_config: Optional[Settings]):
                 "show_progress": 1,
                 "reports_extension": "json",
                 "language": "ru",
+                "workers": 128,
             },
             {
                 "threshold": 99,
                 "environment": Path("/home/bukabyka/.env"),
                 "show_progress": 1,
                 "reports_extension": "json",
                 "language": "ru",
+                "workers": 128,
             },
         ],
         [
@@ -141,6 +145,7 @@ def test_read_default_settings_conf(settings_config: Optional[Settings]):
                 "show_progress": 0,
                 "reports_extension": "csv",
                 "language": "en",
+                "workers": os.cpu_count() or 1,
             },
         ],
     ],

diff --git a/test/unit/codeplag/test_utils.py b/test/unit/codeplag/test_utils.py
@@ -286,6 +286,7 @@ def test_save_result_to_csv(
         show_progress=0,
         threshold=65,
         language="en",
+        workers=1,
     )
     code_engine = CodeplagEngine(mock_default_logger, parsed_args)
-Original file line number
+Diff line change
@@ Expand Up / @@ -6,7 +6,6 @@ @@
     of two token sequences.
     """
     import math
     from typing import List, Literal, Sequence, Set, Tuple, Union, overload
@@ Expand Down @@