Skip to content

Commit

Permalink
feat: Show in progress time spent, predicted time left, and count of …
Browse files Browse the repository at this point in the history
…workers. Also adds the possibility to change the number of workers.
  • Loading branch information
Artanias committed Mar 31, 2024
1 parent de098f6 commit e802258
Show file tree
Hide file tree
Showing 12 changed files with 80 additions and 44 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ repos:
hooks:
- id: black
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.0.287
rev: v0.3.4
hooks:
- id: ruff
- repo: local
Expand Down
1 change: 0 additions & 1 deletion src/codeplag/algorithms/tokenbased.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
of two token sequences.
"""


import math
from typing import List, Literal, Sequence, Set, Tuple, Union, overload

Expand Down
9 changes: 9 additions & 0 deletions src/codeplag/codeplagcli.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
This module contains the CLI of the codeplag util and
the internal classes necessary for it.
"""

import argparse
from pathlib import Path
from typing import List, Optional
Expand All @@ -16,6 +17,7 @@
REPORTS_EXTENSION_CHOICE,
UTIL_NAME,
UTIL_VERSION,
WORKERS_CHOICE,
)


Expand Down Expand Up @@ -134,6 +136,13 @@ def __add_settings_path(self, subparsers: argparse._SubParsersAction) -> None:
type=str,
choices=LANGUAGE_CHOICE,
)
settings_modify.add_argument(
"-w",
"--workers",
help="The maximum number of processes that can be used to compare works.",
type=int,
choices=WORKERS_CHOICE,
)

# settings show
settings_commands.add_parser(
Expand Down
2 changes: 2 additions & 0 deletions src/codeplag/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
DEFAULT_LANGUAGE,
DEFAULT_REPORT_EXTENSION,
DEFAULT_THRESHOLD,
DEFAULT_WORKERS,
)
from codeplag.logger import codeplag_logger as logger
from codeplag.types import Settings
Expand Down Expand Up @@ -85,4 +86,5 @@ def write_settings_conf(settings: Settings) -> None:
show_progress=0,
reports_extension=DEFAULT_REPORT_EXTENSION,
language=DEFAULT_LANGUAGE,
workers=DEFAULT_WORKERS,
)
20 changes: 15 additions & 5 deletions src/codeplag/consts.tmp.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os
import re
from pathlib import Path
from typing import Dict, Final, List, Tuple
from typing import Dict, Final, List, Tuple, get_args

from codeplag.types import (
Extension,
Expand All @@ -22,13 +23,16 @@
"ru": Path("@LIB_PATH@/report_ru.templ"),
"en": Path("@LIB_PATH@/report_en.templ"),
}
# =====

# Default values
DEFAULT_THRESHOLD: Final[Threshold] = 65
DEFAULT_WEIGHTS: Final[Tuple[float, float, float, float]] = (1.0, 0.4, 0.4, 0.4)
DEFAULT_LANGUAGE: Final[Language] = "en"
DEFAULT_REPORT_EXTENSION: Final[ReportsExtension] = "csv"
DEFAULT_GENERAL_REPORT_NAME: Final[str] = "report.html"
DEFAULT_WORKERS: Final[int] = os.cpu_count() or 1
# =============

GET_FRAZE: Final[str] = "Getting works features from"

Expand All @@ -52,10 +56,16 @@
"compliance_matrix",
)

MODE_CHOICE: Final[List[Mode]] = ["many_to_many", "one_to_one"]
REPORTS_EXTENSION_CHOICE: Final[List[ReportsExtension]] = ["csv", "json"]
EXTENSION_CHOICE: Final[List[Extension]] = ["py", "cpp"]
LANGUAGE_CHOICE: Final[List[Language]] = ["en", "ru"]
# Choices
MODE_CHOICE: Final[Tuple[Mode, ...]] = get_args(Mode)
REPORTS_EXTENSION_CHOICE: Final[Tuple[ReportsExtension, ...]] = get_args(
ReportsExtension
)
EXTENSION_CHOICE: Final[Tuple[Extension, ...]] = get_args(Extension)
LANGUAGE_CHOICE: Final[Tuple[Language, ...]] = get_args(Language)
WORKERS_CHOICE: Final[List[int]] = list(range(1, DEFAULT_WORKERS + 1))
# =======

ALL_EXTENSIONS: Final[Tuple[re.Pattern]] = (re.compile(r"\..*$"),)
# NOTE: changes to values by key are not checked
SUPPORTED_EXTENSIONS: Final[Dict[Extension, Extensions]] = {
Expand Down
7 changes: 1 addition & 6 deletions src/codeplag/display.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ def __str__(self) -> str:
class ComplexProgress(Progress):
def __init__(self, iterations: int) -> None:
super(ComplexProgress, self).__init__(iterations)
self.__internal_progresses: list[Progress] = []
self.__internal_progresses: List[Progress] = []

def add_internal_progress(self, internal_iterations: int) -> None:
if len(self.__internal_progresses) == self.iterations:
Expand Down Expand Up @@ -195,8 +195,3 @@ def __next__(self) -> float:
continue
break
return self.progress


def print_progress_and_increase(progress: Progress) -> None:
    """Print the current progress on one line, then advance it by one step.

    The printed text is the string form of ``progress`` followed by a period.
    """
    status_line = f"{progress}."
    # "\r" moves the cursor back to the line start, so the next status
    # printed this way overwrites the current one instead of adding a line.
    print(status_line, end="\r")
    next(progress)
1 change: 1 addition & 0 deletions src/codeplag/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@ class Settings(TypedDict):
reports_extension: ReportsExtension
show_progress: Flag
threshold: Threshold
workers: int


class SameHead(NamedTuple):
Expand Down
72 changes: 43 additions & 29 deletions src/codeplag/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import os
import uuid
from concurrent.futures import Future, ProcessPoolExecutor
from datetime import datetime
from datetime import datetime, timedelta
from itertools import combinations
from pathlib import Path
from time import monotonic, perf_counter
Expand Down Expand Up @@ -37,7 +37,6 @@
ComplexProgress,
Progress,
print_compare_result,
print_progress_and_increase,
)
from codeplag.getfeatures import AbstractGetter
from codeplag.pyplag.utils import PyFeaturesGetter
Expand Down Expand Up @@ -321,6 +320,7 @@ def read_df(path: Path) -> pd.DataFrame:
return pd.read_csv(path, sep=";", index_col=0, dtype=object)


# TODO: Split this disaster class into smaller classes and functions
class CodeplagEngine:
def __init__(self, logger: logging.Logger, parsed_args: Dict[str, Any]) -> None:
self.root: str = parsed_args.pop("root")
Expand All @@ -341,15 +341,18 @@ def __init__(self, logger: logging.Logger, parsed_args: Dict[str, Any]) -> None:
"reports_extension"
)
self.language: Language = parsed_args.pop("language")
self.workers: int = parsed_args.pop("workers")
elif self.root == "report":
self.path: Path = parsed_args.pop("path")
else:
settings_conf = read_settings_conf()
self._set_features_getter(parsed_args)

self.progress: Optional[Progress] = None
self.mode: Mode = parsed_args.pop("mode", "many_to_many")
self.show_progress: Flag = settings_conf["show_progress"]
self.threshold: int = settings_conf["threshold"]
self.workers: int = settings_conf["workers"]

self.reports: Optional[Path] = settings_conf.get("reports")
self.__reports_extension: ReportsExtension = settings_conf[
Expand Down Expand Up @@ -531,17 +534,37 @@ def _create_future_compare(
) -> Future:
return executor.submit(compare_works, work1, work2, self.threshold)

def __print_pretty_progress_and_increase(self) -> None:
    """Print a one-line status report and advance the progress by one step.

    The line shows the progress itself, the time spent since
    ``self.begin_time``, a prediction of the time left, and the number of
    workers. Nothing is printed or advanced when ``self.progress`` is None.
    """
    progress = self.progress
    if progress is None:
        return

    elapsed_seconds = monotonic() - self.begin_time
    done_fraction = progress.progress
    if done_fraction == 0.0:
        # Nothing is completed yet, so there is no rate to extrapolate from.
        time_left = "N/A"
    else:
        # Extrapolate linearly: the remaining share of the work is assumed
        # to take as long per unit as the share that is already done.
        time_left = timedelta(
            seconds=int((1.0 - done_fraction) / done_fraction * elapsed_seconds)
        )
    # Whole seconds only, to keep the printed duration short.
    elapsed = timedelta(seconds=int(elapsed_seconds))

    # "\r" returns the cursor to the line start so the next report
    # overwrites this one.
    print(
        f"{progress}, "
        f"{elapsed} time spent [predicted time left {time_left}], "
        f"{self.workers} workers",
        end="\r",
    )
    next(progress)

def _do_step(
self,
executor: ProcessPoolExecutor,
processing: List[ProcessingWorksInfo],
work1: ASTFeatures,
work2: ASTFeatures,
progress: Optional[Progress],
) -> None:
if work1.filepath == work2.filepath:
if progress is not None:
print_progress_and_increase(progress)
self.__print_pretty_progress_and_increase()
return

work1, work2 = sorted([work1, work2])
Expand All @@ -554,8 +577,7 @@ def _do_step(
)
return
self._handle_compare_result(work1, work2, metrics)
if progress is not None:
print_progress_and_increase(progress)
self.__print_pretty_progress_and_increase()

def _handle_compare_result(
self,
Expand Down Expand Up @@ -586,15 +608,13 @@ def _handle_compare_result(
def _handle_completed_futures(
self,
processing: List[ProcessingWorksInfo],
progress: Optional[Progress],
):
for proc_works_info in processing:
metrics: CompareInfo = proc_works_info.compare_future.result()
self._handle_compare_result(
proc_works_info.work1, proc_works_info.work2, metrics, save=True
)
if progress is not None:
print_progress_and_increase(progress)
self.__print_pretty_progress_and_increase()

def _settings_show(self) -> None:
settings_config = read_settings_conf()
Expand Down Expand Up @@ -633,19 +653,17 @@ def __many_to_many_check(
)
works.extend(self.features_getter.get_from_users_repos(self.github_user))

progress = None
if self.show_progress:
count_works = len(works)
progress = Progress(calc_iterations(count_works))

with ProcessPoolExecutor() as executor:
self.progress = Progress(calc_iterations(count_works))
with ProcessPoolExecutor(max_workers=self.workers) as executor:
processed: List[ProcessingWorksInfo] = []
for i, work1 in enumerate(works):
for j, work2 in enumerate(works):
if i <= j:
continue
self._do_step(executor, processed, work1, work2, progress)
self._handle_completed_futures(processed, progress)
self._do_step(executor, processed, work1, work2)
self._handle_completed_futures(processed)

def __one_to_one_check(
self,
Expand All @@ -666,34 +684,30 @@ def __one_to_one_check(
),
),
)
complex_progress = None
if self.show_progress:
combined_elements = list(combined_elements)
count_sequences = len(combined_elements)
complex_progress = ComplexProgress(
calc_iterations(count_sequences, self.mode)
)
self.progress = ComplexProgress(calc_iterations(count_sequences, self.mode))
cases = combinations(combined_elements, r=2)
with ProcessPoolExecutor() as executor:
with ProcessPoolExecutor(max_workers=self.workers) as executor:
processed: List[ProcessingWorksInfo] = []
for case in cases:
first_sequence, second_sequence = case
if complex_progress is not None:
complex_progress.add_internal_progress(
if self.progress is not None:
assert isinstance(self.progress, ComplexProgress)
self.progress.add_internal_progress(
len(first_sequence) * len(second_sequence)
)
for work1 in first_sequence:
for work2 in second_sequence:
self._do_step(
executor, processed, work1, work2, complex_progress
)
self._handle_completed_futures(processed, complex_progress)
self._do_step(executor, processed, work1, work2)
self._handle_completed_futures(processed)

def __check(self) -> None:
self.logger.debug(
f"Mode: {self.mode}; Extension: {self.features_getter.extension}."
)
begin_time = monotonic()
self.begin_time = monotonic()
features_from_files = self.features_getter.get_from_files(self.files)
features_from_gh_files = self.features_getter.get_from_github_files(
self.github_files
Expand All @@ -704,7 +718,7 @@ def __check(self) -> None:
self.__many_to_many_check(features_from_files, features_from_gh_files)
elif self.mode == "one_to_one":
self.__one_to_one_check(features_from_files, features_from_gh_files)
self.logger.debug(f"Time for all {monotonic() - begin_time:.2f}s")
self.logger.debug(f"Time for all {monotonic() - self.begin_time:.2f}s")
self.logger.info("Ending searching for plagiarism ...")

def _report_create(self) -> Literal[0, 1]:
Expand Down
2 changes: 1 addition & 1 deletion src/webparsers/async_github_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,7 @@ async def get_files_generator_from_sha_commit(
response: Dict[str, Any] = await self.send_get_request(
self.GIT_TREE, {"username": owner, "repo": repo, "sha": sha}
)
tree: list[Dict[str, Any]] = response["tree"]
tree: List[Dict[str, Any]] = response["tree"]
for node in tree:
current_path = f"{path}/{node['path']}"
full_link = f"{_GH_URL}{owner}/{repo}/blob/{branch.name}{current_path}"
Expand Down
2 changes: 1 addition & 1 deletion src/webparsers/github_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ def get_files_generator_from_sha_commit(
) -> Iterator[WorkInfo]:
api_url = f"/repos/{owner}/{repo}/git/trees/{sha}"
jresponse: Dict[str, Any] = self.send_get_request(api_url).json()
tree: list[Dict[str, Any]] = jresponse["tree"]
tree: List[Dict[str, Any]] = jresponse["tree"]
for node in tree:
current_path = f"{path}/{node['path']}"
full_link = f"{_GH_URL}{owner}/{repo}/blob/{branch.name}{current_path}"
Expand Down
5 changes: 5 additions & 0 deletions test/unit/codeplag/test_config.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import json
import logging
import os
from pathlib import Path
from typing import Optional
from unittest.mock import MagicMock
Expand Down Expand Up @@ -115,6 +116,7 @@ def test_read_default_settings_conf(settings_config: Optional[Settings]):
"show_progress": 0,
"reports_extension": "csv",
"language": "en",
"workers": os.cpu_count() or 1,
},
],
[
Expand All @@ -124,13 +126,15 @@ def test_read_default_settings_conf(settings_config: Optional[Settings]):
"show_progress": 1,
"reports_extension": "json",
"language": "ru",
"workers": 128,
},
{
"threshold": 99,
"environment": Path("/home/bukabyka/.env"),
"show_progress": 1,
"reports_extension": "json",
"language": "ru",
"workers": 128,
},
],
[
Expand All @@ -141,6 +145,7 @@ def test_read_default_settings_conf(settings_config: Optional[Settings]):
"show_progress": 0,
"reports_extension": "csv",
"language": "en",
"workers": os.cpu_count() or 1,
},
],
],
Expand Down
1 change: 1 addition & 0 deletions test/unit/codeplag/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,7 @@ def test_save_result_to_csv(
show_progress=0,
threshold=65,
language="en",
workers=1,
)
code_engine = CodeplagEngine(mock_default_logger, parsed_args)

Expand Down

0 comments on commit e802258

Please sign in to comment.