Skip to content

Commit

Permalink
fix: the progress indication for many workers has been reworked.
Browse files Browse the repository at this point in the history
  • Loading branch information
Artanias committed Mar 30, 2024
1 parent 762cfe5 commit de098f6
Show file tree
Hide file tree
Showing 3 changed files with 287 additions and 118 deletions.
96 changes: 67 additions & 29 deletions src/codeplag/display.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
from enum import Enum
from functools import partial
from typing import List, Optional
from typing import Final, List, Optional

import pandas as pd
from typing_extensions import Self

from codeplag.types import ASTFeatures, CompareInfo, NodeCodePlace

Expand Down Expand Up @@ -133,32 +134,69 @@ def print_compare_result(
print("+" * 40)


def calc_and_print_progress(
iteration: int,
iterations: int,
internal_iteration: int = 0,
internal_iterations: int = 0,
) -> None:
progress = _calc_progress(
iteration, iterations, internal_iteration, internal_iterations
)
print(f"Check progress: {progress:.2%}.", end="\r")


def _calc_progress(
iteration: int,
iterations: int,
internal_iteration: int = 0,
internal_iterations: int = 0,
) -> float:
if iterations == 0:
return 0.0

progress = iteration / iterations
if internal_iterations == 0:
return progress

if internal_iteration * internal_iterations:
progress += internal_iteration / (internal_iterations * iterations)
class Progress:
def __init__(self, iterations: int) -> None:
self.__iterations: Final[int] = iterations
self.__iteration: int = -1

@property
def progress(self) -> float:
if self.iterations == 0:
return 1.0
if self.__iteration <= 0:
return 0.0
return self.__iteration / self.iterations

@property
def iterations(self) -> int:
return self.__iterations if self.__iterations > 0 else 0

def __iter__(self) -> Self:
return self

def __next__(self) -> float:
if self.progress == 1.0:
raise StopIteration("The progress has already been completed.")
self.__iteration += 1
return self.progress

def __str__(self) -> str:
return f"Progress: {self.progress:.2%}"


class ComplexProgress(Progress):
def __init__(self, iterations: int) -> None:
super(ComplexProgress, self).__init__(iterations)
self.__internal_progresses: list[Progress] = []

def add_internal_progress(self, internal_iterations: int) -> None:
if len(self.__internal_progresses) == self.iterations:
raise IndexError("The internal iteration count limit was exceeded.")
self.__internal_progresses.append(Progress(internal_iterations))

@property
def progress(self) -> float:
if self.iterations == 0:
return 1.0
return float(
sum(
internal_progress.progress / self.iterations
for internal_progress in self.__internal_progresses
)
)

return progress
def __next__(self) -> float:
if self.progress == 1.0:
raise StopIteration("The progress has already been completed.")
for internal_progress in self.__internal_progresses:
if internal_progress.progress == 1.0:
continue
if next(internal_progress) == 1.0:
continue
break
return self.progress


def print_progress_and_increase(progress: Progress) -> None:
print(f"{progress}.", end="\r")
next(progress)
176 changes: 96 additions & 80 deletions src/codeplag/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from datetime import datetime
from itertools import combinations
from pathlib import Path
from time import perf_counter, time
from time import monotonic, perf_counter
from typing import Any, Dict, Generator, List, Literal, Optional, Tuple

import jinja2
Expand All @@ -33,7 +33,12 @@
TEMPLATE_PATH,
)
from codeplag.cplag.utils import CFeaturesGetter
from codeplag.display import calc_and_print_progress, print_compare_result
from codeplag.display import (
ComplexProgress,
Progress,
print_compare_result,
print_progress_and_increase,
)
from codeplag.getfeatures import AbstractGetter
from codeplag.pyplag.utils import PyFeaturesGetter
from codeplag.types import (
Expand Down Expand Up @@ -342,7 +347,7 @@ def __init__(self, logger: logging.Logger, parsed_args: Dict[str, Any]) -> None:
settings_conf = read_settings_conf()
self._set_features_getter(parsed_args)

self.mode: str = parsed_args.pop("mode", "many_to_many")
self.mode: Mode = parsed_args.pop("mode", "many_to_many")
self.show_progress: Flag = settings_conf["show_progress"]
self.threshold: int = settings_conf["threshold"]

Expand Down Expand Up @@ -532,8 +537,11 @@ def _do_step(
processing: List[ProcessingWorksInfo],
work1: ASTFeatures,
work2: ASTFeatures,
progress: Optional[Progress],
) -> None:
if work1.filepath == work2.filepath:
if progress is not None:
print_progress_and_increase(progress)
return

work1, work2 = sorted([work1, work2])
Expand All @@ -546,6 +554,8 @@ def _do_step(
)
return
self._handle_compare_result(work1, work2, metrics)
if progress is not None:
print_progress_and_increase(progress)

def _handle_compare_result(
self,
Expand Down Expand Up @@ -573,12 +583,18 @@ def _handle_compare_result(
),
)

def _handle_completed_futures(self, processing: List[ProcessingWorksInfo]):
def _handle_completed_futures(
self,
processing: List[ProcessingWorksInfo],
progress: Optional[Progress],
):
for proc_works_info in processing:
metrics: CompareInfo = proc_works_info.compare_future.result()
self._handle_compare_result(
proc_works_info.work1, proc_works_info.work2, metrics, save=True
)
if progress is not None:
print_progress_and_increase(progress)

def _settings_show(self) -> None:
settings_config = read_settings_conf()
Expand All @@ -601,94 +617,94 @@ def _settings_modify(self) -> None:

write_settings_conf(settings_config)

def __many_to_many_check(
self,
features_from_files: List[ASTFeatures],
features_from_gh_files: List[ASTFeatures],
):
works: List[ASTFeatures] = []
works.extend(features_from_files)
works.extend(self.features_getter.get_from_dirs(self.directories))
works.extend(features_from_gh_files)
works.extend(
self.features_getter.get_from_github_project_folders(
self.github_project_folders
)
)
works.extend(self.features_getter.get_from_users_repos(self.github_user))

progress = None
if self.show_progress:
count_works = len(works)
progress = Progress(calc_iterations(count_works))

with ProcessPoolExecutor() as executor:
processed: List[ProcessingWorksInfo] = []
for i, work1 in enumerate(works):
for j, work2 in enumerate(works):
if i <= j:
continue
self._do_step(executor, processed, work1, work2, progress)
self._handle_completed_futures(processed, progress)

def __one_to_one_check(
self,
features_from_files: List[ASTFeatures],
features_from_gh_files: List[ASTFeatures],
):
combined_elements = filter(
bool,
(
features_from_files,
*self.features_getter.get_from_dirs(self.directories, independent=True),
features_from_gh_files,
*self.features_getter.get_from_github_project_folders(
self.github_project_folders, independent=True
),
*self.features_getter.get_from_users_repos(
self.github_user, independent=True
),
),
)
complex_progress = None
if self.show_progress:
combined_elements = list(combined_elements)
count_sequences = len(combined_elements)
complex_progress = ComplexProgress(
calc_iterations(count_sequences, self.mode)
)
cases = combinations(combined_elements, r=2)
with ProcessPoolExecutor() as executor:
processed: List[ProcessingWorksInfo] = []
for case in cases:
first_sequence, second_sequence = case
if complex_progress is not None:
complex_progress.add_internal_progress(
len(first_sequence) * len(second_sequence)
)
for work1 in first_sequence:
for work2 in second_sequence:
self._do_step(
executor, processed, work1, work2, complex_progress
)
self._handle_completed_futures(processed, complex_progress)

def __check(self) -> None:
self.logger.debug(
f"Mode: {self.mode}; Extension: {self.features_getter.extension}."
)

begin_time = time()

begin_time = monotonic()
features_from_files = self.features_getter.get_from_files(self.files)
features_from_gh_files = self.features_getter.get_from_github_files(
self.github_files
)

self.logger.info("Starting searching for plagiarism ...")
if self.mode == "many_to_many":
works: List[ASTFeatures] = []
works.extend(features_from_files)
works.extend(self.features_getter.get_from_dirs(self.directories))
works.extend(features_from_gh_files)
works.extend(
self.features_getter.get_from_github_project_folders(
self.github_project_folders
)
)
works.extend(self.features_getter.get_from_users_repos(self.github_user))

count_works = len(works)
iterations = calc_iterations(count_works)
iteration = 0
with ProcessPoolExecutor() as executor:
processed: List[ProcessingWorksInfo] = []
for i, work1 in enumerate(works):
for j, work2 in enumerate(works):
if i <= j:
continue

if self.show_progress:
calc_and_print_progress(iteration, iterations)
iteration += 1

self._do_step(executor, processed, work1, work2)
self._handle_completed_futures(processed)
self.__many_to_many_check(features_from_files, features_from_gh_files)
elif self.mode == "one_to_one":
combined_elements = filter(
bool,
(
features_from_files,
*self.features_getter.get_from_dirs(
self.directories, independent=True
),
features_from_gh_files,
*self.features_getter.get_from_github_project_folders(
self.github_project_folders, independent=True
),
*self.features_getter.get_from_users_repos(
self.github_user, independent=True
),
),
)
if self.show_progress:
combined_elements = list(combined_elements)
count_sequences = len(combined_elements)
iterations = calc_iterations(count_sequences, self.mode)
iteration = 0

cases = combinations(combined_elements, r=2)
with ProcessPoolExecutor() as executor:
processed: List[ProcessingWorksInfo] = []
for case in cases:
first_sequence, second_sequence = case
internal_iterations = len(first_sequence) * len(second_sequence)
internal_iteration = 0
for work1 in first_sequence:
for work2 in second_sequence:
if self.show_progress:
calc_and_print_progress(
iteration, # type: ignore
iterations, # type: ignore
internal_iteration,
internal_iterations,
)
internal_iteration += 1

self._do_step(executor, processed, work1, work2)
if self.show_progress:
iteration += 1 # type: ignore
self._handle_completed_futures(processed)

self.logger.debug(f"Time for all {time() - begin_time:.2f}s")
self.__one_to_one_check(features_from_files, features_from_gh_files)
self.logger.debug(f"Time for all {monotonic() - begin_time:.2f}s")
self.logger.info("Ending searching for plagiarism ...")

def _report_create(self) -> Literal[0, 1]:
Expand Down
Loading

0 comments on commit de098f6

Please sign in to comment.