diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml
index 2e1efc9..0cb495a 100644
--- a/.github/workflows/lint.yaml
+++ b/.github/workflows/lint.yaml
@@ -36,3 +36,6 @@ jobs:
       - name: flake8
         run: |
           poetry run flake8 focus_validator/
+      - name: bandit
+        run: |
+          poetry run bandit -r focus_validator/ -ll
diff --git a/focus_validator/utils/profiler.py b/focus_validator/utils/profiler.py
new file mode 100644
index 0000000..a91ec82
--- /dev/null
+++ b/focus_validator/utils/profiler.py
@@ -0,0 +1,64 @@
+import cProfile
+import csv
+import functools
+import io
+import pstats
+
+
+class Profiler:
+    def __init__(self, csv_format=False):
+        self.csv_format = csv_format
+
+    def __call__(self, func):
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs):
+            # Profile the wrapped call
+            profiler = cProfile.Profile()
+            profiler.enable()
+            result = func(*args, **kwargs)
+            profiler.disable()
+            profiling_result = pstats.Stats(profiler)
+
+            generate_csv_file(args, profiling_result)
+
+            generate_console_output(profiler)
+
+            return result
+
+        def generate_console_output(profiler):
+            s = io.StringIO()
+            sortby = "cumulative"
+            ps = pstats.Stats(profiler, stream=s).sort_stats(sortby)
+            ps.print_stats()
+            print(s.getvalue())
+
+        def generate_csv_file(args, profiling_result):
+            if self.csv_format:
+                # Determine the filename based on class and method name
+                csv_filename = generate_file_name(args)
+                with open(csv_filename, "w", newline="") as f:
+                    w = csv.writer(f)
+                    # Write the headers
+                    headers = [
+                        "ncalls",
+                        "tottime",
+                        "percall",
+                        "cumtime",
+                        "percall",
+                        "filename:lineno(function)",
+                    ]
+                    w.writerow(headers)
+
+                    # Write each row
+                    for row in profiling_result.stats.items():
+                        func_name, (cc, nc, tt, ct, callers) = row
+                        w.writerow([nc, tt, tt / nc, ct, ct / cc, func_name])
+
+        def generate_file_name(args):
+            class_name = args[0].__class__.__name__ if args else "global"
+            method_name = func.__name__
+            base_filename = f"{class_name}_{method_name}_profile"
+            csv_filename = f"{base_filename}.csv"
+            return csv_filename
+
+        return wrapper
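For reference, a minimal usage sketch of the `Profiler` decorator added above (the `SampleWorkload` class and `crunch` method are illustrative names, not part of this change). Decorating a method with `@Profiler(csv_format=True)` profiles each call, prints a cumulative-time report to stdout, and writes a `<ClassName>_<method>_profile.csv` file in the current working directory:

```python
# Hypothetical usage sketch of the Profiler decorator; names are illustrative only.
from focus_validator.utils.profiler import Profiler


class SampleWorkload:
    @Profiler(csv_format=True)  # expected to write SampleWorkload_crunch_profile.csv
    def crunch(self, n):
        return sum(i * i for i in range(n))


if __name__ == "__main__":
    SampleWorkload().crunch(100_000)  # also prints the cProfile report to stdout
```

Note that the CSV is opened in "w" mode, so repeated calls overwrite the previous profile for the same class/method pair.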
diff --git a/pyproject.toml b/pyproject.toml
index 2818143..935cb60 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -18,6 +18,7 @@ requests = "*"
 pandera = "^0.16"
 polars = "^0.20.3"
 ddt = "^1.7.1"
+bandit = "^1.7.6"
 
 [tool.poetry.group.dev.dependencies]
 black = {extras = ["d"], version = "^23.7.0"}
@@ -42,3 +43,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry.scripts]
 focus-validator = "focus_validator.main:main"
+
+[tool.pytest.ini_options]
+log_cli = true
+log_cli_level = "INFO"
diff --git a/tests/test_performance_profiler.py b/tests/test_performance_profiler.py
index bb2b568..4dc3a73 100644
--- a/tests/test_performance_profiler.py
+++ b/tests/test_performance_profiler.py
@@ -7,6 +7,7 @@ import time
 import unittest
 
 from ddt import ddt, data, unpack
+from focus_validator.utils.profiler import Profiler
 from tests.samples.csv_random_data_generate_at_scale import generate_and_write_fake_focuses
 from focus_validator.validator import Validator
 
@@ -17,19 +18,7 @@
 
 @ddt
 class TestPerformanceProfiler(unittest.TestCase):
-    def profile_to_csv(self, profiling_result, csv_file):
-        with open(csv_file, 'w', newline='') as f:
-            w = csv.writer(f)
-            # Write the headers
-            headers = ['ncalls', 'tottime', 'percall', 'cumtime', 'percall', 'filename:lineno(function)']
-            w.writerow(headers)
-
-            # Write each row
-            for row in profiling_result.stats.items():
-                func_name, (cc, nc, tt, ct, callers) = row
-                w.writerow([nc, tt, tt/nc, ct, ct/cc, func_name])
-
-    def execute_profiler(self, file_name, performance_threshold):
+    def measure_validator(self, file_name, performance_threshold):
         # Set the environment variable for logging level
         env = os.environ.copy()
         env["LOG_LEVEL"] = "INFO"
@@ -48,44 +37,29 @@ def execute_profiler(self, file_name, performance_threshold):
             column_namespace=None,
         )
 
-        # Set up the profiler
-        profiler = cProfile.Profile()
-        profiler.enable()
-
-        # The original performance testing code
+        # Measure the execution time
         start_time = time.time()
-        validator.validate()
+        self.run_and_profile_validator(validator)
         end_time = time.time()
         duration = end_time - start_time
 
         logging.info(f"File: {file_name} Duration: {duration} seconds")
-        # Stop the profiler
-        profiler.disable()
-
-        # Save profiling data to a file
-        profiling_result = pstats.Stats(profiler)
-        profile_file_name = "profiling_data_" + file_name
-        self.profile_to_csv(profiling_result, profile_file_name)
-
-        # Optionally print out profiling report to the console
-        s = io.StringIO()
-        sortby = 'cumulative'  # Can be changed to 'time', 'calls', etc.
-        ps = pstats.Stats(profiler, stream=s).sort_stats(sortby)
-        ps.print_stats(10)
-        logging.info(s.getvalue())
-
         #Execution time check
         self.assertLess(duration, performance_threshold, f"Performance test exceeded threshold. Duration: {duration} seconds")
 
+    @Profiler(csv_format=True)
+    def run_and_profile_validator(self, validator):
+        validator.validate()
+
     @data(
-        # ("fake_focuses500000.csv", 60.0, 500000, "validate_500000_records"),
+        # ("fake_focuses500000.csv", 110.0, 500000, "validate_500000_records"),
         # ("fake_focuses250000.csv", 60.0, 250000, "validate_250000_records"),
-        # ("fake_focuses100000.csv", 30.0, 100000, "validate_100000_records"),
-        # ("fake_focuses50000.csv", 15.0, 50000, "validate_50000_records"),
-        # ("fake_focuses10000.csv", 7.0, 10000, "validate_10000_records"),
-        # ("fake_focuses5000.csv", 3.0, 5000, "validate_5000_records"),
-        ("fake_focuses2000.csv", 3.0, 2000, "validate_2000_records"),
-        ("fake_focuses2000.csv", 3.0, 1000, "validate_1000_records")
+        # ("fake_focuses100000.csv", 20.0, 100000, "validate_100000_records"),
+        # ("fake_focuses50000.csv", 11.0, 50000, "validate_50000_records"),
+        # ("fake_focuses10000.csv", 2.5, 10000, "validate_10000_records"),
+        # ("fake_focuses5000.csv", 1.8, 5000, "validate_5000_records"),
+        ("fake_focuses2000.csv", 1.0, 2000, "validate_2000_records"),
+        ("fake_focuses2000.csv", 1.0, 1000, "validate_1000_records")
     )
     @unpack
     def test_param_validator_performance(self, file_name, performance_threshold, number_of_records, case_id):
@@ -94,9 +68,9 @@ def test_param_validator_performance(self, file_name, performance_threshold, num
         env = os.environ.copy()
         env["LOG_LEVEL"] = "INFO"
 
-        logging.info("Generating file with {number_of_records} records.")
+        logging.info(f"Generating file with {number_of_records} records.")
         generate_and_write_fake_focuses(file_name, number_of_records)
-        self.execute_profiler(str(file_name), performance_threshold)
+        self.measure_validator(str(file_name), performance_threshold)
 
         logging.info("Cleaning up test file.")
         base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
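With `run_and_profile_validator` now decorated by `@Profiler(csv_format=True)`, each run should leave a profile CSV named per `generate_file_name` (presumably `TestPerformanceProfiler_run_and_profile_validator_profile.csv` here). A small sketch, assuming that filename and the header layout written by the decorator, for listing the slowest calls by cumulative time:

```python
# Sketch: inspect the profile CSV emitted by the Profiler decorator.
# The filename is derived from generate_file_name(); adjust it if yours differs.
import csv

with open("TestPerformanceProfiler_run_and_profile_validator_profile.csv", newline="") as f:
    reader = csv.reader(f)
    next(reader)  # skip header: ncalls, tottime, percall, cumtime, percall, filename:lineno(function)
    rows = [(row[5], float(row[3])) for row in reader]  # (function, cumtime)

for func, cumtime in sorted(rows, key=lambda r: r[1], reverse=True)[:10]:
    print(f"{cumtime:10.4f}s  {func}")
```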
diff --git a/tests/test_progressive_performance.py b/tests/test_progressive_performance.py
index 02e9799..ab0fd23 100644
--- a/tests/test_progressive_performance.py
+++ b/tests/test_progressive_performance.py
@@ -3,76 +3,17 @@ import subprocess
 import time
 import unittest
-
+from ddt import ddt, data, unpack
+from focus_validator.utils.profiler import Profiler
 from tests.samples.csv_random_data_generate_at_scale import generate_and_write_fake_focuses
 
 # Configure logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - [%(funcName)s] - %(message)s')
 
-
+@ddt
 class TestProgressivePerformance(unittest.TestCase):
-    @classmethod
-    def setUpClass(cls):
-        #Generate 1000 fake focuses to a CSV file
-        cls.csv_filename_1000 = 'fake_focuses1000.csv'
-        cls.csv_filename_10000 = 'fake_focuses10000.csv'
-        cls.csv_filename_50000 = 'fake_focuses50000.csv'
-        cls.csv_filename_100000 = 'fake_focuses100000.csv'
-        cls.csv_filename_250000 = 'fake_focuses250000.csv'
-        cls.csv_filename_500000 = 'fake_focuses500000.csv'
-
-        logging.info("Generating file with 1,000 records")
-        cls.generate_test_file(str(cls.csv_filename_1000), 1000)
-
-        # logging.info("Generating file with 10,0000 records")
-        # cls.generate_test_file(str(cls.csv_filename_10000), 10000)
-
-        # logging.info("Generating file with 50,0000 records")
-        # cls.generate_test_file(str(cls.csv_filename_50000), 50000)
-
-        # logging.info("Generating file with 100,0000 records")
-        # cls.generate_test_file(str(cls.csv_filename_100000), 100000)
-
-        # logging.info("Generating file with 250,0000 records")
-        # cls.generate_test_file(str(cls.csv_filename_250000), 250000)
-
-        # logging.info("Generating file with 500,0000 records")
-        # cls.generate_test_file(str(cls.csv_filename_500000), 500000)
-
-    @classmethod
-    def tearDownClass(cls):
-        base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-
-        if os.path.exists(os.path.join(base_dir, 'fake_focuses.csv')):
-            os.remove(os.path.join(base_dir, 'fake_focuses.csv'))
-
-        if os.path.exists(os.path.join(base_dir, str(cls.csv_filename_1000))):
-            os.remove(os.path.join(base_dir, str(cls.csv_filename_1000)))
-
-        if os.path.exists(os.path.join(base_dir, str(cls.csv_filename_10000))):
-            os.remove(os.path.join(base_dir, str(cls.csv_filename_10000)))
-
-        if os.path.exists(os.path.join(base_dir, str(cls.csv_filename_50000))):
-            os.remove(os.path.join(base_dir, str(cls.csv_filename_50000)))
-
-        if os.path.exists(os.path.join(base_dir, str(cls.csv_filename_100000))):
-            os.remove(os.path.join(base_dir, str(cls.csv_filename_100000)))
-
-        if os.path.exists(os.path.join(base_dir, str(cls.csv_filename_250000))):
-            os.remove(os.path.join(base_dir, str(cls.csv_filename_250000)))
-
-        if os.path.exists(os.path.join(base_dir, str(cls.csv_filename_500000))):
-            os.remove(os.path.join(base_dir, str(cls.csv_filename_500000)))
-
-    @classmethod
-    def generate_test_file(cls, csv_filename, number_of_records):
-        #Generate fake focuses to a CSV file
-        # fake_focuses = generate_fake_focus(number_of_records)
-
-        # write_fake_focuses_to_csv(fake_focuses, csv_filename)
-        generate_and_write_fake_focuses(csv_filename, number_of_records)
-
+    @Profiler(csv_format=True)
     def run_validator(self, args):
         # Get the current directory of this test file
         test_dir = os.path.dirname(os.path.abspath(__file__))
@@ -86,25 +27,26 @@ def run_validator(self, args):
         command = ['poetry', 'run', 'python', os.path.join(app_dir, 'main.py')] + args
         return subprocess.run(command, env=env, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=True)
 
-    def test_1000_record_csv_performance(self):
-        self.execute_performance(str(self.csv_filename_1000), 25.0)
+    @data(
+        # ("fake_focuses500000.csv", 115.0, 500000, "validate_500000_records"),
+        # ("fake_focuses250000.csv", 65.0, 250000, "validate_250000_records"),
+        # ("fake_focuses100000.csv", 25.0, 100000, "validate_100000_records"),
+        # ("fake_focuses50000.csv", 13.0, 50000, "validate_50000_records"),
("fake_focuses50000.csv", 13.0, 50000, "validate_50000_records"), + # ("fake_focuses10000.csv", 5.0, 10000, "validate_10000_records"), + # ("fake_focuses5000.csv", 3.5, 5000, "validate_5000_records"), + ("fake_focuses2000.csv", 2.5, 2000, "validate_2000_records"), + ("fake_focuses2000.csv", 2.6, 1000, "validate_1000_records") + ) + @unpack + def test_param_main_performance(self, file_name, performance_threshold, number_of_records, case_id): + with self.subTest(case_id=case_id): + generate_and_write_fake_focuses(file_name, number_of_records) + self.execute_performance(file_name, performance_threshold) + if os.path.exists(file_name): + os.remove(file_name) - # def test_10000_record_csv_performance(self): - # self.execute_performance(str(self.csv_filename_10000), 25.0) - - # def test_50000_record_csv_performance(self): - # self.execute_performance(str(self.csv_filename_50000), 150.0) - - # def test_100000_record_csv_performance(self): - # self.execute_performance(str(self.csv_filename_100000), 300.0) - - # def test_250000_record_csv_performance(self): - # self.execute_performance(str(self.csv_filename_250000), 300.0) - - # def test_500000_record_csv_performance(self): - # self.execute_performance(str(self.csv_filename_500000), 300.0) - def execute_performance(self, file_name, performance_threshold): + # Get the current directory of this test file test_dir = os.path.dirname(os.path.abspath(__file__))