Refactored profiling, adjusted lint and updated pytest default logging #111

Open · wants to merge 1 commit into base: main
3 changes: 3 additions & 0 deletions .github/workflows/lint.yaml
@@ -36,3 +36,6 @@ jobs:
      - name: flake8
        run: |
          poetry run flake8 focus_validator/
      - name: bandit
        run: |
          poetry run bandit -r focus_validator/ -ll
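For context, the `-ll` flag limits bandit's report to medium- and high-severity findings (per recent bandit releases). A hedged sketch of the kind of code that would still be reported, illustrative only and not from this repo:

import pickle


def load_untrusted(blob: bytes):  # hypothetical helper, for illustration only
    # bandit reports pickle deserialization (B301) at medium severity,
    # so it remains visible in a -ll run.
    return pickle.loads(blob)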
64 changes: 64 additions & 0 deletions focus_validator/utils/profiler.py
@@ -0,0 +1,64 @@
import cProfile
import csv
import functools
import io
import pstats


class Profiler:
    def __init__(self, csv_format=False):
        self.csv_format = csv_format

    def __call__(self, func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            # Wrap and execute profile
            profiler = cProfile.Profile()
            profiler.enable()
            result = func(*args, **kwargs)
            profiler.disable()
            profiling_result = pstats.Stats(profiler)

            generate_csv_file(args, profiling_result)

            generate_console_output(profiler)

            return result

        def generate_console_output(profiler):
            s = io.StringIO()
            sortby = "cumulative"
            ps = pstats.Stats(profiler, stream=s).sort_stats(sortby)
            ps.print_stats()
            print(s.getvalue())

        def generate_csv_file(args, profiling_result):
            if self.csv_format:
                # Determine the filename based on class and method name
                csv_filename = generate_file_name(args)
                with open(csv_filename, "w", newline="") as f:
                    w = csv.writer(f)
                    # Write the headers
                    headers = [
                        "ncalls",
                        "tottime",
                        "percall",
                        "cumtime",
                        "percall",
                        "filename:lineno(function)",
                    ]
                    w.writerow(headers)

                    # Write each row
                    for row in profiling_result.stats.items():
                        func_name, (cc, nc, tt, ct, callers) = row
                        w.writerow([nc, tt, tt / nc, ct, ct / cc, func_name])

        def generate_file_name(args):
            class_name = args[0].__class__.__name__ if args else "global"
            method_name = func.__name__
            base_filename = f"{class_name}_{method_name}_profile"
            csv_filename = f"{base_filename}.csv"
            return csv_filename

        return wrapper
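For reviewers, a minimal usage sketch of the new decorator (the ExampleJob class below is illustrative, not part of this PR):

from focus_validator.utils.profiler import Profiler


class ExampleJob:  # hypothetical class, for illustration only
    @Profiler(csv_format=True)
    def run(self):
        # The decorated call is profiled with cProfile; a cumulative-time report
        # is printed to the console, and because csv_format=True the stats are
        # also written to "ExampleJob_run_profile.csv" ({class}_{method}_profile.csv).
        return sum(i * i for i in range(10_000))


ExampleJob().run()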
5 changes: 5 additions & 0 deletions pyproject.toml
@@ -18,6 +18,7 @@ requests = "*"
pandera = "^0.16"
polars = "^0.20.3"
ddt = "^1.7.1"
bandit = "^1.7.6"

[tool.poetry.group.dev.dependencies]
black = {extras = ["d"], version = "^23.7.0"}
@@ -42,3 +43,7 @@ build-backend = "poetry.core.masonry.api"

[tool.poetry.scripts]
focus-validator = "focus_validator.main:main"

[tool.pytest.ini_options]
log_cli = true
log_cli_level = "INFO"
60 changes: 17 additions & 43 deletions tests/test_performance_profiler.py
@@ -7,6 +7,7 @@
import time
import unittest
from ddt import ddt, data, unpack
from focus_validator.utils.profiler import Profiler

from tests.samples.csv_random_data_generate_at_scale import generate_and_write_fake_focuses
from focus_validator.validator import Validator
@@ -17,19 +18,7 @@
@ddt
class TestPerformanceProfiler(unittest.TestCase):

    def profile_to_csv(self, profiling_result, csv_file):
        with open(csv_file, 'w', newline='') as f:
            w = csv.writer(f)
            # Write the headers
            headers = ['ncalls', 'tottime', 'percall', 'cumtime', 'percall', 'filename:lineno(function)']
            w.writerow(headers)

            # Write each row
            for row in profiling_result.stats.items():
                func_name, (cc, nc, tt, ct, callers) = row
                w.writerow([nc, tt, tt/nc, ct, ct/cc, func_name])

    def execute_profiler(self, file_name, performance_threshold):
    def measure_validator(self, file_name, performance_threshold):
        # Set the environment variable for logging level
        env = os.environ.copy()
        env["LOG_LEVEL"] = "INFO"
@@ -48,44 +37,29 @@ def execute_profiler(self, file_name, performance_threshold):
            column_namespace=None,
        )

        # Set up the profiler
        profiler = cProfile.Profile()
        profiler.enable()

        # The original performance testing code
        # Measure the execution
        start_time = time.time()
        validator.validate()
        self.run_and_profile_validator(validator)
        end_time = time.time()
        duration = end_time - start_time
        logging.info(f"File: {file_name} Duration: {duration} seconds")

        # Stop the profiler
        profiler.disable()

        # Save profiling data to a file
        profiling_result = pstats.Stats(profiler)
        profile_file_name = "profiling_data_" + file_name
        self.profile_to_csv(profiling_result, profile_file_name)

        # Optionally print out profiling report to the console
        s = io.StringIO()
        sortby = 'cumulative'  # Can be changed to 'time', 'calls', etc.
        ps = pstats.Stats(profiler, stream=s).sort_stats(sortby)
        ps.print_stats(10)
        logging.info(s.getvalue())

        # Execution time check
        self.assertLess(duration, performance_threshold, f"Performance test exceeded threshold. Duration: {duration} seconds")

    @Profiler(csv_format=True)
    def run_and_profile_validator(self, validator):
        validator.validate()

    @data(
        # ("fake_focuses500000.csv", 60.0, 500000, "validate_500000_records"),
        # ("fake_focuses500000.csv", 110.0, 500000, "validate_500000_records"),
        # ("fake_focuses250000.csv", 60.0, 250000, "validate_250000_records"),
        # ("fake_focuses100000.csv", 30.0, 100000, "validate_100000_records"),
        # ("fake_focuses50000.csv", 15.0, 50000, "validate_50000_records"),
        # ("fake_focuses10000.csv", 7.0, 10000, "validate_10000_records"),
        # ("fake_focuses5000.csv", 3.0, 5000, "validate_5000_records"),
        ("fake_focuses2000.csv", 3.0, 2000, "validate_2000_records"),
        ("fake_focuses2000.csv", 3.0, 1000, "validate_1000_records")
        # ("fake_focuses100000.csv", 20.0, 100000, "validate_100000_records"),
        # ("fake_focuses50000.csv", 11.0, 50000, "validate_50000_records"),
        # ("fake_focuses10000.csv", 2.5, 10000, "validate_10000_records"),
        # ("fake_focuses5000.csv", 1.8, 5000, "validate_5000_records"),
        ("fake_focuses2000.csv", 1.0, 2000, "validate_2000_records"),
        ("fake_focuses2000.csv", 1.0, 1000, "validate_1000_records")
    )
    @unpack
    def test_param_validator_performance(self, file_name, performance_threshold, number_of_records, case_id):
@@ -94,9 +68,9 @@ def test_param_validator_performance(self, file_name, performance_threshold, num
        env = os.environ.copy()
        env["LOG_LEVEL"] = "INFO"

        logging.info("Generating file with {number_of_records} records.")
        logging.info(f"Generating file with {number_of_records} records.")
        generate_and_write_fake_focuses(file_name, number_of_records)
        self.execute_profiler(str(file_name), performance_threshold)
        self.measure_validator(str(file_name), performance_threshold)

        logging.info("Cleaning up test file.")
        base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
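Given the decorator on run_and_profile_validator, its stats land in "TestPerformanceProfiler_run_and_profile_validator_profile.csv" in the working directory. A hedged sketch of inspecting that file after a run (assumes the tests have already been executed and the CSV exists):

import csv

# Load the per-function rows written by the Profiler decorator.
with open("TestPerformanceProfiler_run_and_profile_validator_profile.csv", newline="") as f:
    rows = list(csv.DictReader(f))

# Sort by cumulative time and print the ten most expensive functions.
# Note: the header row contains "percall" twice; DictReader keeps the last value,
# which is fine here since only cumtime and the function name are used.
for row in sorted(rows, key=lambda r: float(r["cumtime"]), reverse=True)[:10]:
    print(row["cumtime"], row["filename:lineno(function)"])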
102 changes: 22 additions & 80 deletions tests/test_progressive_performance.py
@@ -3,76 +3,17 @@
import subprocess
import time
import unittest

from ddt import ddt, data, unpack
from focus_validator.utils.profiler import Profiler
from tests.samples.csv_random_data_generate_at_scale import generate_and_write_fake_focuses

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - [%(funcName)s] - %(message)s')


@ddt
class TestProgressivePerformance(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        #Generate 1000 fake focuses to a CSV file
        cls.csv_filename_1000 = 'fake_focuses1000.csv'
        cls.csv_filename_10000 = 'fake_focuses10000.csv'
        cls.csv_filename_50000 = 'fake_focuses50000.csv'
        cls.csv_filename_100000 = 'fake_focuses100000.csv'
        cls.csv_filename_250000 = 'fake_focuses250000.csv'
        cls.csv_filename_500000 = 'fake_focuses500000.csv'

        logging.info("Generating file with 1,000 records")
        cls.generate_test_file(str(cls.csv_filename_1000), 1000)

        # logging.info("Generating file with 10,0000 records")
        # cls.generate_test_file(str(cls.csv_filename_10000), 10000)

        # logging.info("Generating file with 50,0000 records")
        # cls.generate_test_file(str(cls.csv_filename_50000), 50000)

        # logging.info("Generating file with 100,0000 records")
        # cls.generate_test_file(str(cls.csv_filename_100000), 100000)

        # logging.info("Generating file with 250,0000 records")
        # cls.generate_test_file(str(cls.csv_filename_250000), 250000)

        # logging.info("Generating file with 500,0000 records")
        # cls.generate_test_file(str(cls.csv_filename_500000), 500000)

    @classmethod
    def tearDownClass(cls):
        base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

        if os.path.exists(os.path.join(base_dir, 'fake_focuses.csv')):
            os.remove(os.path.join(base_dir, 'fake_focuses.csv'))

        if os.path.exists(os.path.join(base_dir, str(cls.csv_filename_1000))):
            os.remove(os.path.join(base_dir, str(cls.csv_filename_1000)))

        if os.path.exists(os.path.join(base_dir, str(cls.csv_filename_10000))):
            os.remove(os.path.join(base_dir, str(cls.csv_filename_10000)))

        if os.path.exists(os.path.join(base_dir, str(cls.csv_filename_50000))):
            os.remove(os.path.join(base_dir, str(cls.csv_filename_50000)))

        if os.path.exists(os.path.join(base_dir, str(cls.csv_filename_100000))):
            os.remove(os.path.join(base_dir, str(cls.csv_filename_100000)))

        if os.path.exists(os.path.join(base_dir, str(cls.csv_filename_250000))):
            os.remove(os.path.join(base_dir, str(cls.csv_filename_250000)))

        if os.path.exists(os.path.join(base_dir, str(cls.csv_filename_500000))):
            os.remove(os.path.join(base_dir, str(cls.csv_filename_500000)))

    @classmethod
    def generate_test_file(cls, csv_filename, number_of_records):
        #Generate fake focuses to a CSV file
        # fake_focuses = generate_fake_focus(number_of_records)

        # write_fake_focuses_to_csv(fake_focuses, csv_filename)
        generate_and_write_fake_focuses(csv_filename, number_of_records)


    @Profiler(csv_format=True)
    def run_validator(self, args):
        # Get the current directory of this test file
        test_dir = os.path.dirname(os.path.abspath(__file__))
@@ -86,25 +27,26 @@ def run_validator(self, args):
        command = ['poetry', 'run', 'python', os.path.join(app_dir, 'main.py')] + args
        return subprocess.run(command, env=env, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=True)

    def test_1000_record_csv_performance(self):
        self.execute_performance(str(self.csv_filename_1000), 25.0)
    @data(
        # ("fake_focuses500000.csv", 115.0, 500000, "validate_500000_records"),
        # ("fake_focuses250000.csv", 65.0, 250000, "validate_250000_records"),
        # ("fake_focuses100000.csv", 25.0, 100000, "validate_100000_records"),
        # ("fake_focuses50000.csv", 13.0, 50000, "validate_50000_records"),
        # ("fake_focuses10000.csv", 5.0, 10000, "validate_10000_records"),
        # ("fake_focuses5000.csv", 3.5, 5000, "validate_5000_records"),
        ("fake_focuses2000.csv", 2.5, 2000, "validate_2000_records"),
        ("fake_focuses2000.csv", 2.6, 1000, "validate_1000_records")
    )
    @unpack
    def test_param_main_performance(self, file_name, performance_threshold, number_of_records, case_id):
        with self.subTest(case_id=case_id):
            generate_and_write_fake_focuses(file_name, number_of_records)
            self.execute_performance(file_name, performance_threshold)
            if os.path.exists(file_name):
                os.remove(file_name)

    # def test_10000_record_csv_performance(self):
    #     self.execute_performance(str(self.csv_filename_10000), 25.0)

    # def test_50000_record_csv_performance(self):
    #     self.execute_performance(str(self.csv_filename_50000), 150.0)

    # def test_100000_record_csv_performance(self):
    #     self.execute_performance(str(self.csv_filename_100000), 300.0)

    # def test_250000_record_csv_performance(self):
    #     self.execute_performance(str(self.csv_filename_250000), 300.0)

    # def test_500000_record_csv_performance(self):
    #     self.execute_performance(str(self.csv_filename_500000), 300.0)

    def execute_performance(self, file_name, performance_threshold):

        # Get the current directory of this test file
        test_dir = os.path.dirname(os.path.abspath(__file__))
