Create Base Stat Monitor class to help creating custom monitors #325

Merged 4 commits on Dec 14, 2021
15 changes: 15 additions & 0 deletions docs/source/monitors.rst
@@ -130,6 +130,21 @@ Here is an example of how to configure a new monitor suite in your project:

``result`` stats of the spider execution

Base Stat Monitor
-----------------

Most of the monitors we create validate a numerical value from the job stats against a
configurable threshold. This common pattern leads to nearly identical code for every new
monitor we add to our projects.

To reduce this boilerplate, Spidermon provides a base class that your custom monitor can
inherit from. By setting a few attributes you end up with a fully functional monitor that
only needs to be added to your Monitor Suite.
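
For example, a minimal custom monitor and suite could look like the following sketch (the
stat name, setting name, and suite below are hypothetical and only for illustration):

.. code-block:: python

    from spidermon import MonitorSuite
    from spidermon.contrib.scrapy.monitors import BaseStatMonitor


    class MyCustomStatMonitor(BaseStatMonitor):
        # Fails unless the hypothetical "numerical_job_statistic" stat is
        # greater than or equal to the CUSTOM_STAT_THRESHOLD project setting.
        stat_name = "numerical_job_statistic"
        threshold_setting = "CUSTOM_STAT_THRESHOLD"
        assert_type = ">="


    class MyMonitorSuite(MonitorSuite):
        monitors = [MyCustomStatMonitor]

The suite is then enabled like any other monitor suite in your project (see the
configuration example earlier in this document).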

.. automodule:: spidermon.contrib.scrapy.monitors
:members: BaseStatMonitor
:noindex:

The Basic Monitors
------------------

124 changes: 124 additions & 0 deletions spidermon/contrib/scrapy/monitors.py
@@ -27,6 +27,130 @@ def monitor_description(self):
return super().monitor_description


class BaseStatMonitor(BaseScrapyMonitor):
"""Base Monitor class for stat-related monitors.

Create a monitor class inheriting from this class to have a custom
monitor that validates numerical stats from your job execution
against a configurable threshold.

As an example, we will create a new monitor that will check if the
value obtained in a job stat 'numerical_job_statistic' is greater than
or equal to the value configured in ``CUSTOM_STAT_THRESHOLD`` project
setting:

.. code-block:: python

class MyCustomStatMonitor(BaseStatMonitor):
stat_name = "numerical_job_statistic"
threshold_setting = "CUSTOM_STAT_THRESHOLD"
assert_type = ">="

For the ``assert_type`` property you can select one of the following:

== =====================
> Greater than
>= Greater than or equal
< Less than
<= Less than or equal
== Equal
!= Not equal
== =====================

    Sometimes we don't want a fixed threshold but a dynamic one, based on more than
    one stat or on data external to the job execution (e.g., you want the threshold
    to be related to another stat, or you want to get the value of a stat from a
    previous job).

    As an example, the following monitor uses as its threshold a variable number of
    allowed errors based on the number of items scraped, so it passes only if the
    number of errors is less than 1% of the number of items scraped:

.. code-block:: python

class MyCustomStatMonitor(BaseStatMonitor):
stat_name = "log_count/ERROR"
assert_type = "<"

def get_threshold(self):
item_scraped_count = self.stats.get("item_scraped_count")
return item_scraped_count * 0.01

    By default, if the stat can't be found in the job statistics, the monitor fails.
    If you want the monitor to be skipped in that case, set the ``fail_if_stat_missing``
    attribute to ``False``.


The following monitor will not fail if the job doesn't have a ``numerical_job_statistic``
value in its statistics:

.. code-block:: python

class MyCustomStatMonitor(BaseStatMonitor):
stat_name = "numerical_job_statistic"
threshold_setting = "CUSTOM_STAT_THRESHOLD"
assert_type = ">="
fail_if_stat_missing = False
"""

fail_if_stat_missing = True

def run(self, result):
has_threshold_config = any(
[hasattr(self, "threshold_setting"), hasattr(self, "get_threshold")]
)
if not has_threshold_config:
raise NotConfigured(
f"{self.__class__.__name__} should include a a `threshold_setting` attribute "
"to be configured in your project settings with the desired threshold "
"or a `get_threshold` method that returns the desired threshold."
)

if (
hasattr(self, "threshold_setting")
and self.threshold_setting not in self.crawler.settings.attributes
):
raise NotConfigured(
f"Configure {self.threshold_setting} to your project"
f"settings to use {self.monitor_name}."
)

return super().run(result)

def _get_threshold_value(self):
if hasattr(self, "get_threshold"):
return self.get_threshold()
return self.crawler.settings.get(self.threshold_setting)

def test_stat_monitor(self):
assertions = {
">": self.assertGreater,
">=": self.assertGreaterEqual,
"<": self.assertLess,
"<=": self.assertLessEqual,
"==": self.assertEqual,
"!=": self.assertNotEqual,
}
threshold = self._get_threshold_value()

if self.stat_name not in self.stats:
message = f"Unable to find '{self.stat_name}' in job stats."
if self.fail_if_stat_missing:
self.fail(message)
else:
self.skipTest(message)

value = self.stats.get(self.stat_name)

assertion_method = assertions.get(self.assert_type)
assertion_method(
value,
threshold,
msg=f"Expecting '{self.stat_name}' to be '{self.assert_type}' "
f"to '{threshold}'. Current value: '{value}'",
)


@monitors.name("Extracted Items Monitor")
class ItemCountMonitor(BaseScrapyMonitor):
"""Check if spider extracted the minimum number of items.
2 changes: 1 addition & 1 deletion spidermon/results/monitor.py
@@ -107,7 +107,7 @@ def addFailure(self, test, error):
@monitors_step_required
def addSkip(self, test, reason):
super().addSkip(test, reason)
self.step[test].status = settings.MONITOR.STATUS.FAILURE
self.step[test].status = settings.MONITOR.STATUS.SKIPPED
self.step[test].reason = reason

@monitors_step_required
168 changes: 168 additions & 0 deletions tests/contrib/scrapy/monitors/test_base_stat_monitor.py
@@ -0,0 +1,168 @@
import pytest
from spidermon.contrib.scrapy.monitors import (
BaseStatMonitor,
)
from spidermon import MonitorSuite
from spidermon.exceptions import NotConfigured
from spidermon import settings
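
# Note: these tests rely on a ``make_data`` fixture defined in the test suite's
# conftest (not part of this diff). A rough, hypothetical sketch of what such a
# fixture might provide, for reading context only (the real implementation may
# differ):
#
#     @pytest.fixture
#     def make_data():
#         def _make_data(settings=None):
#             crawler = get_crawler(settings_dict=settings)  # scrapy.utils.test.get_crawler
#             spider = Spider(name="dummy")
#             return {
#                 "stats": {},         # job stats read by the monitors
#                 "crawler": crawler,  # exposes the project settings
#                 "spider": spider,
#                 "runner": SpiderMonitorRunner(spider=spider),
#                 "job": None,
#             }
#
#         return _make_data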


@pytest.mark.parametrize(
"assertion_type,stat_value,threshold,expected_status",
[
("==", 90, 100, settings.MONITOR.STATUS.FAILURE),
("==", 100, 100, settings.MONITOR.STATUS.SUCCESS),
("==", 110, 100, settings.MONITOR.STATUS.FAILURE),
("!=", 90, 100, settings.MONITOR.STATUS.SUCCESS),
("!=", 100, 100, settings.MONITOR.STATUS.FAILURE),
("!=", 110, 100, settings.MONITOR.STATUS.SUCCESS),
(">", 99, 100, settings.MONITOR.STATUS.FAILURE),
(">", 100.1, 100, settings.MONITOR.STATUS.SUCCESS),
(">", 100, 100, settings.MONITOR.STATUS.FAILURE),
(">", 101, 100, settings.MONITOR.STATUS.SUCCESS),
(">=", 99, 100, settings.MONITOR.STATUS.FAILURE),
(">=", 100, 100, settings.MONITOR.STATUS.SUCCESS),
(">=", 101, 100, settings.MONITOR.STATUS.SUCCESS),
("<", 99, 100, settings.MONITOR.STATUS.SUCCESS),
("<", 99.9, 100, settings.MONITOR.STATUS.SUCCESS),
("<", 100, 100, settings.MONITOR.STATUS.FAILURE),
("<", 101, 100, settings.MONITOR.STATUS.FAILURE),
("<=", 99, 100, settings.MONITOR.STATUS.SUCCESS),
("<=", 100, 100, settings.MONITOR.STATUS.SUCCESS),
("<=", 101, 100, settings.MONITOR.STATUS.FAILURE),
],
)
def test_base_stat_monitor_assertion_types(
make_data, assertion_type, stat_value, threshold, expected_status
):
class TestBaseStatMonitor(BaseStatMonitor):
stat_name = "test_statistic"
threshold_setting = "THRESHOLD_SETTING"
assert_type = assertion_type

data = make_data({TestBaseStatMonitor.threshold_setting: threshold})
runner = data.pop("runner")
data["stats"][TestBaseStatMonitor.stat_name] = stat_value
monitor_suite = MonitorSuite(monitors=[TestBaseStatMonitor])

runner.run(monitor_suite, **data)
assert runner.result.monitor_results[0].status == expected_status


def test_base_stat_monitor_raise_not_configured_if_setting_not_provided(make_data):
class TestBaseStatMonitor(BaseStatMonitor):
stat_name = "test_statistic"
threshold_setting = "THRESHOLD_SETTING"
assert_type = "<"

data = make_data()
runner = data.pop("runner")
data["stats"][TestBaseStatMonitor.stat_name] = 100
monitor_suite = MonitorSuite(monitors=[TestBaseStatMonitor])

with pytest.raises(NotConfigured):
runner.run(monitor_suite, **data)


def test_not_configured_without_threshold_setting_or_method(make_data):
class TestBaseStatMonitor(BaseStatMonitor):
stat_name = "test_statistic"
assert_type = "=="

data = make_data()
runner = data.pop("runner")
data["stats"][TestBaseStatMonitor.stat_name] = 100
monitor_suite = MonitorSuite(monitors=[TestBaseStatMonitor])

with pytest.raises(NotConfigured):
runner.run(monitor_suite, **data)


def test_base_stat_monitor_using_get_threshold_method(make_data):
class TestBaseStatMonitor(BaseStatMonitor):
stat_name = "test_statistic"
assert_type = "=="

def get_threshold(self):
return 100

data = make_data()
runner = data.pop("runner")
data["stats"][TestBaseStatMonitor.stat_name] = 100
monitor_suite = MonitorSuite(monitors=[TestBaseStatMonitor])

runner.run(monitor_suite, **data)
assert runner.result.monitor_results[0].status == settings.MONITOR.STATUS.SUCCESS


def test_failure_message_describe_values_expected(make_data):
class TestBaseStatMonitor(BaseStatMonitor):
stat_name = "test_statistic"
threshold_setting = "THRESHOLD_SETTING"
assert_type = "=="

expected_threshold = 100
obtained_value = 90
data = make_data({TestBaseStatMonitor.threshold_setting: expected_threshold})

runner = data.pop("runner")
data["stats"][TestBaseStatMonitor.stat_name] = obtained_value
monitor_suite = MonitorSuite(monitors=[TestBaseStatMonitor])

runner.run(monitor_suite, **data)
    assert runner.result.monitor_results[0].reason == (
        f"Expecting '{TestBaseStatMonitor.stat_name}' to be '{TestBaseStatMonitor.assert_type}' "
        f"to '{expected_threshold}'. Current value: '{obtained_value}'"
    )


def test_fail_if_stat_can_not_be_found(make_data):
class TestBaseStatMonitor(BaseStatMonitor):
stat_name = "test_statistic"
threshold_setting = "THRESHOLD_SETTING"
assert_type = ">="

data = make_data({"THRESHOLD_SETTING": 100})
runner = data.pop("runner")
data["stats"] = {"other_stats": 1}
monitor_suite = MonitorSuite(monitors=[TestBaseStatMonitor])

runner.run(monitor_suite, **data)
assert runner.result.monitor_results[0].status == settings.MONITOR.STATUS.FAILURE


def test_failure_if_stat_can_not_be_found_and_monitor_configured_to_not_ignore_it(
make_data,
):
class TestBaseStatMonitor(BaseStatMonitor):
stat_name = "test_statistic"
threshold_setting = "THRESHOLD_SETTING"
assert_type = ">="
fail_if_stat_missing = True

data = make_data({"THRESHOLD_SETTING": 100})
runner = data.pop("runner")
data["stats"] = {"other_stats": 1}
monitor_suite = MonitorSuite(monitors=[TestBaseStatMonitor])

runner.run(monitor_suite, **data)
assert runner.result.monitor_results[0].status == settings.MONITOR.STATUS.FAILURE


def test_skipped_if_stat_can_not_be_found_and_monitor_configured_to_ignore_it(
make_data,
):
class TestBaseStatMonitor(BaseStatMonitor):
stat_name = "test_statistic"
threshold_setting = "THRESHOLD_SETTING"
assert_type = ">="
fail_if_stat_missing = False

data = make_data({"THRESHOLD_SETTING": 100})
runner = data.pop("runner")
data["stats"] = {"other_stats": 1}
monitor_suite = MonitorSuite(monitors=[TestBaseStatMonitor])

runner.run(monitor_suite, **data)
assert runner.result.monitor_results[0].status == settings.MONITOR.STATUS.SKIPPED