Create Base Stat Monitor class to help creating custom monitors #325

Merged 4 commits on Dec 14, 2021
15 changes: 15 additions & 0 deletions docs/source/monitors.rst
@@ -130,6 +130,21 @@ Here is an example of how to configure a new monitor suite in your project:

``result`` stats of the spider execution

Base Stat Monitor
-----------------

Most of the monitors we create validate a numerical value from the job stats against a
configurable threshold. This common pattern leads to nearly identical code for every new
monitor we add to our projects.

To reduce this boilerplate, Spidermon provides a base class that your custom monitor can
inherit from. By setting a few attributes you end up with a fully functional monitor that
only needs to be added to your Monitor Suite.
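
For example, a minimal custom monitor and suite could look like the following sketch (the
stat name, setting name, and suite below are hypothetical and only for illustration):

.. code-block:: python

    from spidermon import MonitorSuite
    from spidermon.contrib.scrapy.monitors import BaseStatMonitor


    class MyCustomStatMonitor(BaseStatMonitor):
        # Fails unless the hypothetical "numerical_job_statistic" stat is
        # greater than or equal to the CUSTOM_STAT_THRESHOLD project setting.
        stat_name = "numerical_job_statistic"
        threshold_setting = "CUSTOM_STAT_THRESHOLD"
        assert_type = ">="


    class MyMonitorSuite(MonitorSuite):
        monitors = [MyCustomStatMonitor]

The suite is then enabled like any other monitor suite in your project (see the
configuration example earlier in this document).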

.. automodule:: spidermon.contrib.scrapy.monitors
:members: BaseStatMonitor
:noindex:

The Basic Monitors
------------------

124 changes: 124 additions & 0 deletions spidermon/contrib/scrapy/monitors.py
@@ -27,6 +27,130 @@ def monitor_description(self):
return super().monitor_description


class BaseStatMonitor(BaseScrapyMonitor):
"""Base Monitor class for stat-related monitors.

Create a monitor class inheriting from this class to have a custom
monitor that validates numerical stats from your job execution
against a configurable threshold.

As an example, we will create a new monitor that will check if the
value obtained in a job stat 'numerical_job_statistic' is greater than
or equal to the value configured in ``CUSTOM_STAT_THRESHOLD`` project
setting:

.. code-block:: python

class MyCustomStatMonitor(BaseStatMonitor):
stat_name = "numerical_job_statistic"
threshold_setting = "CUSTOM_STAT_THRESHOLD"
assert_type = ">="

For the ``assert_type`` property you can select one of the following:

== =====================
> Greater than
>= Greater than or equal
< Less than
<= Less than or equal
== Equal
!= Not equal
== =====================

    Sometimes we don't want a fixed threshold but a dynamic one, based on more than
    one stat or on data external to the job execution (e.g., you want the threshold
    to be related to another stat, or you want to get the value of a stat from a
    previous job).

    As an example, the following monitor uses as its threshold a variable number of
    allowed errors based on the number of items scraped, so it passes only if the
    number of errors is less than 1% of the number of items scraped:

.. code-block:: python

class MyCustomStatMonitor(BaseStatMonitor):
stat_name = "log_count/ERROR"
assert_type = "<"

def get_threshold(self):
item_scraped_count = self.stats.get("item_scraped_count")
return item_scraped_count * 0.01

    By default, if the stat can't be found in the job statistics, the monitor fails.
    If you want the monitor to be skipped in that case, set the ``fail_if_stat_missing``
    attribute to ``False``.


The following monitor will not fail if the job doesn't have a ``numerical_job_statistic``
value in its statistics:

.. code-block:: python

class MyCustomStatMonitor(BaseStatMonitor):
stat_name = "numerical_job_statistic"
threshold_setting = "CUSTOM_STAT_THRESHOLD"
assert_type = ">="
fail_if_stat_missing = False
"""

fail_if_stat_missing = True

def run(self, result):
has_threshold_config = any(
[hasattr(self, "threshold_setting"), hasattr(self, "get_threshold")]
)
if not has_threshold_config:
raise NotConfigured(
f"{self.__class__.__name__} should include a a `threshold_setting` attribute "
"to be configured in your project settings with the desired threshold "
"or a `get_threshold` method that returns the desired threshold."
)

if (
hasattr(self, "threshold_setting")
and self.threshold_setting not in self.crawler.settings.attributes
):
raise NotConfigured(
f"Configure {self.threshold_setting} to your project"
f"settings to use {self.monitor_name}."
)

return super().run(result)

def _get_threshold_value(self):
if hasattr(self, "get_threshold"):
return self.get_threshold()
return self.crawler.settings.get(self.threshold_setting)

def test_stat_monitor(self):
assertions = {
">": self.assertGreater,
">=": self.assertGreaterEqual,
"<": self.assertLess,
"<=": self.assertLessEqual,
"==": self.assertEqual,
"!=": self.assertNotEqual,
}
threshold = self._get_threshold_value()

if self.stat_name not in self.stats:
message = f"Unable to find '{self.stat_name}' in job stats."
if self.fail_if_stat_missing:
self.fail(message)
else:
self.skipTest(message)

value = self.stats.get(self.stat_name)

assertion_method = assertions.get(self.assert_type)
assertion_method(
value,
threshold,
msg=f"Expecting '{self.stat_name}' to be '{self.assert_type}' "
f"to '{threshold}'. Current value: '{value}'",
)


@monitors.name("Extracted Items Monitor")
class ItemCountMonitor(BaseScrapyMonitor):
"""Check if spider extracted the minimum number of items.
2 changes: 1 addition & 1 deletion spidermon/results/monitor.py
@@ -107,7 +107,7 @@ def addFailure(self, test, error):
@monitors_step_required
def addSkip(self, test, reason):
super().addSkip(test, reason)
self.step[test].status = settings.MONITOR.STATUS.FAILURE
self.step[test].status = settings.MONITOR.STATUS.SKIPPED
self.step[test].reason = reason

@monitors_step_required
168 changes: 168 additions & 0 deletions tests/contrib/scrapy/monitors/test_base_stat_monitor.py
@@ -0,0 +1,168 @@
import pytest
from spidermon.contrib.scrapy.monitors import (
BaseStatMonitor,
)
from spidermon import MonitorSuite
from spidermon.exceptions import NotConfigured
from spidermon import settings
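
# Note: these tests rely on a ``make_data`` fixture defined in the test suite's
# conftest (not part of this diff). A rough, hypothetical sketch of what such a
# fixture might provide, for reading context only (the real implementation may
# differ):
#
#     @pytest.fixture
#     def make_data():
#         def _make_data(settings=None):
#             crawler = get_crawler(settings_dict=settings)  # scrapy.utils.test.get_crawler
#             spider = Spider(name="dummy")
#             return {
#                 "stats": {},         # job stats read by the monitors
#                 "crawler": crawler,  # exposes the project settings
#                 "spider": spider,
#                 "runner": SpiderMonitorRunner(spider=spider),
#                 "job": None,
#             }
#
#         return _make_data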


@pytest.mark.parametrize(
"assertion_type,stat_value,threshold,expected_status",
[
("==", 90, 100, settings.MONITOR.STATUS.FAILURE),
("==", 100, 100, settings.MONITOR.STATUS.SUCCESS),
("==", 110, 100, settings.MONITOR.STATUS.FAILURE),
("!=", 90, 100, settings.MONITOR.STATUS.SUCCESS),
("!=", 100, 100, settings.MONITOR.STATUS.FAILURE),
("!=", 110, 100, settings.MONITOR.STATUS.SUCCESS),
(">", 99, 100, settings.MONITOR.STATUS.FAILURE),
(">", 100.1, 100, settings.MONITOR.STATUS.SUCCESS),
(">", 100, 100, settings.MONITOR.STATUS.FAILURE),
(">", 101, 100, settings.MONITOR.STATUS.SUCCESS),
(">=", 99, 100, settings.MONITOR.STATUS.FAILURE),
(">=", 100, 100, settings.MONITOR.STATUS.SUCCESS),
(">=", 101, 100, settings.MONITOR.STATUS.SUCCESS),
("<", 99, 100, settings.MONITOR.STATUS.SUCCESS),
("<", 99.9, 100, settings.MONITOR.STATUS.SUCCESS),
("<", 100, 100, settings.MONITOR.STATUS.FAILURE),
("<", 101, 100, settings.MONITOR.STATUS.FAILURE),
("<=", 99, 100, settings.MONITOR.STATUS.SUCCESS),
("<=", 100, 100, settings.MONITOR.STATUS.SUCCESS),
("<=", 101, 100, settings.MONITOR.STATUS.FAILURE),
],
)
def test_base_stat_monitor_assertion_types(
make_data, assertion_type, stat_value, threshold, expected_status
):
class TestBaseStatMonitor(BaseStatMonitor):
stat_name = "test_statistic"
threshold_setting = "THRESHOLD_SETTING"
assert_type = assertion_type

data = make_data({TestBaseStatMonitor.threshold_setting: threshold})
runner = data.pop("runner")
data["stats"][TestBaseStatMonitor.stat_name] = stat_value
monitor_suite = MonitorSuite(monitors=[TestBaseStatMonitor])

runner.run(monitor_suite, **data)
assert runner.result.monitor_results[0].status == expected_status


def test_base_stat_monitor_raise_not_configured_if_setting_not_provided(make_data):
class TestBaseStatMonitor(BaseStatMonitor):
stat_name = "test_statistic"
threshold_setting = "THRESHOLD_SETTING"
assert_type = "<"

data = make_data()
runner = data.pop("runner")
data["stats"][TestBaseStatMonitor.stat_name] = 100
monitor_suite = MonitorSuite(monitors=[TestBaseStatMonitor])

with pytest.raises(NotConfigured):
runner.run(monitor_suite, **data)


def test_not_configured_without_threshold_setting_or_method(make_data):
class TestBaseStatMonitor(BaseStatMonitor):
stat_name = "test_statistic"
assert_type = "=="

data = make_data()
runner = data.pop("runner")
data["stats"][TestBaseStatMonitor.stat_name] = 100
monitor_suite = MonitorSuite(monitors=[TestBaseStatMonitor])

with pytest.raises(NotConfigured):
runner.run(monitor_suite, **data)


def test_base_stat_monitor_using_get_threshold_method(make_data):
class TestBaseStatMonitor(BaseStatMonitor):
stat_name = "test_statistic"
assert_type = "=="

def get_threshold(self):
return 100

data = make_data()
runner = data.pop("runner")
data["stats"][TestBaseStatMonitor.stat_name] = 100
monitor_suite = MonitorSuite(monitors=[TestBaseStatMonitor])

runner.run(monitor_suite, **data)
assert runner.result.monitor_results[0].status == settings.MONITOR.STATUS.SUCCESS


def test_failure_message_describe_values_expected(make_data):
class TestBaseStatMonitor(BaseStatMonitor):
stat_name = "test_statistic"
threshold_setting = "THRESHOLD_SETTING"
assert_type = "=="

expected_threshold = 100
obtained_value = 90
data = make_data({TestBaseStatMonitor.threshold_setting: expected_threshold})

runner = data.pop("runner")
data["stats"][TestBaseStatMonitor.stat_name] = obtained_value
monitor_suite = MonitorSuite(monitors=[TestBaseStatMonitor])

runner.run(monitor_suite, **data)
    assert runner.result.monitor_results[0].reason == (
        f"Expecting '{TestBaseStatMonitor.stat_name}' to be '{TestBaseStatMonitor.assert_type}' "
        f"to '{expected_threshold}'. Current value: '{obtained_value}'"
    )


def test_fail_if_stat_can_not_be_found(make_data):
class TestBaseStatMonitor(BaseStatMonitor):
stat_name = "test_statistic"
threshold_setting = "THRESHOLD_SETTING"
assert_type = ">="

data = make_data({"THRESHOLD_SETTING": 100})
runner = data.pop("runner")
data["stats"] = {"other_stats": 1}
monitor_suite = MonitorSuite(monitors=[TestBaseStatMonitor])

runner.run(monitor_suite, **data)
assert runner.result.monitor_results[0].status == settings.MONITOR.STATUS.FAILURE


def test_failure_if_stat_can_not_be_found_and_monitor_configured_to_not_ignore_it(
make_data,
):
class TestBaseStatMonitor(BaseStatMonitor):
stat_name = "test_statistic"
threshold_setting = "THRESHOLD_SETTING"
assert_type = ">="
fail_if_stat_missing = True

data = make_data({"THRESHOLD_SETTING": 100})
runner = data.pop("runner")
data["stats"] = {"other_stats": 1}
monitor_suite = MonitorSuite(monitors=[TestBaseStatMonitor])

runner.run(monitor_suite, **data)
assert runner.result.monitor_results[0].status == settings.MONITOR.STATUS.FAILURE


def test_skipped_if_stat_can_not_be_found_and_monitor_configured_to_ignore_it(
make_data,
):
class TestBaseStatMonitor(BaseStatMonitor):
stat_name = "test_statistic"
threshold_setting = "THRESHOLD_SETTING"
assert_type = ">="
fail_if_stat_missing = False

data = make_data({"THRESHOLD_SETTING": 100})
runner = data.pop("runner")
data["stats"] = {"other_stats": 1}
monitor_suite = MonitorSuite(monitors=[TestBaseStatMonitor])

runner.run(monitor_suite, **data)
assert runner.result.monitor_results[0].status == settings.MONITOR.STATUS.SKIPPED