Skip to content

Commit

Permalink
Fixes Scrapinghub jobs.list Method Called with Incorrect Filter Parameter (#446)
Browse files Browse the repository at this point in the history

* Fixes Scrapinghub jobs.list Method Called with Incorrect Filter Parameter

* Return empty list on empty cases

* Add _get_tags_to_filter tests
  • Loading branch information
VMRuiz authored May 7, 2024
1 parent f0d8ea5 commit 562c024
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 6 deletions.
10 changes: 5 additions & 5 deletions spidermon/contrib/scrapy/monitors/monitors.py
Original file line number Diff line number Diff line change
Expand Up @@ -569,7 +569,7 @@ def _get_jobs(self, states, number_of_jobs):
start=start,
state=states,
count=count,
filters=dict(has_tag=tags) if tags else None,
has_tag=tags or None,
)
total_jobs.extend(current_jobs)

Expand All @@ -584,19 +584,19 @@ def _get_jobs(self, states, number_of_jobs):

def _get_tags_to_filter(self):
    """
    Return a list of tags with the intersection of the desired tags to
    filter and the ones from the current job.

    The desired tags are read from the SPIDERMON_JOBS_COMPARISON_TAGS
    setting; the current job tags are read from the "tags" key of the JSON
    document stored in the SHUB_JOB_DATA environment variable.

    Returns:
        A sorted list with the tags present in both sources, or an empty
        list when either source is missing or empty.
    """
    desired_tags = self.crawler.settings.getlist(SPIDERMON_JOBS_COMPARISON_TAGS)
    if not desired_tags:
        # Always return a list (never a dict) so callers get one type.
        return []

    current_tags = json.loads(os.environ.get("SHUB_JOB_DATA", "{}")).get("tags")
    if not current_tags:
        return []

    # sorted() already builds a new list, so no extra list() wrapper is needed.
    return sorted(set(desired_tags) & set(current_tags))

def get_threshold(self):
number_of_jobs = self.crawler.settings.getint(SPIDERMON_JOBS_COMPARISON)
Expand Down
22 changes: 21 additions & 1 deletion tests/contrib/scrapy/monitors/test_jobs_comparison_monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,26 @@ def runTest():
pass


def test_jobs_comparison_monitor_get_tags_to_filter(monkeypatch):
    """Check _get_tags_to_filter for empty settings, empty job tags and a real intersection.

    The crawler settings are supplied through a Mock assigned to
    ``monitor.data``; the job tags are supplied through the SHUB_JOB_DATA
    environment variable.
    """
    mock_data = Mock()

    monitor = TestZyteJobsComparisonMonitor()
    monitor.data = mock_data

    # Start from a known state: an ambient SHUB_JOB_DATA (for example when the
    # suite runs inside a Scrapy Cloud job) would break the "empty" cases below.
    monkeypatch.delenv("SHUB_JOB_DATA", raising=False)

    # Empty SPIDERMON_JOBS_COMPARISON_TAGS
    mock_data.crawler.settings.getlist.return_value = None
    assert monitor._get_tags_to_filter() == []

    # Empty SHUB_JOB_DATA.tags
    mock_data.crawler.settings.getlist.return_value = ["tag1", "tag2"]
    assert monitor._get_tags_to_filter() == []

    # Sorted intersection
    mock_data.crawler.settings.getlist.return_value = ["tag2", "tag1", "tag3"]
    monkeypatch.setenv("SHUB_JOB_DATA", '{"tags": ["tag1", "tag2"]}')
    assert monitor._get_tags_to_filter() == ["tag1", "tag2"]


def test_jobs_comparison_monitor_get_jobs():
mock_client = Mock()
with patch(
Expand Down Expand Up @@ -197,7 +217,7 @@ def test_arguments_passed_to_zyte_client(
state=list(states),
# Count goes from pending number of jobs up to 1000
count=min(number_of_jobs - n * 1000, 1000),
filters={"has_tag": list(tags)},
has_tag=list(tags),
)
# One call to api every 1000 expected jobs
for n in range(0, math.ceil(number_of_jobs / 1000))
Expand Down

0 comments on commit 562c024

Please sign in to comment.