From e2be73196b904a9f3a59d16d8eb9485001160158 Mon Sep 17 00:00:00 2001 From: Ian Nesbitt Date: Tue, 31 Oct 2023 18:23:19 +0000 Subject: [PATCH] adding `lastmod_filter` handling (#41) --- soscan/spiders/jsonldspider.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/soscan/spiders/jsonldspider.py b/soscan/spiders/jsonldspider.py index b8fea667..0239748e 100644 --- a/soscan/spiders/jsonldspider.py +++ b/soscan/spiders/jsonldspider.py @@ -91,6 +91,11 @@ def from_crawler(cls, crawler, *args, **kwargs): for s in _cs: spider.settings.set(s, _cs[s], priority='spider') spider.logger.info(f'Setting override from {mn_settings}: set {s} to {_cs[s]}') + if s in "lastmod_filter": + spider.lastmod_filter = dateparser.parse( + _cs[s], + settings={"RETURN_AS_TIMEZONE_AWARE": True}, + ) return spider def sitemap_filter(self, entries): @@ -123,6 +128,8 @@ def sitemap_filter(self, entries): if self.lastmod_filter is not None and ts is not None: if ts > self.lastmod_filter: yield entry + else: + self.logger.debug(f'lastmod_filter skipping {entry}') else: yield entry