From 36edee6ebf51e3c85c83158d9d900e07c743bc81 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Mon, 16 Jun 2025 13:01:14 +0000 Subject: [PATCH 1/3] don't raise error from check _check_enqueue_strategy --- src/crawlee/crawlers/_basic/_basic_crawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 192d34091f..bead51139d 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -967,7 +967,7 @@ def _check_enqueue_strategy( if strategy == 'same-domain': if origin_url.hostname is None or target_url.hostname is None: - raise ValueError('Both origin and target URLs must have a hostname') + return False origin_domain = self._tld_extractor.extract_str(origin_url.hostname).domain target_domain = self._tld_extractor.extract_str(target_url.hostname).domain From b06444ffc72e6dc643736c917b5636633420212e Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Mon, 16 Jun 2025 13:47:21 +0000 Subject: [PATCH 2/3] update _check_enqueue_strategy --- src/crawlee/crawlers/_basic/_basic_crawler.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index bead51139d..01f4ff1a29 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -962,13 +962,20 @@ def _check_enqueue_strategy( origin_url: ParseResult, ) -> bool: """Check if a URL matches the enqueue_strategy.""" + if strategy == 'all': + return True + + if origin_url.hostname is None or target_url.hostname is None: + self.log.debug( + f'Skipping enqueue: Missing hostname in origin_url = {origin_url.geturl()} or ' + f'target_url = {target_url.geturl()}' + ) + return False + if strategy == 'same-hostname': return target_url.hostname == origin_url.hostname if strategy == 'same-domain': - 
if origin_url.hostname is None or target_url.hostname is None: - return False - origin_domain = self._tld_extractor.extract_str(origin_url.hostname).domain target_domain = self._tld_extractor.extract_str(target_url.hostname).domain return origin_domain == target_domain @@ -980,9 +987,6 @@ def _check_enqueue_strategy( and target_url.port == origin_url.port ) - if strategy == 'all': - return True - assert_never(strategy) def _check_url_patterns( From 1d9b2820e977e60b92889fca48779245ac978a7a Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Mon, 16 Jun 2025 18:03:58 +0000 Subject: [PATCH 3/3] add warning message --- src/crawlee/crawlers/_basic/_basic_crawler.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 01f4ff1a29..dd038af1b9 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -941,12 +941,25 @@ def _enqueue_links_filter_iterator( """Filter requests based on the enqueue strategy and URL patterns.""" limit = kwargs.get('limit') parsed_origin_url = urlparse(origin_url) + strategy = kwargs.get('strategy', 'all') + + if strategy == 'all' and not parsed_origin_url.hostname: + self.log.warning(f'Skipping enqueue: Missing hostname in origin_url = {origin_url}.') + return + + # Emit a `warning` message to the log, only once per call + warning_flag = True for request in request_iterator: target_url = request.url if isinstance(request, Request) else request + parsed_target_url = urlparse(target_url) + + if warning_flag and strategy != 'all' and not parsed_target_url.hostname: + self.log.warning(f'Skipping enqueue url: Missing hostname in target_url = {target_url}.') + warning_flag = False if self._check_enqueue_strategy( - kwargs.get('strategy', 'all'), target_url=urlparse(target_url), origin_url=parsed_origin_url + strategy, target_url=parsed_target_url, 
origin_url=parsed_origin_url ) and self._check_url_patterns(target_url, kwargs.get('include'), kwargs.get('exclude')): yield request