10 changes: 8 additions & 2 deletions src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
@@ -167,9 +167,15 @@ async def extract_links(
         kwargs.setdefault('strategy', 'same-hostname')

         links_iterator: Iterator[str] = iter(self._parser.find_links(parsed_content, selector=selector))
-        links_iterator = to_absolute_url_iterator(
-            context.request.loaded_url or context.request.url, links_iterator, logger=context.log
+
+        # Get base URL from <base> tag if present
+        extracted_base_urls = list(self._parser.find_links(parsed_content, 'base[href]'))
+        base_url: str = (
+            str(extracted_base_urls[0])
+            if extracted_base_urls
+            else context.request.loaded_url or context.request.url
         )
+        links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)

         if robots_txt_file:
             skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
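The change prefers the first `base[href]` match over the request URL when making extracted links absolute. A minimal sketch of that resolution, using only `urllib.parse.urljoin`; the `resolve_links` helper below is illustrative and not part of the crawlee API, and `to_absolute_url_iterator` is assumed to perform an equivalent join internally:

```python
# Illustrative sketch only: resolve raw hrefs against an effective base URL,
# mirroring what the crawler is assumed to do via to_absolute_url_iterator.
from urllib.parse import urljoin


def resolve_links(base_url: str, hrefs: list[str]) -> list[str]:
    """Hypothetical helper: join each href against the effective base URL."""
    return [urljoin(base_url, href) for href in hrefs]


# Without a <base> tag, the loaded page URL acts as the base.
print(resolve_links('https://example.com/base_page', ['page_5', '/page_4']))
# ['https://example.com/page_5', 'https://example.com/page_4']

# With <base href="https://example.com/base_subpath/">, relative links move
# under the base path, while root-relative links are unaffected.
print(resolve_links('https://example.com/base_subpath/', ['page_5', '/page_4']))
# ['https://example.com/base_subpath/page_5', 'https://example.com/page_4']
```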
9 changes: 6 additions & 3 deletions src/crawlee/crawlers/_playwright/_playwright_crawler.py
@@ -369,9 +369,12 @@ async def extract_links(
         links_iterator: Iterator[str] = iter(
             [url for element in elements if (url := await element.get_attribute('href')) is not None]
         )
-        links_iterator = to_absolute_url_iterator(
-            context.request.loaded_url or context.request.url, links_iterator, logger=context.log
-        )
+
+        # Get base URL from <base> tag if present
+        extracted_base_url = await context.page.evaluate('document.baseURI')
+        base_url: str = extracted_base_url or context.request.loaded_url or context.request.url
+
+        links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)

         if robots_txt_file:
             skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
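In the Playwright crawler the browser already computes the effective base, so `document.baseURI` is enough: it reflects the first `<base href>` when one is present and otherwise falls back to the document URL. A standalone sketch of that behaviour, assuming Playwright is installed and browsers are provisioned via `playwright install`; the inline HTML is purely illustrative:

```python
# Standalone check of document.baseURI semantics; not part of the crawler code.
import asyncio

from playwright.async_api import async_playwright


async def main() -> None:
    async with async_playwright() as playwright:
        browser = await playwright.chromium.launch()
        page = await browser.new_page()
        # The browser derives baseURI from the first <base href> in the document.
        await page.set_content(
            '<html><head><base href="https://example.com/base_subpath/"></head>'
            '<body><a href="page_5">Link 5</a></body></html>'
        )
        print(await page.evaluate('document.baseURI'))
        # https://example.com/base_subpath/
        await browser.close()


asyncio.run(main())
```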
tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py
@@ -58,6 +58,9 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
str(server_url / 'page_1'),
str(server_url / 'page_2'),
str(server_url / 'page_3'),
str(server_url / 'page_4'),
str(server_url / 'base_page'),
str(server_url / 'base_subpath/page_5'),
}


@@ -131,6 +134,9 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
str(server_url / 'sub_index'),
str(server_url / 'page_1'),
str(server_url / 'page_2'),
str(server_url / 'base_page'),
str(server_url / 'page_4'),
str(server_url / 'base_subpath/page_5'),
}

# # all urls added to `enqueue_links` must have a custom header
@@ -164,6 +170,8 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
assert visited == {
str(server_url / 'start_enqueue'),
str(server_url / 'sub_index'),
str(server_url / 'base_page'),
str(server_url / 'base_subpath/page_5'),
}


@@ -221,6 +229,7 @@ async def skipped_hook(url: str, _reason: SkippedReason) -> None:
str(server_url / 'page_1'),
str(server_url / 'page_2'),
str(server_url / 'page_3'),
str(server_url / 'page_4'),
}


9 changes: 9 additions & 0 deletions tests/unit/crawlers/_parsel/test_parsel_crawler.py
@@ -61,6 +61,9 @@ async def request_handler(context: ParselCrawlingContext) -> None:
str(server_url / 'page_1'),
str(server_url / 'page_2'),
str(server_url / 'page_3'),
str(server_url / 'page_4'),
str(server_url / 'base_page'),
str(server_url / 'base_subpath/page_5'),
}


@@ -151,6 +154,9 @@ async def request_handler(context: ParselCrawlingContext) -> None:
str(server_url / 'sub_index'),
str(server_url / 'page_1'),
str(server_url / 'page_2'),
str(server_url / 'page_4'),
str(server_url / 'base_page'),
str(server_url / 'base_subpath/page_5'),
}

# # all urls added to `enqueue_links` must have a custom header
@@ -258,6 +264,8 @@ async def request_handler(context: ParselCrawlingContext) -> None:
assert visited == {
str(server_url / 'start_enqueue'),
str(server_url / 'sub_index'),
str(server_url / 'base_page'),
str(server_url / 'base_subpath/page_5'),
}


@@ -315,6 +323,7 @@ async def skipped_hook(url: str, _reason: SkippedReason) -> None:
str(server_url / 'page_1'),
str(server_url / 'page_2'),
str(server_url / 'page_3'),
str(server_url / 'page_4'),
}


6 changes: 6 additions & 0 deletions tests/unit/crawlers/_playwright/test_playwright_crawler.py
@@ -99,6 +99,9 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
str(server_url / 'page_1'),
str(server_url / 'page_2'),
str(server_url / 'page_3'),
str(server_url / 'page_4'),
str(server_url / 'base_page'),
str(server_url / 'base_subpath/page_5'),
}


@@ -668,6 +671,8 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
assert visited == {
str(server_url / 'start_enqueue'),
str(server_url / 'sub_index'),
str(server_url / 'base_page'),
str(server_url / 'base_subpath/page_5'),
}


@@ -724,6 +729,7 @@ async def skipped_hook(url: str, _reason: SkippedReason) -> None:
str(server_url / 'page_1'),
str(server_url / 'page_2'),
str(server_url / 'page_3'),
str(server_url / 'page_4'),
}


12 changes: 12 additions & 0 deletions tests/unit/server.py
@@ -15,6 +15,7 @@
from yarl import URL

from tests.unit.server_endpoints import (
BASE_INDEX,
GENERIC_RESPONSE,
HELLO_WORLD,
INCAPSULA,
@@ -105,6 +106,7 @@ async def app(scope: dict[str, Any], receive: Receive, send: Send) -> None:
'page_1': generic_response_endpoint,
'page_2': generic_response_endpoint,
'page_3': generic_response_endpoint,
'base_page': base_index_endpoint,
'problematic_links': problematic_links_endpoint,
'set_cookies': set_cookies,
'set_complex_cookies': set_complex_cookies,
@@ -431,6 +433,16 @@ async def resource_loading_endpoint(_scope: dict[str, Any], _receive: Receive, s
)


async def base_index_endpoint(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:
"""Handle requests for the base index page."""
host = f'http://{get_headers_dict(_scope).get("host", "localhost")}'
content = BASE_INDEX.format(host=host).encode()
await send_html_response(
send,
content,
)


class TestServer(Server):
"""A test HTTP server implementation based on Uvicorn Server."""

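A quick way to see what the new `base_page` route serves, assuming the test server is running and reachable at the URL passed in; `httpx` and the address shown are illustrative assumptions, not part of the test suite:

```python
# Illustration only: fetch the page served by base_index_endpoint and check
# that its <base href> is absolute for whichever host the server runs on.
import asyncio

import httpx


async def fetch_base_page(server_url: str) -> str:
    async with httpx.AsyncClient() as client:
        response = await client.get(f'{server_url}/base_page')
        response.raise_for_status()
        return response.text


html = asyncio.run(fetch_base_page('http://localhost:8080'))  # hypothetical address
assert '<base href="http://localhost:8080/base_subpath/">' in html
```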
12 changes: 12 additions & 0 deletions tests/unit/server_endpoints.py
@@ -24,6 +24,18 @@
<body>
<a href="/page_3">Link 3</a>
<a href="/page_2">Link 4</a>
<a href="/base_page">Base Page</a>
</body></html>"""

BASE_INDEX = """\
<html><head>
<base href="{host}/base_subpath/">
<base href="{host}/sub_index/">
<title>Hello</title>
</head>
<body>
<a href="page_5">Link 5</a>
<a href="/page_4">Link 6</a>
</body></html>"""

INCAPSULA = b"""\
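The fixture deliberately carries two `<base>` elements; per the HTML spec only the first one is honoured, which is why the tests expect `base_subpath/page_5` (from the relative `page_5`) and a plain `page_4` (from the root-relative `/page_4`), and never `sub_index/page_5`. A sketch of that interpretation, using BeautifulSoup directly outside the crawler classes; the host value is an illustrative assumption:

```python
# Illustration of how the fixture is expected to resolve; not part of the tests.
from urllib.parse import urljoin

from bs4 import BeautifulSoup

from tests.unit.server_endpoints import BASE_INDEX

html = BASE_INDEX.format(host='http://localhost:8080')  # hypothetical host
soup = BeautifulSoup(html, 'html.parser')

# Only the first base[href] is honoured, matching browser behaviour.
base_url = soup.select('base[href]')[0]['href']
links = [urljoin(base_url, a['href']) for a in soup.select('a[href]')]

assert base_url == 'http://localhost:8080/base_subpath/'
assert links == [
    'http://localhost:8080/base_subpath/page_5',
    'http://localhost:8080/page_4',
]
```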