Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions src/crawlee/browsers/_browser_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,11 @@ class BrowserController(ABC):
def pages(self) -> list[Page]:
"""Return the list of opened pages."""

@property
@abstractmethod
def total_opened_pages(self) -> int:
    """Return how many pages this browser has opened in total since it was launched."""

@property
@abstractmethod
def pages_count(self) -> int:
Expand Down
16 changes: 15 additions & 1 deletion src/crawlee/browsers/_browser_pool.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ def __init__(
browser_inactive_threshold: timedelta = timedelta(seconds=10),
identify_inactive_browsers_interval: timedelta = timedelta(seconds=20),
close_inactive_browsers_interval: timedelta = timedelta(seconds=30),
retire_browser_after_page_count: int = 100,
) -> None:
"""Initialize a new instance.

Expand All @@ -67,7 +68,10 @@ def __init__(
as retired.
close_inactive_browsers_interval: The interval at which the pool checks for inactive browsers
and closes them. The browser is considered as inactive if it has no active pages and has been idle
for the specified period.
for the specified period. The browser is considered as retired if it has no active pages and has total
pages count greater than or equal to `retire_browser_after_page_count`.
retire_browser_after_page_count: The maximum number of processed pages after which the browser is considered
as retired.
"""
self._plugins = plugins or [PlaywrightBrowserPlugin()]
self._operation_timeout = operation_timeout
Expand All @@ -91,6 +95,7 @@ def __init__(
)

self._total_pages_count = 0
self._retire_browser_after_page_count = retire_browser_after_page_count
self._pages = WeakValueDictionary[str, CrawleePage]() # Track the pages in the pool
self._plugins_cycle = itertools.cycle(self._plugins) # Cycle through the plugins

Expand Down Expand Up @@ -305,6 +310,9 @@ async def _get_new_page(
except RuntimeError as exc:
raise RuntimeError('Browser pool is not initialized.') from exc

if browser_controller.total_opened_pages >= self._retire_browser_after_page_count:
self._retire_browser(browser_controller)

crawlee_page = CrawleePage(id=page_id, page=page, browser_type=plugin.browser_type)
self._pages[page_id] = crawlee_page
self._total_pages_count += 1
Expand All @@ -321,6 +329,12 @@ def _pick_browser_with_free_capacity(

return None

def _retire_browser(self, browser: BrowserController) -> None:
    """Retire a browser by moving it from the active list to the inactive list.

    A browser that is not currently in the active list is left untouched.

    Args:
        browser: The browser controller to retire.
    """
    if browser not in self._active_browsers:
        return

    self._active_browsers.remove(browser)
    self._inactive_browsers.append(browser)

async def _launch_new_browser(self, plugin: BrowserPlugin) -> BrowserController:
"""Launch a new browser instance using the specified plugin."""
browser = await plugin.new_browser()
Expand Down
9 changes: 9 additions & 0 deletions src/crawlee/browsers/_playwright_browser_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,11 +74,18 @@ def __init__(
self._pages = list[Page]()
self._last_page_opened_at = datetime.now(timezone.utc)

self._total_opened_pages = 0

@property
@override
def pages(self) -> list[Page]:
    """Return the list of pages tracked by this controller."""
    return self._pages

@property
@override
def total_opened_pages(self) -> int:
    """Return the total number of pages opened by this controller so far."""
    return self._total_opened_pages

@property
@override
def pages_count(self) -> int:
Expand Down Expand Up @@ -160,6 +167,8 @@ async def new_page(
self._pages.append(page)
self._last_page_opened_at = datetime.now(timezone.utc)

self._total_opened_pages += 1

return page

@override
Expand Down
26 changes: 26 additions & 0 deletions tests/unit/browsers/test_browser_pool.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,3 +160,29 @@ async def test_with_plugin_contains_page_options(server_url: URL) -> None:
await test_page.page.goto(str(server_url / 'user-agent'))
assert 'My Best User-Agent' in await test_page.page.content()
await test_page.page.close()


@pytest.mark.parametrize(
    ('retire_after_page_count', 'expect_equal_browsers'),
    [
        pytest.param(2, True, id='Two pages opened in the same browser'),
        pytest.param(1, False, id='Each page opened in a new browser.'),
    ],
)
async def test_browser_pool_retire_browser_after_page_count(
    retire_after_page_count: int, *, expect_equal_browsers: bool
) -> None:
    """Check that the pool stops reusing a browser once its page limit is reached.

    Two pages are opened and closed one after another, and the `page.context`
    objects they report are compared by identity.
    """
    async with BrowserPool(retire_browser_after_page_count=retire_after_page_count) as pool:
        crawlee_page = await pool.new_page()
        first_context = crawlee_page.page.context
        await crawlee_page.page.close()

        crawlee_page = await pool.new_page()
        second_context = crawlee_page.page.context
        await crawlee_page.page.close()

        contexts_match = first_context is second_context
        assert contexts_match is bool(expect_equal_browsers)
Loading