diff --git a/src/crawlee/browsers/_browser_controller.py b/src/crawlee/browsers/_browser_controller.py
index b04fb77128..61e8940d5c 100644
--- a/src/crawlee/browsers/_browser_controller.py
+++ b/src/crawlee/browsers/_browser_controller.py
@@ -26,6 +26,11 @@ class BrowserController(ABC):
     def pages(self) -> list[Page]:
         """Return the list of opened pages."""
 
+    @property
+    @abstractmethod
+    def total_opened_pages(self) -> int:
+        """Return the total number of pages opened since the browser was launched."""
+
     @property
     @abstractmethod
     def pages_count(self) -> int:
diff --git a/src/crawlee/browsers/_browser_pool.py b/src/crawlee/browsers/_browser_pool.py
index 5918064271..7c847a9603 100644
--- a/src/crawlee/browsers/_browser_pool.py
+++ b/src/crawlee/browsers/_browser_pool.py
@@ -53,6 +53,7 @@ def __init__(
         browser_inactive_threshold: timedelta = timedelta(seconds=10),
         identify_inactive_browsers_interval: timedelta = timedelta(seconds=20),
         close_inactive_browsers_interval: timedelta = timedelta(seconds=30),
+        retire_browser_after_page_count: int = 100,
     ) -> None:
         """Initialize a new instance.
 
@@ -67,7 +68,10 @@ def __init__(
                 as retired.
             close_inactive_browsers_interval: The interval at which the pool checks for inactive browsers
                 and closes them. The browser is considered as inactive if it has no active pages and has been idle
-                for the specified period.
+                for the specified period.
+            retire_browser_after_page_count: The number of pages a single browser may open before it is retired.
+                When a browser's total opened-page count reaches this value, the pool moves it from the active
+                browsers to the inactive ones, regardless of whether it still has open pages.
         """
         self._plugins = plugins or [PlaywrightBrowserPlugin()]
         self._operation_timeout = operation_timeout
@@ -91,6 +95,7 @@ def __init__(
         )
 
         self._total_pages_count = 0
+        self._retire_browser_after_page_count = retire_browser_after_page_count
         self._pages = WeakValueDictionary[str, CrawleePage]()  # Track the pages in the pool
         self._plugins_cycle = itertools.cycle(self._plugins)  # Cycle through the plugins
 
@@ -305,6 +310,9 @@ async def _get_new_page(
         except RuntimeError as exc:
             raise RuntimeError('Browser pool is not initialized.') from exc
 
+        if browser_controller.total_opened_pages >= self._retire_browser_after_page_count:
+            self._retire_browser(browser_controller)
+
         crawlee_page = CrawleePage(id=page_id, page=page, browser_type=plugin.browser_type)
         self._pages[page_id] = crawlee_page
         self._total_pages_count += 1
@@ -321,6 +329,12 @@ def _pick_browser_with_free_capacity(
 
         return None
 
+    def _retire_browser(self, browser: BrowserController) -> None:
+        """Retire a browser by moving it to the inactive list."""
+        if browser in self._active_browsers:
+            self._active_browsers.remove(browser)
+            self._inactive_browsers.append(browser)
+
     async def _launch_new_browser(self, plugin: BrowserPlugin) -> BrowserController:
         """Launch a new browser instance using the specified plugin."""
         browser = await plugin.new_browser()
diff --git a/src/crawlee/browsers/_playwright_browser_controller.py b/src/crawlee/browsers/_playwright_browser_controller.py
index 6ba848503b..01650c434a 100644
--- a/src/crawlee/browsers/_playwright_browser_controller.py
+++ b/src/crawlee/browsers/_playwright_browser_controller.py
@@ -74,11 +74,18 @@ def __init__(
         self._pages = list[Page]()
         self._last_page_opened_at = datetime.now(timezone.utc)
 
+        self._total_opened_pages = 0
+
     @property
     @override
     def pages(self) -> list[Page]:
         return self._pages
 
+    @property
+    @override
+    def total_opened_pages(self) -> int:
+        return self._total_opened_pages
+
     @property
     @override
     def pages_count(self) -> int:
@@ -160,6 +167,8 @@ async def new_page(
         self._pages.append(page)
         self._last_page_opened_at = datetime.now(timezone.utc)
 
+        self._total_opened_pages += 1
+
         return page
 
     @override
diff --git a/tests/unit/browsers/test_browser_pool.py b/tests/unit/browsers/test_browser_pool.py
index 2bb19bfe0d..08b3f31aa3 100644
--- a/tests/unit/browsers/test_browser_pool.py
+++ b/tests/unit/browsers/test_browser_pool.py
@@ -160,3 +160,29 @@ async def test_with_plugin_contains_page_options(server_url: URL) -> None:
         await test_page.page.goto(str(server_url / 'user-agent'))
         assert 'My Best User-Agent' in await test_page.page.content()
         await test_page.page.close()
+
+
+@pytest.mark.parametrize(
+    ('retire_after_page_count', 'expect_equal_browsers'),
+    [
+        pytest.param(2, True, id='Two pages opened in the same browser'),
+        pytest.param(1, False, id='Each page opened in a new browser.'),
+    ],
+)
+async def test_browser_pool_retire_browser_after_page_count(
+    retire_after_page_count: int, *, expect_equal_browsers: bool
+) -> None:
+    async with BrowserPool(retire_browser_after_page_count=retire_after_page_count) as browser_pool:
+        test_page = await browser_pool.new_page()
+        first_browser = test_page.page.context
+        await test_page.page.close()
+
+        test_page = await browser_pool.new_page()
+        second_browser = test_page.page.context
+
+        await test_page.page.close()
+
+        if expect_equal_browsers:
+            assert first_browser is second_browser
+        else:
+            assert first_browser is not second_browser