From 9cef8a06b9f639faad1dbb739ba28877aae4b5a2 Mon Sep 17 00:00:00 2001 From: Ordanis Sanchez Date: Tue, 18 Sep 2018 16:37:31 -0400 Subject: [PATCH 01/10] Fix merge errors on HTMLSession --- requests_html.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/requests_html.py b/requests_html.py index 230f079..0392ead 100644 --- a/requests_html.py +++ b/requests_html.py @@ -646,7 +646,7 @@ class HTMLSession(requests.Session): amongst other things. """ - def __init__(self, mock_browser=True, verify=False): + def __init__(self, mock_browser=True, verify=False, browser_args=['--no-sandbox']): super(HTMLSession, self).__init__() # Mock a web browser's user agent. @@ -654,7 +654,6 @@ def __init__(self, mock_browser=True, verify=False): self.headers['User-Agent'] = user_agent() self.hooks = {'response': self._handle_response} - self.ignoreHTTPSErrors = ignoreHTTPSErrors self.__browser_args = browser_args @@ -681,7 +680,7 @@ def request(self, *args, **kwargs) -> HTMLResponse: def browser(self): if not hasattr(self, "_browser"): self.loop = asyncio.get_event_loop() - self._browser = self.loop.run_until_complete(pyppeteer.launch(ignoreHTTPSErrors=self.verify, headless=True, args=['--no-sandbox'])) + self._browser = self.loop.run_until_complete(pyppeteer.launch(ignoreHTTPSErrors=self.verify, headless=True, args=self.__browser_args)) return self._browser def close(self): From 2e460d93c3a493e30aedd0201f6a21a4d3cc8e16 Mon Sep 17 00:00:00 2001 From: Ordanis Sanchez Date: Wed, 21 Mar 2018 11:36:18 -0400 Subject: [PATCH 02/10] Create a base session --- requests_html.py | 51 +++++++++++++++++++----------------------------- 1 file changed, 20 insertions(+), 31 deletions(-) diff --git a/requests_html.py b/requests_html.py index 0392ead..4b04105 100644 --- a/requests_html.py +++ b/requests_html.py @@ -641,46 +641,46 @@ def _get_first_or_list(l, first=False): return l -class HTMLSession(requests.Session): - """A consumable session, for cookie persistence and connection pooling, +class BaseSession(requests.Session): + """ A consumable session, for cookie persistence and connection pooling, amongst other things. """ - def __init__(self, mock_browser=True, verify=False, browser_args=['--no-sandbox']): - super(HTMLSession, self).__init__() + def __init__(self, mock_browser : bool = True, verify : bool = False, + browser_args : list = ['--no-sandbox']): + super().__init__() # Mock a web browser's user agent. if mock_browser: self.headers['User-Agent'] = user_agent() - self.hooks = {'response': self._handle_response} + self.hooks['response'].append(self.response_hook) self.__browser_args = browser_args - @staticmethod - def _handle_response(response, **kwargs) -> HTMLResponse: - """Requests HTTP Response handler. Attaches .html property to - class:`requests.Response ` objects. - """ + def response_hook(self, response, **kwargs) -> HTMLResponse: + """ Change response enconding and replace it by a HTMLResponse. """ if not response.encoding: response.encoding = DEFAULT_ENCODING + return HTMLResponse._from_response(response, self) - return response + @property + async def browser(self): + if not hasattr(self, "_browser"): + self._browser = await pyppeteer.launch(ignoreHTTPSErrors=self.verify, headless=True, args=self.__browser_args) + return self._browser - def request(self, *args, **kwargs) -> HTMLResponse: - """Makes an HTTP Request, with mocked User–Agent headers. - Returns a class:`HTTPResponse `. - """ - # Convert Request object into HTTPRequest object. - r = super(HTMLSession, self).request(*args, **kwargs) - return HTMLResponse._from_response(r, self) +class HTMLSession(BaseSession): + + def __init__(self, **kwargs): + super(HTMLSession, self).__init__(**kwargs) @property def browser(self): if not hasattr(self, "_browser"): self.loop = asyncio.get_event_loop() - self._browser = self.loop.run_until_complete(pyppeteer.launch(ignoreHTTPSErrors=self.verify, headless=True, args=self.__browser_args)) + self._browser = self.loop.run_until_complete(super().browser) return self._browser def close(self): @@ -690,7 +690,7 @@ def close(self): super().close() -class AsyncHTMLSession(requests.Session): +class AsyncHTMLSession(BaseSession): """ An async consumable session. """ def __init__(self, loop=None, workers=None, @@ -703,20 +703,9 @@ def __init__(self, loop=None, workers=None, machine, multiplied by 5. """ super().__init__(*args, **kwargs) - # Mock a web browser's user agent. - if mock_browser: - self.headers['User-Agent'] = user_agent() - - self.hooks['response'].append(self.response_hook) - self.loop = loop or asyncio.get_event_loop() self.thread_pool = ThreadPoolExecutor(max_workers=workers) - def response_hook(self, response, **kwargs) -> HTMLResponse: - """ Change response enconding and replace it by a HTMLResponse. """ - response.encoding = DEFAULT_ENCODING - return HTMLResponse._from_response(response, self) - def request(self, *args, **kwargs): """ Partial original request func and run it in a thread. """ func = partial(super().request, *args, **kwargs) From dd05a02de75439f8c947ec15d456e4ef2a4ad948 Mon Sep 17 00:00:00 2001 From: Ordanis Sanchez Date: Wed, 21 Mar 2018 12:05:12 -0400 Subject: [PATCH 03/10] Add HTMLSession.browser runtime exception, AsyncSession an async close method --- requests_html.py | 8 ++++++++ tests/test_requests_html.py | 17 +++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/requests_html.py b/requests_html.py index 4b04105..5bb1768 100644 --- a/requests_html.py +++ b/requests_html.py @@ -680,6 +680,8 @@ def __init__(self, **kwargs): def browser(self): if not hasattr(self, "_browser"): self.loop = asyncio.get_event_loop() + if self.loop.is_running(): + raise RuntimeError("Cannot use HTMLSession within an existing event loop. Use AsyncHTMLSession instead.") self._browser = self.loop.run_until_complete(super().browser) return self._browser @@ -710,3 +712,9 @@ def request(self, *args, **kwargs): """ Partial original request func and run it in a thread. """ func = partial(super().request, *args, **kwargs) return self.loop.run_in_executor(self.thread_pool, func) + + async def close(self): + """ If a browser was created close it first. """ + if hasattr(self, "_browser"): + await self._browser.close() + super().close() diff --git a/tests/test_requests_html.py b/tests/test_requests_html.py index 129bd5e..5d583dd 100644 --- a/tests/test_requests_html.py +++ b/tests/test_requests_html.py @@ -247,6 +247,15 @@ def test_browser_session(): # assert count_chromium_process() == 0 +@pytest.mark.ok +@pytest.mark.asyncio +async def test_browser_session_fail(): + """ HTMLSession.browser should not be call within an existing event loop> """ + session = HTMLSession() + with pytest.raises(RuntimeError): + session.browser + + @pytest.mark.ok def test_browser_process(): for _ in range(3): @@ -256,6 +265,14 @@ def test_browser_process(): assert r.html.page == None +@pytest.mark.ok +@pytest.mark.asyncio +async def test_async_browser_session(): + session = AsyncHTMLSession() + browser = await session.browser + assert isinstance(browser, Browser) + await session.close() + if __name__ == '__main__': test_containing() From c12d7c6acaab20648f82eabb00f15f1c2273e61e Mon Sep 17 00:00:00 2001 From: Ordanis Sanchez Date: Wed, 21 Mar 2018 15:39:56 -0400 Subject: [PATCH 04/10] Add async iterator to HTML class --- requests_html.py | 11 +++++++++++ tests/test_internet.py | 18 +++++++++++++++++- 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/requests_html.py b/requests_html.py index 5bb1768..e2a66f1 100644 --- a/requests_html.py +++ b/requests_html.py @@ -485,6 +485,17 @@ def __iter__(self): def __next__(self): return self._next(fetch=True, next_symbol=self.next_symbol).html + def __aiter__(self): + return self + + async def __anext__(self): + while True: + url = self._next(fetch=False, next_symbol=self.next_symbol) + if not url: + break + response = await self.session.get(url) + return response.html + def add_next_symbol(self, next_symbol): self.next_symbol.append(next_symbol) diff --git a/tests/test_internet.py b/tests/test_internet.py index 19527bb..8ab99b6 100644 --- a/tests/test_internet.py +++ b/tests/test_internet.py @@ -1,7 +1,9 @@ -from requests_html import HTMLSession +import pytest +from requests_html import HTMLSession, AsyncHTMLSession session = HTMLSession() + def test_pagination(): pages = ( 'https://xkcd.com/1957/', @@ -14,3 +16,17 @@ def test_pagination(): r = session.get(page) assert next(r.html) + +@pytest.mark.asyncio +async def test_pagination(event_loop): + asession = AsyncHTMLSession() + pages = ( + 'https://xkcd.com/1957/', + 'https://reddit.com/', + 'https://smile.amazon.com/', + 'https://theverge.com/archives' + ) + + for page in pages: + r = await asession.get(page) + assert await r.html.__anext__() From 85e77d134a7595d9a2caf64a5fa166df11de458e Mon Sep 17 00:00:00 2001 From: Ordanis Sanchez Date: Wed, 21 Mar 2018 18:34:18 -0400 Subject: [PATCH 05/10] Add arender method to HTML --- requests_html.py | 103 +++++++++++++++++++++++------------- tests/test_requests_html.py | 22 ++++++++ 2 files changed, 88 insertions(+), 37 deletions(-) diff --git a/requests_html.py b/requests_html.py index e2a66f1..dea4664 100644 --- a/requests_html.py +++ b/requests_html.py @@ -499,6 +499,45 @@ async def __anext__(self): def add_next_symbol(self, next_symbol): self.next_symbol.append(next_symbol) + async def _async_render(self, *, url: str, script: str = None, scrolldown, sleep: int, wait: float, reload, content: Optional[str], timeout: Union[float, int], keep_page: bool): + """ Handle page creation and js rendering. Internal use for render/arender methods. """ + try: + page = await self.browser.newPage() + + # Wait before rendering the page, to prevent timeouts. + await asyncio.sleep(wait) + + # Load the given page (GET request, obviously.) + if reload: + await page.goto(url, options={'timeout': int(timeout * 1000)}) + else: + await page.goto(f'data:text/html,{self.html}', options={'timeout': int(timeout * 1000)}) + + result = None + if script: + result = await page.evaluate(script) + + if scrolldown: + for _ in range(scrolldown): + await page._keyboard.down('PageDown') + await asyncio.sleep(sleep) + else: + await asyncio.sleep(sleep) + + if scrolldown: + await page._keyboard.up('PageDown') + + # Return the content of the page, JavaScript evaluated. + content = await page.content() + if not keep_page: + await page.close() + page = None + return content, result, page + except TimeoutError: + await page.close() + page = None + return None + def render(self, retries: int = 8, script: str = None, wait: float = 0.2, scrolldown=False, sleep: int = 0, reload: bool = True, timeout: Union[float, int] = 8.0, keep_page: bool = False): """Reloads the response in Chromium, and replaces HTML content with an updated version, with JavaScript executed. @@ -543,45 +582,35 @@ def render(self, retries: int = 8, script: str = None, wait: float = 0.2, scroll Warning: the first time you run this method, it will download Chromium into your home directory (``~/.pyppeteer``). """ - async def _async_render(*, url: str, script: str = None, scrolldown, sleep: int, wait: float, reload, content: Optional[str], timeout: Union[float, int], keep_page: bool): - try: - page = await self.session.browser.newPage() - - # Wait before rendering the page, to prevent timeouts. - await asyncio.sleep(wait) - - # Load the given page (GET request, obviously.) - if reload: - await page.goto(url, options={'timeout': int(timeout * 1000)}) - else: - await page.goto(f'data:text/html,{self.html}', options={'timeout': int(timeout * 1000)}) - - result = None - if script: - result = await page.evaluate(script) - - if scrolldown: - for _ in range(scrolldown): - await page._keyboard.down('PageDown') - await asyncio.sleep(sleep) - else: - await asyncio.sleep(sleep) - if scrolldown: - await page._keyboard.up('PageDown') + self.browser = self.session.browser # Automatycally create a event loop and browser + content = None - # Return the content of the page, JavaScript evaluated. - content = await page.content() - if not keep_page: - await page.close() - page = None - return content, result, page - except TimeoutError: - await page.close() - page = None - return None + # Automatically set Reload to False, if example URL is being used. + if self.url == DEFAULT_URL: + reload = False + + for i in range(retries): + if not content: + try: + + content, result, page = self.session.loop.run_until_complete(self._async_render(url=self.url, script=script, sleep=sleep, wait=wait, content=self.html, reload=reload, scrolldown=scrolldown, timeout=timeout, keep_page=keep_page)) + except TypeError: + pass + else: + break + + if not content: + raise MaxRetries("Unable to render the page. Try increasing timeout") + + html = HTML(url=self.url, html=content.encode(DEFAULT_ENCODING), default_encoding=DEFAULT_ENCODING) + self.__dict__.update(html.__dict__) + self.page = page + return result - self.session.browser # Automatically create a event loop and browser + async def arender(self, retries: int = 8, script: str = None, wait: float = 0.2, scrolldown=False, sleep: int = 0, reload: bool = True, timeout: Union[float, int] = 8.0, keep_page: bool = False): + """ Async version of render. Takes same parameters. """ + self.browser = await self.session.browser content = None # Automatically set Reload to False, if example URL is being used. @@ -592,7 +621,7 @@ async def _async_render(*, url: str, script: str = None, scrolldown, sleep: int, if not content: try: - content, result, page = self.session.loop.run_until_complete(_async_render(url=self.url, script=script, sleep=sleep, wait=wait, content=self.html, reload=reload, scrolldown=scrolldown, timeout=timeout, keep_page=keep_page)) + content, result, page = await self._async_render(url=self.url, script=script, sleep=sleep, wait=wait, content=self.html, reload=reload, scrolldown=scrolldown, timeout=timeout, keep_page=keep_page) except TypeError: pass else: diff --git a/tests/test_requests_html.py b/tests/test_requests_html.py index 5d583dd..4a40c2d 100644 --- a/tests/test_requests_html.py +++ b/tests/test_requests_html.py @@ -193,6 +193,28 @@ def test_render(): assert len(about.links) == 6 +@pytest.mark.render +@pytest.mark.asyncio +async def test_async_render(async_get): + r = await async_get() + script = """ + () => { + return { + width: document.documentElement.clientWidth, + height: document.documentElement.clientHeight, + deviceScaleFactor: window.devicePixelRatio, + } + } + """ + val = await r.html.arender(script=script) + for value in ('width', 'height', 'deviceScaleFactor'): + assert value in val + + about = r.html.find('#about', first=True) + assert len(about.links) == 6 + await r.html.browser.close() + + @pytest.mark.render def test_bare_render(): doc = """""" From fc1fabd8dc09d22b0b37f1262b79436135ccaf33 Mon Sep 17 00:00:00 2001 From: Ordanis Sanchez Date: Wed, 21 Mar 2018 18:46:57 -0400 Subject: [PATCH 06/10] Fix HTML class to use async iter and render on bare mode --- requests_html.py | 5 ++-- tests/test_requests_html.py | 46 +++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 2 deletions(-) diff --git a/requests_html.py b/requests_html.py index dea4664..0010093 100644 --- a/requests_html.py +++ b/requests_html.py @@ -410,7 +410,7 @@ class HTML(BaseParser): :param default_encoding: Which encoding to default to. """ - def __init__(self, *, session: Union['HTMLSession', 'AsyncHTMLSession'] = None, url: str = DEFAULT_URL, html: _HTML, default_encoding: str = DEFAULT_ENCODING) -> None: + def __init__(self, *, session: Union['HTMLSession', 'AsyncHTMLSession'] = None, url: str = DEFAULT_URL, html: _HTML, default_encoding: str = DEFAULT_ENCODING, async_: bool = False) -> None: # Convert incoming unicode HTML into bytes. if isinstance(html, str): @@ -423,7 +423,7 @@ def __init__(self, *, session: Union['HTMLSession', 'AsyncHTMLSession'] = None, url=url, default_encoding=default_encoding ) - self.session = session or HTMLSession() + self.session = session or async_ and AsyncHTMLSession() or HTMLSession() self.page = None self.next_symbol = DEFAULT_NEXT_SYMBOL @@ -610,6 +610,7 @@ def render(self, retries: int = 8, script: str = None, wait: float = 0.2, scroll async def arender(self, retries: int = 8, script: str = None, wait: float = 0.2, scrolldown=False, sleep: int = 0, reload: bool = True, timeout: Union[float, int] = 8.0, keep_page: bool = False): """ Async version of render. Takes same parameters. """ + self.browser = await self.session.browser content = None diff --git a/tests/test_requests_html.py b/tests/test_requests_html.py index 4a40c2d..0c9fe0f 100644 --- a/tests/test_requests_html.py +++ b/tests/test_requests_html.py @@ -236,6 +236,29 @@ def test_bare_render(): assert 'https://httpbin.org' in html.links +@pytest.mark.render +@pytest.mark.asyncio +async def test_bare_arender(): + doc = """""" + html = HTML(html=doc, async_=True) + script = """ + () => { + return { + width: document.documentElement.clientWidth, + height: document.documentElement.clientHeight, + deviceScaleFactor: window.devicePixelRatio, + } + } + """ + val = await html.arender(script=script, reload=False) + for value in ('width', 'height', 'deviceScaleFactor'): + assert value in val + + assert html.find('html') + assert 'https://httpbin.org' in html.links + await html.browser.close() + + @pytest.mark.render def test_bare_js_eval(): doc = """ @@ -257,6 +280,29 @@ def test_bare_js_eval(): assert html.find('#replace', first=True).text == 'yolo' +@pytest.mark.render +@pytest.mark.asyncio +async def test_bare_js_async_eval(): + doc = """ + + + +
This gets replaced
+ + + + + """ + + html = HTML(html=doc, async_=True) + await html.arender() + + assert html.find('#replace', first=True).text == 'yolo' + await html.browser.close() + + @pytest.mark.ok def test_browser_session(): """ Test browser instaces is created and properly close when session is closed. From 09c7b683cc5ecbb3a1705c9fee0c83fce7bfd8b7 Mon Sep 17 00:00:00 2001 From: Ordanis Sanchez Date: Wed, 21 Mar 2018 19:06:37 -0400 Subject: [PATCH 07/10] Fix r.html.next() for next url --- requests_html.py | 8 ++++---- tests/test_internet.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/requests_html.py b/requests_html.py index 0010093..61893ae 100644 --- a/requests_html.py +++ b/requests_html.py @@ -430,7 +430,7 @@ def __init__(self, *, session: Union['HTMLSession', 'AsyncHTMLSession'] = None, def __repr__(self) -> str: return f"" - def _next(self, fetch: bool = False, next_symbol: _NextSymbol = DEFAULT_NEXT_SYMBOL) -> _Next: + def next(self, fetch: bool = False, next_symbol: _NextSymbol = DEFAULT_NEXT_SYMBOL) -> _Next: """Attempts to find the next page, if there is one. If ``fetch`` is ``True`` (default), returns :class:`HTML ` object of next page. If ``fetch`` is ``False``, simply returns the next URL. @@ -478,19 +478,19 @@ def __iter__(self): while True: yield next try: - next = next._next(fetch=True, next_symbol=self.next_symbol).html + next = next.next(fetch=True, next_symbol=self.next_symbol).html except AttributeError: break def __next__(self): - return self._next(fetch=True, next_symbol=self.next_symbol).html + return self.next(fetch=True, next_symbol=self.next_symbol).html def __aiter__(self): return self async def __anext__(self): while True: - url = self._next(fetch=False, next_symbol=self.next_symbol) + url = self.next(fetch=False, next_symbol=self.next_symbol) if not url: break response = await self.session.get(url) diff --git a/tests/test_internet.py b/tests/test_internet.py index 8ab99b6..9bdc51a 100644 --- a/tests/test_internet.py +++ b/tests/test_internet.py @@ -18,7 +18,7 @@ def test_pagination(): @pytest.mark.asyncio -async def test_pagination(event_loop): +async def test_async_pagination(event_loop): asession = AsyncHTMLSession() pages = ( 'https://xkcd.com/1957/', From 69dd1cc77fbc0d5ba1383a86f0d5abdb39eb2c70 Mon Sep 17 00:00:00 2001 From: Ordanis Sanchez Date: Wed, 21 Mar 2018 19:39:51 -0400 Subject: [PATCH 08/10] Add asyncsession.run method --- requests_html.py | 10 ++++++++++ tests/test_internet.py | 20 +++++++++++++++++++- 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/requests_html.py b/requests_html.py index 61893ae..b301cd6 100644 --- a/requests_html.py +++ b/requests_html.py @@ -759,3 +759,13 @@ async def close(self): if hasattr(self, "_browser"): await self._browser.close() super().close() + + def run(self, *coros): + """ Pass in all the coroutines you want to run, it will wrap each one + in a task, run it and wait for the result. Retuen a list with all + results, this are returned in the same order coros are passed in. """ + tasks = [ + asyncio.ensure_future(coro()) for coro in coros + ] + done, _ = self.loop.run_until_complete(asyncio.wait(tasks)) + return [t.result() for t in done] diff --git a/tests/test_internet.py b/tests/test_internet.py index 9bdc51a..8cf6fb9 100644 --- a/tests/test_internet.py +++ b/tests/test_internet.py @@ -1,5 +1,5 @@ import pytest -from requests_html import HTMLSession, AsyncHTMLSession +from requests_html import HTMLSession, AsyncHTMLSession, HTMLResponse session = HTMLSession() @@ -30,3 +30,21 @@ async def test_async_pagination(event_loop): for page in pages: r = await asession.get(page) assert await r.html.__anext__() + + +def test_async_run(): + asession = AsyncHTMLSession() + + async def test1(): + return await asession.get('https://xkcd.com/1957/') + + async def test2(): + return await asession.get('https://reddit.com/') + + async def test3(): + return await asession.get('https://smile.amazon.com/') + + r = asession.run(test1, test2, test3) + + assert len(r) == 3 + assert isinstance(r[0], HTMLResponse) From 6ef9c3478a0228b7251609e86b5ef59fea59dda2 Mon Sep 17 00:00:00 2001 From: Ordanis Sanchez Date: Wed, 21 Mar 2018 19:52:11 -0400 Subject: [PATCH 09/10] Update docs --- README.rst | 30 ++++++++++++++++++++ docs/source/index.rst | 64 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 94 insertions(+) diff --git a/README.rst b/README.rst index 141abba..a55a984 100644 --- a/README.rst +++ b/README.rst @@ -20,6 +20,7 @@ When using this library you automatically get: - Automatic following of redirects. - Connection–pooling and cookie persistence. - The Requests experience you know and love, with magical parsing abilities. +- **Async Support** .. Other nice features include: @@ -38,6 +39,24 @@ Make a GET request to 'python.org', using Requests: >>> r = session.get('https://python.org/') +Try async and get some sites at the same time: + +.. code-block:: pycon + + >>> from requests_html import AsyncHTMLSession + >>> asession = AsyncHTMLSession() + + >>> async def get_pythonorg(): + ... r = await asession.get('https://python.org/') + + >>> async def get_reddit(): + ... r = await asession.get('https://reddit.com/') + + >>> async def get_google(): + ... r = await asession.get('https://google.com/') + + >>> result = session.run(get_pythonorg, get_reddit, get_google) + Grab a list of all links on the page, as–is (anchors excluded): .. code-block:: pycon @@ -140,6 +159,17 @@ Let's grab some text that's rendered by JavaScript: >>> r.html.search('Python 2 will retire in only {months} months!')['months'] '' +Or you can do this async also: + +.. code-block:: pycon + + >>> r = asession.get('http://python-requests.org/') + + >>> await r.html.arender() + + >>> r.html.search('Python 2 will retire in only {months} months!')['months'] + '' + Note, the first time you ever run the ``render()`` method, it will download Chromium into your home directory (e.g. ``~/.pyppeteer/``). This only happens once. diff --git a/docs/source/index.rst b/docs/source/index.rst index 1549718..bcfcef5 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -28,6 +28,7 @@ When using this library you automatically get: - Automatic following of redirects. - Connection–pooling and cookie persistence. - The Requests experience you know and love, with magical parsing abilities. +- **Async Support** .. Other nice features include: @@ -57,6 +58,33 @@ Make a GET request to `python.org `_, using `Requests >> r = session.get('https://python.org/') +Or want to try our async session: + +.. code-block:: pycon + + >>> from requests_html import AsyncHTMLSession + >>> asession = AsyncHTMLSession() + + >>> r = await asession.get('https://python.org/') + +But async is fun when fetching some sites at the same time: + +.. code-block:: pycon + + >>> from requests_html import AsyncHTMLSession + >>> asession = AsyncHTMLSession() + + >>> async def get_pythonorg(): + ... r = await asession.get('https://python.org/') + + >>> async def get_reddit(): + ... r = await asession.get('https://reddit.com/') + + >>> async def get_google(): + ... r = await asession.get('https://google.com/') + + >>> session.run(get_pythonorg, get_reddit, get_google) + Grab a list of all links on the page, as–is (anchors excluded): .. code-block:: pycon @@ -179,6 +207,17 @@ Let's grab some text that's rendered by JavaScript: >>> r.html.search('Python 2 will retire in only {months} months!')['months'] '' +Or you can do this async also: + +.. code-block:: pycon + + >>> r = asession.get('http://python-requests.org/') + + >>> await r.html.arender() + + >>> r.html.search('Python 2 will retire in only {months} months!')['months'] + '' + Note, the first time you ever run the ``render()`` method, it will download Chromium into your home directory (e.g. ``~/.pyppeteer/``). This only happens once. You may also need to install a few `Linux packages `_ to get pyppeteer working. @@ -202,6 +241,17 @@ There's also intelligent pagination support (always improving): … +For `async` pagination use the new `async for`: + +.. code-block:: pycon + + >>> r = await asession.get('https://reddit.com') + >>> async for html in r.html: + ... print(html) + + + … + You can also just request the next URL easily: .. code-block:: pycon @@ -246,6 +296,16 @@ You can also render JavaScript pages without Requests: >>> print(html.html)
+For using `arender` just pass `async_=True` to HTML. + +.. code-block:: pycon + + # ^^ using above script ^^ + >>> html = HTML(html=doc, async_=True) + >>> val = await html.arender(script=script, reload=False) + >>> print(val) + {'width': 800, 'height': 600, 'deviceScaleFactor': 1} + API Documentation ================= @@ -278,6 +338,10 @@ These sessions are for making HTTP requests: :inherited-members: +.. autoclass:: AsyncHTMLSession + :inherited-members: + + Indices and tables ================== From 334821514bbb140437e2059043b4db4783eee385 Mon Sep 17 00:00:00 2001 From: Ordanis Sanchez Date: Tue, 18 Sep 2018 17:10:34 -0400 Subject: [PATCH 10/10] Fix tests --- tests/test_internet.py | 2 -- tests/test_requests_html.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/test_internet.py b/tests/test_internet.py index 8cf6fb9..36ca383 100644 --- a/tests/test_internet.py +++ b/tests/test_internet.py @@ -7,7 +7,6 @@ def test_pagination(): pages = ( 'https://xkcd.com/1957/', - 'https://reddit.com/', 'https://smile.amazon.com/', 'https://theverge.com/archives' ) @@ -22,7 +21,6 @@ async def test_async_pagination(event_loop): asession = AsyncHTMLSession() pages = ( 'https://xkcd.com/1957/', - 'https://reddit.com/', 'https://smile.amazon.com/', 'https://theverge.com/archives' ) diff --git a/tests/test_requests_html.py b/tests/test_requests_html.py index 0c9fe0f..05c2bf7 100644 --- a/tests/test_requests_html.py +++ b/tests/test_requests_html.py @@ -79,7 +79,7 @@ def test_containing(): r = get() python = r.html.find(containing='python') - assert len(python) == 191 + assert len(python) == 192 for e in python: assert 'python' in e.full_text.lower()