Skip to content

Commit

Permalink
fixes #3
Browse files Browse the repository at this point in the history
  • Loading branch information
jph00 committed Sep 12, 2024
1 parent bcc5f07 commit 98995b3
Show file tree
Hide file tree
Showing 6 changed files with 68 additions and 2 deletions.
42 changes: 42 additions & 0 deletions 00_core.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,13 @@
"source": [
"#| export\n",
"from fastcore.utils import *\n",
"from fastcore.meta import delegates\n",
"import uuid, warnings\n",
"\n",
"from playwright.async_api import async_playwright, TimeoutError as PTimeoutError\n",
"from playwright_stealth import stealth_async\n",
"from anyio import from_thread\n",
"from httpx import get\n",
"\n",
"from bs4 import BeautifulSoup, GuessedAtParserWarning\n",
"from html2text import HTML2Text"
Expand Down Expand Up @@ -318,6 +320,46 @@
" return h2md(content)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9b2c101b",
"metadata": {},
"outputs": [],
"source": [
"#| export\n",
"@delegates(get)\n",
"def get2md(url, sel=None, **kwargs):\n",
" \"Read `url` with `httpx.get`\"\n",
" warnings.filterwarnings(\"ignore\", category=GuessedAtParserWarning)\n",
" cts = get(url, **kwargs)\n",
" soup = BeautifulSoup(cts)\n",
" content = soup.select_one(sel)\n",
" return h2md(content)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a3680fcd",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"# Use the Public API\n",
"\n",
"The Railway public API is built with GraphQL and is the same API that powers the Railway dashboard\n"
]
}
],
"source": [
"url = 'https://docs.railway.app/guides/public-api'\n",
"md = get2md(url, \".docs-content\")\n",
"print(md[:120])"
]
},
{
"cell_type": "markdown",
"id": "474e14b4",
Expand Down
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,3 +71,7 @@ print(md[856:1215])
## Application Command Object

###### Application Command Naming

If you don’t need JS-rendering or other fanciness, use
[`get2md`](https://AnswerDotAI.github.io/playwrightnb/core.html#get2md),
which fetches the page with `httpx.get` rather than playwright.
7 changes: 7 additions & 0 deletions index.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,13 @@
"print(md[856:1215])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If you don't need JS-rendering or other fanciness, use `get2md` instead, which uses `httpx.get` instead of playwright."
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down
1 change: 1 addition & 0 deletions playwrightnb/_modidx.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
'git_url': 'https://github.com/AnswerDotAI/playwrightnb',
'lib_path': 'playwrightnb'},
'syms': { 'playwrightnb.core': { 'playwrightnb.core.frames_ready': ('core.html#frames_ready', 'playwrightnb/core.py'),
'playwrightnb.core.get2md': ('core.html#get2md', 'playwrightnb/core.py'),
'playwrightnb.core.get_full_content': ('core.html#get_full_content', 'playwrightnb/core.py'),
'playwrightnb.core.get_page': ('core.html#get_page', 'playwrightnb/core.py'),
'playwrightnb.core.h2md': ('core.html#h2md', 'playwrightnb/core.py'),
Expand Down
14 changes: 13 additions & 1 deletion playwrightnb/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,17 @@

# %% auto 0
__all__ = ['get_page', 'page_ready', 'frames_ready', 'wait_page', 'get_full_content', 'read_page_async', 'read_page', 'h2md',
'url2md_async', 'url2md']
'url2md_async', 'url2md', 'get2md']

# %% ../00_core.ipynb
from fastcore.utils import *
from fastcore.meta import delegates
import uuid, warnings

from playwright.async_api import async_playwright, TimeoutError as PTimeoutError
from playwright_stealth import stealth_async
from anyio import from_thread
from httpx import get

from bs4 import BeautifulSoup, GuessedAtParserWarning
from html2text import HTML2Text
Expand Down Expand Up @@ -112,3 +114,13 @@ def url2md(url, sel=None, pause=50, timeout=5000, page=None):
soup = BeautifulSoup(cts)
content = soup.select_one(sel)
return h2md(content)

# %% ../00_core.ipynb
@delegates(get)
def get2md(url, sel=None, **kwargs):
    "Read `url` with `httpx.get` and convert the page (or only the first element matching CSS selector `sel`) to markdown"
    # BeautifulSoup is called without an explicit parser, so silence its "guessed parser" warning.
    warnings.filterwarnings("ignore", category=GuessedAtParserWarning)
    # Extra keyword arguments are forwarded unchanged to `httpx.get`.
    resp = get(url, **kwargs)
    # Hand the raw response bytes to BeautifulSoup explicitly rather than relying on it
    # duck-typing the response object as a file.
    soup = BeautifulSoup(resp.content)
    # `sel` defaults to None, which is not a usable CSS selector; in that case
    # convert the whole document instead of calling `select_one(None)`.
    content = soup if sel is None else soup.select_one(sel)
    return h2md(content)
2 changes: 1 addition & 1 deletion settings.ini
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ version = 0.2.2
min_python = 3.8
license = apache2
black_formatting = False
requirements = fastcore playwright playwright-stealth anyio html2text beautifulsoup4
requirements = fastcore playwright playwright-stealth anyio html2text beautifulsoup4 httpx
conda_user = fastai
doc_path = _docs
lib_path = playwrightnb
Expand Down

0 comments on commit 98995b3

Please sign in to comment.