-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 8c2fb55
Showing
4 changed files
with
235 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,147 @@ | ||
_proc/ | ||
_quarto.yml | ||
sidebar.yml | ||
Gemfile.lock | ||
token | ||
_docs/ | ||
conda/ | ||
.last_checked | ||
.gitconfig | ||
*.bak | ||
*.log | ||
*~ | ||
~* | ||
_tmp* | ||
tmp* | ||
tags | ||
|
||
# Byte-compiled / optimized / DLL files | ||
__pycache__/ | ||
*.py[cod] | ||
*$py.class | ||
|
||
# C extensions | ||
*.so | ||
|
||
# Distribution / packaging | ||
.Python | ||
env/ | ||
build/ | ||
develop-eggs/ | ||
dist/ | ||
downloads/ | ||
eggs/ | ||
.eggs/ | ||
lib/ | ||
lib64/ | ||
parts/ | ||
sdist/ | ||
var/ | ||
wheels/ | ||
*.egg-info/ | ||
.installed.cfg | ||
*.egg | ||
|
||
# PyInstaller | ||
# Usually these files are written by a python script from a template | ||
# before PyInstaller builds the exe, so as to inject date/other infos into it. | ||
*.manifest | ||
*.spec | ||
|
||
# Installer logs | ||
pip-log.txt | ||
pip-delete-this-directory.txt | ||
|
||
# Unit test / coverage reports | ||
htmlcov/ | ||
.tox/ | ||
.coverage | ||
.coverage.* | ||
.cache | ||
nosetests.xml | ||
coverage.xml | ||
*.cover | ||
.hypothesis/ | ||
|
||
# Translations | ||
*.mo | ||
*.pot | ||
|
||
# Django stuff: | ||
*.log | ||
local_settings.py | ||
|
||
# Flask stuff: | ||
instance/ | ||
.webassets-cache | ||
|
||
# Scrapy stuff: | ||
.scrapy | ||
|
||
# Sphinx documentation | ||
docs/_build/ | ||
|
||
# PyBuilder | ||
target/ | ||
|
||
# Jupyter Notebook | ||
.ipynb_checkpoints | ||
|
||
# pyenv | ||
.python-version | ||
|
||
# celery beat schedule file | ||
celerybeat-schedule | ||
|
||
# SageMath parsed files | ||
*.sage.py | ||
|
||
# dotenv | ||
.env | ||
|
||
# virtualenv | ||
.venv | ||
venv/ | ||
ENV/ | ||
|
||
# Spyder project settings | ||
.spyderproject | ||
.spyproject | ||
|
||
# Rope project settings | ||
.ropeproject | ||
|
||
# mkdocs documentation | ||
/site | ||
|
||
# mypy | ||
.mypy_cache/ | ||
|
||
.vscode | ||
*.swp | ||
|
||
# osx generated files | ||
.DS_Store | ||
.DS_Store? | ||
.Trashes | ||
ehthumbs.db | ||
Thumbs.db | ||
.idea | ||
|
||
# pytest | ||
.pytest_cache | ||
|
||
# tools/trust-doc-nbs | ||
docs_src/.last_checked | ||
|
||
# symlinks to fastai | ||
docs_src/fastai | ||
tools/fastai | ||
|
||
# link checker | ||
checklink/cookies.txt | ||
|
||
# .gitconfig is now autogenerated | ||
.gitconfig | ||
|
||
_docs |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
# playwrightnb | ||
|
||
This is a Python module that lets you use sync mode Playwright interactively, inside a Jupyter notebook. | ||
|
||
To use it: | ||
|
||
```python | ||
from playwrightnb import get_page | ||
page = get_page() | ||
page.goto('http://example.org'); | ||
# ... do things with `page`... | ||
page.stop() | ||
``` | ||
|
||
Unlike non-Jupyter usage, you don't use a context manager, but instead call `page.stop()` when you're done to close the browser session (or to be more precise -- internally we still use a context manager, but we patch it to not auto-close). `get_page` only returns the page object, not the browser or playwright objects. You should still be able to perform most common web scraping tasks effectively, such as: | ||
|
||
- Navigating to pages using `page.goto()` | ||
- Waiting for elements to load with `page.wait_for_selector()` | ||
- Extracting data from the page using methods like `page.query_selector()`, `page.query_selector_all()`, `page.text_content()`, etc. | ||
- Interacting with elements using `page.click()`, `page.type()`, `page.hover()`, and more | ||
- Evaluating JavaScript with `page.evaluate()` | ||
|
||
However, there are a few capabilities you might miss out on without direct access to the browser or playwright objects: | ||
|
||
1. Launching multiple browser contexts or pages simultaneously | ||
2. Configuring browser-specific settings or permissions | ||
3. Accessing browser-level methods like `browser.new_page()` or `browser.close()` | ||
|
||
For most scraping scenarios, having the page object alone should suffice. But if you need more advanced control, use the following code as a starting point (which is essentially the body of `get_page`): | ||
|
||
```python | ||
with sync_playwright() as p: | ||
browser = p.chromium.launch(*args, **kw) | ||
page = browser.new_context().new_page() | ||
page.stop = p.stop | ||
``` | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
import nest_asyncio,asyncio | ||
from playwright.sync_api import PlaywrightContextManager, sync_playwright | ||
from fastcore.utils import patch | ||
|
||
# Runs once at import time. Presumably needed so the sync API can be used while Jupyter's own event loop is running -- TODO confirm against the nest_asyncio docs | ||
nest_asyncio.apply() | ||
|
||
# Keep the unpatched `__enter__`/`__exit__` reachable before they are replaced below: | ||
# `orig_pcm` is called by the patched `__enter__`, and `_orig_exit` by the `stop` callable it attaches | ||
PlaywrightContextManager.orig_pcm = PlaywrightContextManager.__enter__ | ||
_orig_exit = PlaywrightContextManager.__exit__ | ||
|
||
@patch
def __enter__(self:PlaywrightContextManager):
    "Run the original `__enter__`, then attach a `stop` callable to its result that performs the original `__exit__`"
    def _stop(): _orig_exit(self)
    # While the original `__enter__` runs, every event loop reports that it is not running.
    # NOTE(review): presumably this is what lets the sync API start inside Jupyter's running loop -- confirm
    saved_is_running = asyncio.BaseEventLoop.is_running
    asyncio.BaseEventLoop.is_running = lambda loop: False
    try:
        playwright = self.orig_pcm()
        playwright.stop = _stop
    finally:
        # Always put the real `is_running` back, even if startup failed
        asyncio.BaseEventLoop.is_running = saved_is_running
    return playwright
|
||
@patch
def __exit__(self:PlaywrightContextManager, *args, **kwargs):
    "Do nothing on exit, so leaving a `with` block does not close the session; all arguments are ignored"
    return None
|
||
def get_page(*args, **kw):
    "Get a new page in a Chromium browser, passing any arguments to `launch`; call `page.stop()` when done"
    # `__exit__` is patched to do nothing, so leaving this `with` block keeps the session open
    with sync_playwright() as p:
        try: page = p.chromium.launch(*args, **kw).new_context().new_page()
        except BaseException:
            # Nothing is returned on failure, so the caller would have no `stop` to call: shut down here
            p.stop()
            raise
        # The page is the only object handed back, so it carries the session's `stop`
        page.stop = p.stop
    return page
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
[build-system] | ||
requires = ["setuptools>=42", "wheel"] | ||
build-backend = "setuptools.build_meta" | ||
|
||
[project] | ||
name = "playwrightnb" | ||
version = "0.1.0" | ||
description = "Use sync mode Playwright interactively, inside a Jupyter notebook" | ||
readme = "README.md" | ||
license = { file = "LICENSE" } | ||
authors = [ | ||
{ name = "Jeremy Howard", email = "info@fast.ai" } | ||
] | ||
requires-python = ">=3.7, <4" | ||
dependencies = [ "nest-asyncio", "fastcore", "playwright" ] | ||
classifiers = [ | ||
"Development Status :: 3 - Alpha", | ||
"Intended Audience :: Developers", | ||
"License :: OSI Approved :: Apache Software License" | ||
] | ||
|