Skip to content

Commit

Permalink
Merge pull request #285 from openzim/crawler_beta5
Browse files Browse the repository at this point in the history
Upgrade browsertrix crawler and remove redirect handling
  • Loading branch information
benoit74 committed Mar 7, 2024
2 parents c2dc8c5 + 5c71674 commit 867d14f
Show file tree
Hide file tree
Showing 4 changed files with 13 additions and 47 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Using `warc2zim2` warc2zim ⚠️ change before releasing!
- Build temporary `zimit2` Docker image for testing ⚠️ remove before releasing!
- Adopt Python bootstrap conventions
- Removed handling of redirects by zimit; they are now handled by browsertrix crawler and detected properly by warc2zim
- Upgrade to Python 3.12 + upgrade dependencies

## [1.6.3] - 2024-01-18

Expand Down
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM webrecorder/browsertrix-crawler:0.12.4
FROM webrecorder/browsertrix-crawler:1.0.0-beta.6
LABEL org.opencontainers.image.source https://github.com/openzim/zimit

# add deadsnakes ppa for Python 3.12 on Ubuntu Jammy
Expand Down
48 changes: 6 additions & 42 deletions src/zimit/zimit.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,6 @@

import inotify
import inotify.adapters
import requests
from tld import get_fld
from warc2zim.main import main as warc2zim
from zimscraperlib.logging import getLogger
from zimscraperlib.uri import rebuild_uri
Expand Down Expand Up @@ -393,7 +391,7 @@ def run(raw_args):
user_agent += f" {zimit_args.adminEmail}"

if url:
url = check_url(url, user_agent, zimit_args.scopeType)
url = get_cleaned_url(url)
warc2zim_args.append("--url")
warc2zim_args.append(url)

Expand Down Expand Up @@ -509,48 +507,14 @@ def cleanup():
return warc2zim(warc2zim_args)


def check_url(url: str, user_agent: str, scope: str | None = None):
def get_cleaned_url(url: str):
parsed_url = urllib.parse.urlparse(url)
try:
with requests.get(
parsed_url.geturl(),
stream=True,
allow_redirects=True,
timeout=(12.2, 27),
headers={"User-Agent": user_agent},
) as resp:
resp.raise_for_status()
except requests.exceptions.RequestException as exc:
logger.info(f"failed to connect to {parsed_url.geturl()}: {exc}")
raise SystemExit(1) from None
actual_url = urllib.parse.urlparse(resp.url)

# remove explicit port in URI when it is the default for the scheme, as browsers do
if actual_url.scheme == "https" and actual_url.port == 443: # noqa: PLR2004
actual_url = rebuild_uri(actual_url, port="")
if actual_url.scheme == "http" and actual_url.port == 80: # noqa: PLR2004
actual_url = rebuild_uri(actual_url, port="")

if actual_url.geturl() != parsed_url.geturl():
if scope in (None, "any"):
return actual_url.geturl()

logger.info(
"[WARN] Your URL ({}) redirects to {} which {} on same "
"first-level domain. Depending on your scopeType ({}), "
"your homepage might be out-of-scope. Please check!".format(
parsed_url.geturl(),
actual_url.geturl(),
(
"is"
if get_fld(parsed_url.geturl()) == get_fld(actual_url.geturl())
else "is not"
),
scope,
)
)

return actual_url.geturl()
if parsed_url.scheme == "https" and parsed_url.port == 443: # noqa: PLR2004
parsed_url = rebuild_uri(parsed_url, port="")
if parsed_url.scheme == "http" and parsed_url.port == 80: # noqa: PLR2004
parsed_url = rebuild_uri(parsed_url, port="")

return parsed_url.geturl()

Expand Down
8 changes: 4 additions & 4 deletions tests-integration/integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,12 +65,12 @@ def test_stats_output():
}
with open("/output/warc2zim.json") as fh:
assert json.loads(fh.read()) == {
"written": 8,
"total": 8,
"written": 7,
"total": 7,
}
with open("/output/stats.json") as fh:
assert json.loads(fh.read()) == {
"done": 8,
"total": 8,
"done": 7,
"total": 7,
"limit": {"max": 0, "hit": False},
}

0 comments on commit 867d14f

Please sign in to comment.