Skip to content

Commit

Permalink
Merge pull request #292 from openzim/ua_not_mandatory
Browse files Browse the repository at this point in the history
Change crawler default settings around userAgent and mobileDevice
  • Loading branch information
rgaudin authored Mar 27, 2024
2 parents 5473269 + 728784d commit f637c3f
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 20 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- New `--version` flag to display Zimit version
- New `--logging` flag to adjust Browsertrix Crawler logging (#273)
- Use new `--scraper-suffix` flag of warc2zim to enhance ZIM "Scraper" metadata (#275)
- New `--noMobileDevice` CLI argument

### Changed

Expand All @@ -21,6 +22,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Adopt Python bootstrap conventions
- Removed handling of redirects by zimit, they are handled by browsertrix crawler and detected properly by warc2zim
- Upgrade to Python 3.12 + upgrade dependencies
- `--userAgent` CLI argument once again overrides the `--userAgentSuffix` and `--adminEmail` values
- `--userAgent` CLI argument is not mandatory anymore
- Upgraded Browsertrix Crawler to 1.0.3

### Fixed

- Fix support for YouTube videos (#291)

## [1.6.3] - 2024-01-18

Expand Down
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM webrecorder/browsertrix-crawler:1.0.0-beta.6
FROM webrecorder/browsertrix-crawler:1.0.3
LABEL org.opencontainers.image.source https://github.com/openzim/zimit

# add deadsnakes ppa for Python 3.12 on Ubuntu Jammy
Expand Down
37 changes: 21 additions & 16 deletions src/zimit/zimit.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,6 @@

from zimit.__about__ import __version__

DEFAULT_USER_AGENT = (
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 "
"(KHTML, like Gecko) Version/17.0 Safari/605.1.15"
)

EXIT_CODE_WARC2ZIM_CHECK_FAILED = 2
EXIT_CODE_CRAWLER_LIMIT_HIT = 11
NORMAL_WARC2ZIM_EXIT_CODE = 100
Expand Down Expand Up @@ -230,13 +225,21 @@ def run(raw_args):
help="Emulate mobile device by name from "
"https://github.com/puppeteer/puppeteer/blob/"
"main/packages/puppeteer-core/src/common/Device.ts",
default="Pixel 2",
)

parser.add_argument(
"--noMobileDevice",
help="Do not emulate a mobile device (use at your own risk, behavior is"
"uncertain)",
action="store_true",
default=False,
)

parser.add_argument(
"--userAgent",
help="Override default user-agent with specified value ; --userAgentSuffix is "
"still applied",
default=DEFAULT_USER_AGENT,
help="Override default user-agent with specified value ; --userAgentSuffix and "
"--adminEmail have no effect when this is set",
)

parser.add_argument(
Expand Down Expand Up @@ -367,7 +370,7 @@ def run(raw_args):
except Exception:
logger.error("Failed to get Browsertrix crawler version")
raise
crawler_version = crawl.stdout
crawler_version = crawl.stdout.strip()
logger.info(f"Browsertrix crawler: version {crawler_version}")

# pass a scraper suffix to warc2zim so that both zimit, warc2zim and crawler
Expand All @@ -384,11 +387,9 @@ def run(raw_args):

url = zimit_args.url

user_agent = zimit_args.userAgent
if zimit_args.userAgentSuffix:
user_agent += f" {zimit_args.userAgentSuffix}"
user_agent_suffix = zimit_args.userAgentSuffix
if zimit_args.adminEmail:
user_agent += f" {zimit_args.adminEmail}"
user_agent_suffix += f" {zimit_args.adminEmail}"

if url:
url = get_cleaned_url(url)
Expand Down Expand Up @@ -443,8 +444,12 @@ def cleanup():
cmd_args.append("--url")
cmd_args.append(url)

cmd_args.append("--userAgent")
cmd_args.append(user_agent)
cmd_args.append("--userAgentSuffix")
cmd_args.append(user_agent_suffix)

if not zimit_args.noMobileDevice:
cmd_args.append("--mobileDevice")
cmd_args.append(zimit_args.mobileDevice)

cmd_args.append("--cwd")
cmd_args.append(str(temp_root_dir))
Expand Down Expand Up @@ -538,7 +543,7 @@ def get_node_cmd_line(args):
"collection",
"allowHashUrls",
"lang",
"mobileDevice",
"userAgent",
"useSitemap",
"behaviors",
"behaviorTimeout",
Expand Down
6 changes: 3 additions & 3 deletions tests/test_dummy.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from zimit.zimit import DEFAULT_USER_AGENT
from zimit.zimit import NORMAL_WARC2ZIM_EXIT_CODE


# dummy test, just to have coverage report done
def test_default_user_agent():
assert DEFAULT_USER_AGENT
def test_something_exists():
assert NORMAL_WARC2ZIM_EXIT_CODE

0 comments on commit f637c3f

Please sign in to comment.