|
23 | 23 | from __future__ import annotations
|
24 | 24 |
|
25 | 25 | import argparse
|
| 26 | +import concurrent.futures |
26 | 27 | import dataclasses
|
27 | 28 | import datetime as dt
|
28 | 29 | import filecmp
|
@@ -1249,21 +1250,41 @@ def proofread_canonicals(
|
1249 | 1250 | /3/whatsnew/3.11.html, which may not exist yet.
|
1250 | 1251 | """
|
1251 | 1252 | logging.info("Checking canonical links...")
|
1252 |
| - canonical_re = re.compile( |
1253 |
| - """<link rel="canonical" href="https://docs.python.org/([^"]*)" />""" |
1254 |
| - ) |
1255 |
| - for file in www_root.glob("**/*.html"): |
1256 |
| - html = file.read_text(encoding="UTF-8", errors="surrogateescape") |
1257 |
| - canonical = canonical_re.search(html) |
1258 |
| - if not canonical: |
1259 |
| - continue |
1260 |
| - target = canonical.group(1) |
1261 |
| - if not (www_root / target).exists(): |
1262 |
| - logging.info("Removing broken canonical from %s to %s", file, target) |
1263 |
| - html = html.replace(canonical.group(0), "") |
1264 |
| - file.write_text(html, encoding="UTF-8", errors="surrogateescape") |
1265 |
| - if not skip_cache_invalidation: |
1266 |
| - purge(http, str(file).replace("/srv/docs.python.org/", "")) |
| 1253 | + worker_count = (os.cpu_count() or 1) + 2 |
| 1254 | + with concurrent.futures.ThreadPoolExecutor(worker_count) as executor: |
| 1255 | + futures = { |
| 1256 | + executor.submit(_check_canonical_rel, file, www_root) |
| 1257 | + for file in www_root.glob("**/*.html") |
| 1258 | + } |
| 1259 | + paths_to_purge = { |
| 1260 | + res.relative_to(www_root) # strip the leading /srv/docs.python.org |
| 1261 | + for fut in concurrent.futures.as_completed(futures) |
| 1262 | + if (res := fut.result()) is not None |
| 1263 | + } |
| 1264 | + if not skip_cache_invalidation: |
| 1265 | + purge(http, *paths_to_purge) |
| 1266 | + |
| 1267 | + |
| 1268 | +def _check_canonical_rel(file: Path, www_root: Path): |
| 1269 | + # Check for a canonical relation link in the HTML. |
| 1270 | + # If one exists, ensure that the target exists |
| 1271 | + # or otherwise remove the canonical link element. |
| 1272 | + prefix = b'<link rel="canonical" href="https://docs.python.org/' |
| 1273 | + suffix = b'" />' |
| 1274 | + pfx_len = len(prefix) |
| 1275 | + sfx_len = len(suffix) |
| 1276 | + html = file.read_bytes() |
| 1277 | + try: |
| 1278 | + start = html.index(prefix) |
| 1279 | + end = html.index(suffix, start + pfx_len) |
| 1280 | + except ValueError: |
| 1281 | + return None |
| 1282 | + target = html[start + pfx_len : end].decode(errors="surrogateescape") |
| 1283 | + if (www_root / target).exists(): |
| 1284 | + return None |
| 1285 | + logging.info("Removing broken canonical from %s to %s", file, target) |
| 1286 | + file.write_bytes(html[:start] + html[end + sfx_len :]) |
| 1287 | + return file |
1267 | 1288 |
|
1268 | 1289 |
|
1269 | 1290 | def purge(http: urllib3.PoolManager, *paths: Path | str) -> None:
|
|
0 commit comments