Skip to content

Commit e80b729

Browse files
authored
Improve performance for proofread_canonicals() (#258)
1 parent a6a666d commit e80b729

File tree

1 file changed

+36
-15
lines changed

1 file changed

+36
-15
lines changed

Diff for: build_docs.py

+36-15
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from __future__ import annotations
2424

2525
import argparse
26+
import concurrent.futures
2627
import dataclasses
2728
import datetime as dt
2829
import filecmp
@@ -1249,21 +1250,41 @@ def proofread_canonicals(
12491250
/3/whatsnew/3.11.html, which may not exist yet.
12501251
"""
12511252
logging.info("Checking canonical links...")
1252-
canonical_re = re.compile(
1253-
"""<link rel="canonical" href="https://docs.python.org/([^"]*)" />"""
1254-
)
1255-
for file in www_root.glob("**/*.html"):
1256-
html = file.read_text(encoding="UTF-8", errors="surrogateescape")
1257-
canonical = canonical_re.search(html)
1258-
if not canonical:
1259-
continue
1260-
target = canonical.group(1)
1261-
if not (www_root / target).exists():
1262-
logging.info("Removing broken canonical from %s to %s", file, target)
1263-
html = html.replace(canonical.group(0), "")
1264-
file.write_text(html, encoding="UTF-8", errors="surrogateescape")
1265-
if not skip_cache_invalidation:
1266-
purge(http, str(file).replace("/srv/docs.python.org/", ""))
1253+
worker_count = (os.cpu_count() or 1) + 2
1254+
with concurrent.futures.ThreadPoolExecutor(worker_count) as executor:
1255+
futures = {
1256+
executor.submit(_check_canonical_rel, file, www_root)
1257+
for file in www_root.glob("**/*.html")
1258+
}
1259+
paths_to_purge = {
1260+
res.relative_to(www_root) # strip the leading /srv/docs.python.org
1261+
for fut in concurrent.futures.as_completed(futures)
1262+
if (res := fut.result()) is not None
1263+
}
1264+
if not skip_cache_invalidation:
1265+
purge(http, *paths_to_purge)
1266+
1267+
1268+
def _check_canonical_rel(file: Path, www_root: Path) -> Path | None:
    """Remove a canonical link from *file* if its target does not exist.

    Looks for the first ``<link rel="canonical" ...>`` element pointing at
    ``https://docs.python.org/`` in the raw bytes of *file*.  The part of
    the href after that host is resolved relative to *www_root*; when the
    resulting path does not exist, the whole link element is stripped and
    the file is rewritten in place.

    Returns *file* when it was rewritten, otherwise ``None`` (no canonical
    link found, link not in the expected form, or target exists).
    """
    prefix = b'<link rel="canonical" href="https://docs.python.org/'
    suffix = b'" />'
    html = file.read_bytes()

    start = html.find(prefix)
    if start == -1:
        return None
    target_start = start + len(prefix)

    # The href value ends at the first double quote.  Requiring the suffix
    # to begin at that very quote keeps the match inside the link element;
    # searching for the suffix anywhere later could pair the prefix with the
    # closing of an unrelated tag and delete everything in between.
    end = html.find(b'"', target_start)
    if end == -1 or not html.startswith(suffix, end):
        return None

    target = html[target_start:end].decode(errors="surrogateescape")
    if (www_root / target).exists():
        return None

    logging.info("Removing broken canonical from %s to %s", file, target)
    file.write_bytes(html[:start] + html[end + len(suffix) :])
    return file
12671288

12681289

12691290
def purge(http: urllib3.PoolManager, *paths: Path | str) -> None:

0 commit comments

Comments
 (0)