Skip to content

Commit

Permalink
Add #anchor checking to 'linkcheck' builder.
Browse files Browse the repository at this point in the history
This requires us to download the document and parse its HTML.
  • Loading branch information
intgr committed Feb 28, 2012
1 parent 64593b9 commit e0e9d2a
Show file tree
Hide file tree
Showing 3 changed files with 73 additions and 9 deletions.
8 changes: 8 additions & 0 deletions doc/config.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1318,6 +1318,14 @@ Options for the linkcheck builder

.. versionadded:: 1.1

.. confval:: linkcheck_anchors

If true, check that the ``#anchor`` part of each link actually exists in the
target document.  Since this requires downloading and parsing the whole
document, the check is considerably slower when enabled. Default is ``True``.

.. versionadded:: 1.2


.. rubric:: Footnotes

Expand Down
73 changes: 64 additions & 9 deletions sphinx/builders/linkcheck.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
import socket
import threading
from os import path
from urllib2 import build_opener, Request
from urllib2 import build_opener, unquote, Request
from HTMLParser import HTMLParser, HTMLParseError

from docutils import nodes

Expand All @@ -33,6 +34,42 @@ def get_method(self):
return 'HEAD'


class AnchorCheckParser(HTMLParser):
    """Specialized HTML parser that looks for a specific anchor.

    After data has been fed in, ``self.found`` reports whether any start
    tag carried an ``id`` or ``name`` attribute equal to *search_anchor*.
    """

    def __init__(self, search_anchor):
        HTMLParser.__init__(self)
        self.search_anchor = search_anchor
        self.found = False

    def handle_starttag(self, tag, attrs):
        # An anchor may be declared either as id="..." (any tag) or as
        # name="..." (classic <a name=...>); accept both spellings.
        wanted = self.search_anchor
        for attr, value in attrs:
            if value == wanted and attr in ('id', 'name'):
                self.found = True

def check_anchor(f, hash):
    """Reads HTML data from a filelike object 'f' searching for anchor 'hash'.
    Returns True if anchor was found, False otherwise"""
    parser = AnchorCheckParser(hash)
    try:
        # Pull the document in 8 kB pieces and stop as soon as the anchor
        # turns up, in the hope of not downloading the whole thing.
        while True:
            data = f.read(8192)
            if not data or parser.found:
                break
            parser.feed(data)
        parser.close()
    except HTMLParseError:
        # HTMLParser copes with most sloppy HTML, but it tends to choke
        # on EOF.  If that happens we are done anyway.
        pass
    return parser.found

class CheckExternalLinksBuilder(Builder):
"""
Checks for broken external links.
Expand Down Expand Up @@ -66,7 +103,7 @@ def check_thread(self):

def check():
# check for various conditions without bothering the network
if len(uri) == 0 or uri[0:7] == 'mailto:' or uri[0:4] == 'ftp:':
if len(uri) == 0 or uri[0] == '#' or uri[0:7] == 'mailto:' or uri[0:4] == 'ftp:':
return 'unchecked', ''
elif not (uri[0:5] == 'http:' or uri[0:6] == 'https:'):
return 'local', ''
Expand All @@ -80,19 +117,39 @@ def check():
if rex.match(uri):
return 'ignored', ''

if '#' in uri:
req_url, hash = uri.split('#', 1)
else:
req_url = uri
hash = None

# need to actually check the URI
try:
f = opener.open(HeadRequest(uri), **kwargs)
f.close()
if hash and self.app.config.linkcheck_anchors:
# Read the whole document and see if #hash exists
f = opener.open(Request(req_url), **kwargs)
found = check_anchor(f, unquote(hash))
f.close()

if not found:
raise Exception("Anchor '%s' not found" % hash)
else:
f = opener.open(HeadRequest(req_url), **kwargs)
f.close()

except Exception, err:
self.broken[uri] = str(err)
return 'broken', str(err)
if f.url.rstrip('/') == uri.rstrip('/'):
if f.url.rstrip('/') == req_url.rstrip('/'):
self.good.add(uri)
return 'working', 'new'
else:
self.redirected[uri] = f.url
return 'redirected', f.url
new_url = f.url
if hash:
new_url += '#' + hash

self.redirected[uri] = new_url
return 'redirected', new_url

while True:
uri, docname, lineno = self.wqueue.get()
Expand Down Expand Up @@ -142,8 +199,6 @@ def write_doc(self, docname, doctree):
if 'refuri' not in node:
continue
uri = node['refuri']
if '#' in uri:
uri = uri.split('#')[0]
lineno = None
while lineno is None:
node = node.parent
Expand Down
1 change: 1 addition & 0 deletions sphinx/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,7 @@ class Config(object):
linkcheck_ignore = ([], None),
linkcheck_timeout = (None, None),
linkcheck_workers = (5, None),
linkcheck_anchors = (True, None),

# gettext options
gettext_compact = (True, 'gettext'),
Expand Down

0 comments on commit e0e9d2a

Please sign in to comment.