Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use html5lib to parse HTML instead of using regexes #973

Merged
merged 4 commits into from
Jun 5, 2013
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 22 additions & 23 deletions pip/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
from pip.download import urlopen, path_to_url2, url_to_path, geturl, Urllib2HeadRequest
from pip.wheel import Wheel, wheel_ext, wheel_distribute_support, distribute_requirement
from pip.pep425tags import supported_tags
from pip.vendor import html5lib

__all__ = ['PackageFinder']

Expand Down Expand Up @@ -475,13 +476,11 @@ class HTMLPage(object):
## FIXME: these regexes are horrible hacks:
_homepage_re = re.compile(r'<th>\s*home\s*page', re.I)
_download_re = re.compile(r'<th>\s*download\s+url', re.I)
## These aren't so awful:
_rel_re = re.compile("""<[^>]*\srel\s*=\s*['"]?([^'">]+)[^>]*>""", re.I)
_href_re = re.compile('href=(?:"([^"]*)"|\'([^\']*)\'|([^>\\s\\n]*))', re.I|re.S)
_base_re = re.compile(r"""<base\s+href\s*=\s*['"]?([^'">]+)""", re.I)

def __init__(self, content, url, headers=None):
self.content = content
self.parsed = html5lib.parse(self.content, namespaceHTMLElements=False)
self.url = url
self.headers = headers

Expand Down Expand Up @@ -602,20 +601,21 @@ def _get_content_type(url):
@property
def base_url(self):
if not hasattr(self, "_base_url"):
match = self._base_re.search(self.content)
if match:
self._base_url = match.group(1)
base = self.parsed.find(".//base")
if base is not None and base.get("href"):
self._base_url = base.get("href")
else:
self._base_url = self.url
return self._base_url

@property
def links(self):
"""Yields all links in the page"""
for match in self._href_re.finditer(self.content):
url = match.group(1) or match.group(2) or match.group(3)
url = self.clean_link(urlparse.urljoin(self.base_url, url))
yield Link(url, self)
for anchor in self.parsed.findall(".//a"):
if anchor.get("href"):
href = anchor.get("href")
url = self.clean_link(urlparse.urljoin(self.base_url, href))
yield Link(url, self)

def rel_links(self):
for url in self.explicit_rel_links():
Expand All @@ -625,21 +625,20 @@ def rel_links(self):

def explicit_rel_links(self, rels=('homepage', 'download')):
"""Yields all links with the given relations"""
for match in self._rel_re.finditer(self.content):
found_rels = match.group(1).lower().split()
for rel in rels:
if rel in found_rels:
break
else:
continue
match = self._href_re.search(match.group(0))
if not match:
continue
url = match.group(1) or match.group(2) or match.group(3)
url = self.clean_link(urlparse.urljoin(self.base_url, url))
yield Link(url, self)
rels = set(rels)

for anchor in self.parsed.findall(".//a"):
if anchor.get("rel") and anchor.get("href"):
found_rels = set(anchor.get("rel").split())
# Determine the intersection between what rels were found and
# what rels were being looked for
if found_rels & rels:
href = anchor.get("href")
url = self.clean_link(urlparse.urljoin(self.base_url, href))
yield Link(url, self)

def scraped_rel_links(self):
# Can we get rid of this horrible horrible method?
for regex in (self._homepage_re, self._download_re):
match = regex.search(self.content)
if not match:
Expand Down
24 changes: 24 additions & 0 deletions pip/vendor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,27 @@
Files inside of pip.vendor should be considered immutable and should only be
updated to versions from upstream.
"""
from __future__ import absolute_import

# Monkeypatch pip.vendor.six into just six
# This is kind of terrible; however, it is the least bad of 3 bad options
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think (2), not (3), is the least bad of these options. Changing imports of six within html5lib is a simple, mechanical, easily-automated modification to upstream. It is not a significant barrier to upgrading the vendored html5lib in the future (just copy in the new html5lib and run a simple search-and-replace to change the imports), and certainly does not "pave the way for us to be in charge of maintaining it." Messing with sys.modules like this is much, much worse; people do import and use pip as an API, and so all the issues with (1) will still be a problem.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It might not be a bad idea to actually include a script that can do the search and replace correctly on the html5lib imports of six, for the benefit of future upgraders of html5lib.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've opted not to include a script simply because I plan to write a utility to handle vendoring things better and I'll just include this in that, rather than attempt to make a one-off script.

# #1 Ship pip with ``six`` such that it gets installed as a regular module
# #2 Modify pip.vendor.html5lib so that instead of ``import six`` it uses
# ``from pip.vendor import six``.
# #3 This monkeypatch which adds six to the top level modules only when
# pip.vendor.* is being used.
#
# #1 involves polluting the globally installed packages and possibly
# preventing people from using older or newer versions of the six library.
# #2 Means we've modified upstream, which makes it more difficult to upgrade
# in the future and paves the way for us to be in charge of maintaining it.
# #3 Allows us to not modify upstream while only polluting the global
# namespace when ``pip.vendor`` has been imported, which in typical usage
# is isolated to command line invocations.
# Prefer a system-wide ``six`` when one is installed; otherwise fall back
# to the copy vendored alongside this package.
#
# The ``sys.modules`` registration must live INSIDE the ``except`` branch:
# on the success path ``sys`` is never imported (so referencing it at the
# top level would raise NameError), and when a real ``six`` exists there is
# nothing to alias — re-registering it would only risk clobbering the
# user's installed copy.
try:
    import six
except ImportError:
    import sys

    from . import six

    # Make bare ``import six`` (as used by the vendored html5lib) resolve
    # to the vendored copy without modifying the upstream sources.
    sys.modules["six"] = six
20 changes: 20 additions & 0 deletions pip/vendor/html5lib/LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
Copyright (c) 2006-2013 James Graham and other contributors

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 changes: 23 additions & 0 deletions pip/vendor/html5lib/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
"""
HTML parsing library based on the WHATWG "HTML5"
specification. The parser is designed to be compatible with existing
HTML found in the wild and implements well-defined error recovery that
is largely compatible with modern desktop web browsers.

Example usage:

import html5lib
f = open("my_document.html")
tree = html5lib.parse(f)
"""

from __future__ import absolute_import, division, unicode_literals

# NOTE: this is a vendored copy of upstream html5lib (see pip.vendor
# policy) — change it only by syncing from upstream.
# Re-export the public entry points from the package's submodules.
from .html5parser import HTMLParser, parse, parseFragment
from .treebuilders import getTreeBuilder
from .treewalkers import getTreeWalker
from .serializer import serialize

# Explicitly declared public API of the vendored library.
__all__ = ["HTMLParser", "parse", "parseFragment", "getTreeBuilder",
"getTreeWalker", "serialize"]
__version__ = "1.0b1"
Loading