From fe8a448ce467dbfbc1e8a503e0ad5972e562c14e Mon Sep 17 00:00:00 2001
From: Andre Burgaud
Date: Wed, 1 Jan 2020 23:11:03 -0600
Subject: [PATCH 1/2] issue39817: robotparser does not respect longest match

---
 Lib/test/test_robotparser.py | 38 +++++++++++++++++++++++++++++++++++-
 Lib/urllib/robotparser.py    | 17 ++++++++++++++++
 2 files changed, 54 insertions(+), 1 deletion(-)

diff --git a/Lib/test/test_robotparser.py b/Lib/test/test_robotparser.py
index f28d8be5f3c4ec..26693af72322be 100644
--- a/Lib/test/test_robotparser.py
+++ b/Lib/test/test_robotparser.py
@@ -224,6 +224,42 @@ class GoogleURLOrderingTest(BaseRobotTest, unittest.TestCase):
     bad = ['/folder1/anotherfile.html']
 
 
+class LongestMatchUserAgentTest(BaseRobotTest, unittest.TestCase):
+    # https://tools.ietf.org/html/draft-koster-rep-00#section-3.2
+    # The most specific rule should be used
+    robots_txt = """\
+User-agent: FooBot
+Disallow: /folder1/
+Allow: /folder1/myfile.html
+    """
+    agent = 'foobot'
+    good = ['/folder1/myfile.html']
+    bad = ['/folder1/anotherfile.html']
+
+
+class LongestMatchDefaultUserAgentTest(BaseRobotTest, unittest.TestCase):
+    # https://tools.ietf.org/html/draft-koster-rep-00#section-3.2
+    # The most specific rule should be used
+    robots_txt = """\
+User-agent: *
+Disallow: /folder1/
+Allow: /folder1/myfile.html
+    """
+    good = ['/folder1/myfile.html']
+    bad = ['/folder1/anotherfile.html']
+
+
+class EquivalentRulesTest(BaseRobotTest, unittest.TestCase):
+    # https://tools.ietf.org/html/draft-koster-rep-00#section-2.2.2
+    # When allow and disallow rules are equivalent, the allow rule should be used
+    robots_txt = """\
+User-agent: *
+Disallow: /folder1/
+Allow: /folder1/
+    """
+    good = ['/folder1/myfile.html', '/folder1', '/folder1']
+
+
 class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):
     # see issue #6325 for details
     robots_txt = """\
@@ -367,7 +403,7 @@ def test_basic(self):
     def test_can_fetch(self):
         self.assertTrue(self.parser.can_fetch('*', self.url('elsewhere')))
         self.assertFalse(self.parser.can_fetch('Nutch', self.base_url))
-        self.assertFalse(self.parser.can_fetch('Nutch', self.url('brian')))
+        self.assertTrue(self.parser.can_fetch('Nutch', self.url('brian')))
         self.assertFalse(self.parser.can_fetch('Nutch', self.url('webstats')))
         self.assertFalse(self.parser.can_fetch('*', self.url('webstats')))
         self.assertTrue(self.parser.can_fetch('*', self.base_url))
diff --git a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py
index c58565e3945146..c6bd4ca7925673 100644
--- a/Lib/urllib/robotparser.py
+++ b/Lib/urllib/robotparser.py
@@ -56,6 +56,12 @@ def set_url(self, url):
         self.url = url
         self.host, self.path = urllib.parse.urlparse(url)[1:3]
 
+    def _sort_rulelines(self):
+        for entry in self.entries:
+            entry.sort_rulelines()
+        if self.default_entry:
+            self.default_entry.sort_rulelines()
+
     def read(self):
         """Reads the robots.txt URL and feeds it to the parser."""
        try:
@@ -150,6 +156,7 @@ def parse(self, lines):
                     self.sitemaps.append(line[1])
         if state == 2:
             self._add_entry(entry)
+        self._sort_rulelines()
 
     def can_fetch(self, useragent, url):
         """using the parsed robots.txt decide if useragent can fetch url"""
@@ -250,6 +257,16 @@ def __str__(self):
         ret.extend(map(str, self.rulelines))
         return '\n'.join(ret)
 
+    def sort_rulelines(self):
+        """Sort the rules with the longest path first, so that the longest
+        matching rule is used:
+        https://tools.ietf.org/html/draft-koster-rep-00#section-3.2
+
+        If an allow and a disallow rule are equivalent (same path), the allow
+        rule SHOULD be used: https://tools.ietf.org/html/draft-koster-rep-00#section-2.2.2
+        """
+        self.rulelines.sort(key=lambda x: (len(x.path), x.allowance), reverse=True)
+
     def applies_to(self, useragent):
         """check if this entry applies to the specified agent"""
         # split the name token and make it lower case

From 18a3cb161c91f88c043baf1115aaf81e60636492 Mon Sep 17 00:00:00 2001
From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com>
Date: Thu, 2 Jan 2020 05:38:41 +0000
Subject: [PATCH 2/2] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20blu?=
 =?UTF-8?q?rb=5Fit.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../next/Library/2020-01-02-05-38-40.bpo-39187.0vXJVT.rst | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100644 Misc/NEWS.d/next/Library/2020-01-02-05-38-40.bpo-39187.0vXJVT.rst

diff --git a/Misc/NEWS.d/next/Library/2020-01-02-05-38-40.bpo-39187.0vXJVT.rst b/Misc/NEWS.d/next/Library/2020-01-02-05-38-40.bpo-39187.0vXJVT.rst
new file mode 100644
index 00000000000000..70499b4dbdc5c1
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2020-01-02-05-38-40.bpo-39187.0vXJVT.rst
@@ -0,0 +1,2 @@
+Sort the rules in urllib.robotparser so that the longest matching path takes precedence, as per the current internet draft: https://tools.ietf.org/html/draft-koster-rep-00#section-3.2
+When an allow and a disallow rule are equivalent (same path), the allow rule is used: https://tools.ietf.org/html/draft-koster-rep-00#section-2.2.2
\ No newline at end of file
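
A minimal sketch, not part of either patch, of the behaviour the change targets. It uses only the public urllib.robotparser API (RobotFileParser.parse() and can_fetch()); the example.com URLs are placeholders, and the results noted in the comments assume the longest-match sorting introduced above is in effect.

import urllib.robotparser

# Same rules as the LongestMatchDefaultUserAgentTest case added by the patch.
lines = [
    "User-agent: *",
    "Disallow: /folder1/",
    "Allow: /folder1/myfile.html",
]

parser = urllib.robotparser.RobotFileParser()
parser.parse(lines)

# The Allow rule has the longer path, so with the patch it takes precedence
# over the shorter Disallow rule (unpatched, the Disallow rule wins because
# rules are checked in file order).
print(parser.can_fetch("FooBot", "http://example.com/folder1/myfile.html"))       # True with this patch
# No more specific rule matches here, so Disallow: /folder1/ still applies.
print(parser.can_fetch("FooBot", "http://example.com/folder1/anotherfile.html"))  # False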