From fe8a448ce467dbfbc1e8a503e0ad5972e562c14e Mon Sep 17 00:00:00 2001
From: Andre Burgaud
Date: Wed, 1 Jan 2020 23:11:03 -0600
Subject: [PATCH 1/2] issue39817: robotparser does not respect longest match

---
 Lib/test/test_robotparser.py | 38 +++++++++++++++++++++++++++++++++++-
 Lib/urllib/robotparser.py    | 17 ++++++++++++++++
 2 files changed, 54 insertions(+), 1 deletion(-)

diff --git a/Lib/test/test_robotparser.py b/Lib/test/test_robotparser.py
index f28d8be5f3c4ec..26693af72322be 100644
--- a/Lib/test/test_robotparser.py
+++ b/Lib/test/test_robotparser.py
@@ -224,6 +224,42 @@ class GoogleURLOrderingTest(BaseRobotTest, unittest.TestCase):
     bad = ['/folder1/anotherfile.html']
 
 
+class LongestMatchUserAgentTest(BaseRobotTest, unittest.TestCase):
+    # https://tools.ietf.org/html/draft-koster-rep-00#section-3.2
+    # The most specific rule should be used
+    robots_txt = """\
+User-agent: FooBot
+Disallow: /folder1/
+Allow: /folder1/myfile.html
+    """
+    agent = 'foobot'
+    good = ['/folder1/myfile.html']
+    bad = ['/folder1/anotherfile.html']
+
+
+class LongestMatchDefaultUserAgentTest(BaseRobotTest, unittest.TestCase):
+    # https://tools.ietf.org/html/draft-koster-rep-00#section-3.2
+    # The most specific rule should be used
+    robots_txt = """\
+User-agent: *
+Disallow: /folder1/
+Allow: /folder1/myfile.html
+    """
+    good = ['/folder1/myfile.html']
+    bad = ['/folder1/anotherfile.html']
+
+
+class EquivalentRulesTest(BaseRobotTest, unittest.TestCase):
+    # https://tools.ietf.org/html/draft-koster-rep-00#section-2.2.2
+    # When allow and disallow rules are equivalent, the allow rule should be used
+    robots_txt = """\
+User-agent: *
+Disallow: /folder1/
+Allow: /folder1/
+    """
+    good = ['/folder1/myfile.html', '/folder1', '/folder1']
+
+
 class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):
     # see issue #6325 for details
     robots_txt = """\
@@ -367,7 +403,7 @@ def test_basic(self):
     def test_can_fetch(self):
         self.assertTrue(self.parser.can_fetch('*', self.url('elsewhere')))
         self.assertFalse(self.parser.can_fetch('Nutch', self.base_url))
-        self.assertFalse(self.parser.can_fetch('Nutch', self.url('brian')))
+        self.assertTrue(self.parser.can_fetch('Nutch', self.url('brian')))
         self.assertFalse(self.parser.can_fetch('Nutch', self.url('webstats')))
         self.assertFalse(self.parser.can_fetch('*', self.url('webstats')))
         self.assertTrue(self.parser.can_fetch('*', self.base_url))
diff --git a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py
index c58565e3945146..c6bd4ca7925673 100644
--- a/Lib/urllib/robotparser.py
+++ b/Lib/urllib/robotparser.py
@@ -56,6 +56,12 @@ def set_url(self, url):
         self.url = url
         self.host, self.path = urllib.parse.urlparse(url)[1:3]
 
+    def _sort_rulelines(self):
+        for entry in self.entries:
+            entry.sort_rulelines()
+        if self.default_entry:
+            self.default_entry.sort_rulelines()
+
     def read(self):
         """Reads the robots.txt URL and feeds it to the parser."""
        try:
@@ -150,6 +156,7 @@ def parse(self, lines):
                     self.sitemaps.append(line[1])
         if state == 2:
             self._add_entry(entry)
+        self._sort_rulelines()
 
     def can_fetch(self, useragent, url):
         """using the parsed robots.txt decide if useragent can fetch url"""
@@ -250,6 +257,16 @@ def __str__(self):
         ret.extend(map(str, self.rulelines))
         return '\n'.join(ret)
 
+    def sort_rulelines(self):
+        """Sort the rules with the longest path first, so that the longest
+        matching rule is used:
+        https://tools.ietf.org/html/draft-koster-rep-00#section-3.2
+
+        If an allow and a disallow rule are equivalent (same path), the allow
+        rule SHOULD be used: https://tools.ietf.org/html/draft-koster-rep-00#section-2.2.2
+        """
+        self.rulelines.sort(key=lambda x: (len(x.path), x.allowance), reverse=True)
+
     def applies_to(self, useragent):
         """check if this entry applies to the specified agent"""
         # split the name token and make it lower case

From 18a3cb161c91f88c043baf1115aaf81e60636492 Mon Sep 17 00:00:00 2001
From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com>
Date: Thu, 2 Jan 2020 05:38:41 +0000
Subject: [PATCH 2/2] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20blu?=
 =?UTF-8?q?rb=5Fit.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../next/Library/2020-01-02-05-38-40.bpo-39187.0vXJVT.rst | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100644 Misc/NEWS.d/next/Library/2020-01-02-05-38-40.bpo-39187.0vXJVT.rst

diff --git a/Misc/NEWS.d/next/Library/2020-01-02-05-38-40.bpo-39187.0vXJVT.rst b/Misc/NEWS.d/next/Library/2020-01-02-05-38-40.bpo-39187.0vXJVT.rst
new file mode 100644
index 00000000000000..70499b4dbdc5c1
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2020-01-02-05-38-40.bpo-39187.0vXJVT.rst
@@ -0,0 +1,2 @@
+Sort the rules in urllib.robotparser so that the longest matching path takes precedence, as per the current internet draft: https://tools.ietf.org/html/draft-koster-rep-00#section-3.2
+When an allow and a disallow rule are equivalent (same path), the allow rule is used: https://tools.ietf.org/html/draft-koster-rep-00#section-2.2.2
\ No newline at end of file
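
A minimal sketch, not part of either patch, of the behaviour the change targets. It uses only the public urllib.robotparser API (RobotFileParser.parse() and can_fetch()); the example.com URLs are placeholders, and the results noted in the comments assume the longest-match sorting introduced above is in effect.

import urllib.robotparser

# Same rules as the LongestMatchDefaultUserAgentTest case added by the patch.
lines = [
    "User-agent: *",
    "Disallow: /folder1/",
    "Allow: /folder1/myfile.html",
]

parser = urllib.robotparser.RobotFileParser()
parser.parse(lines)

# The Allow rule has the longer path, so with the patch it takes precedence
# over the shorter Disallow rule (unpatched, the Disallow rule wins because
# rules are checked in file order).
print(parser.can_fetch("FooBot", "http://example.com/folder1/myfile.html"))       # True with this patch
# No more specific rule matches here, so Disallow: /folder1/ still applies.
print(parser.can_fetch("FooBot", "http://example.com/folder1/anotherfile.html"))  # False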