Skip to content

Commit

Permalink
feature: Removing reppy dependency
Browse files Browse the repository at this point in the history
Because reppy isn't supported anymore
(see github.com/seomoz/reppy#122),
its functionality is replaced by the standard Python module
urllib.robotparser.RobotFileParser with a small Google-oriented
extension.
  • Loading branch information
butuzov committed Apr 11, 2021
1 parent decb06f commit 162d02e
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 18 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ deps: venv ## install requiements


dev-env: deps clean ## Install Development Version
$(PYTHON) -m pip uninstall deadlinks -y
pip uninstall deadlinks -y
pip install -e .


Expand Down
65 changes: 57 additions & 8 deletions deadlinks/robots_txt.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,9 @@
"""

# -- Imports -------------------------------------------------------------------
from typing import (Any)
from typing import (Any, List, Tuple)

from reppy.robots import Robots
from reppy.exceptions import ReppyException
from urllib.robotparser import RobotFileParser

from .request import user_agent
from .url import URL
Expand All @@ -46,18 +45,68 @@ def allowed(self, url: URL) -> bool:

# We actually can't find out whether robots.txt exists or not,
# so we are going to allow everything in this case.
if self.state is False:
if self.state is False or self.state.allow_all:
return True

return bool(self.state.allowed(str(url), user_agent))
if not self.state.last_checked and self.state.disallow_all:
return False

# find entry
return allowed(matched_rules(self._entry(), url))

def request(self, url: str) -> None:
""" Perform robots.txt request """
if not (self.state is None):
if self.state is not None:
return

try:
self.state = Robots.fetch(url)
self.state = RobotFileParser()
self.state.set_url(url)
self.state.read()

except ReppyException:
except Exception:
self.state = False

# This is mostly logic transferred from robotparser.py,
# but we try to follow the 2019 extension of Google's robots.txt
# protocol, letting a more specific Allow rule override a Disallow path.
# https://www.contentkingapp.com/blog/implications-of-new-robots-txt-rfc/
# https://tools.ietf.org/html/draft-koster-rep-04

def _entry(self) -> Any:

for entry in self.state.entries:
if entry.applies_to(user_agent):
return entry

return self.state.default_entry


def matched_rules(entry: Any, url: URL) -> List[Tuple[bool, str]]:
result: List[Tuple[bool, str]] = []

path = url.path
if not path:
path = "/"

for line in entry.rulelines:
if not line.applies_to(path):
continue

if len(line.path) > len(path):
continue

result.append((
line.allowance,
line.path,
))

return sorted(result, key=lambda x: x[1])


def allowed(rules: List[Tuple[bool, str]]) -> bool:

if not rules:
return True

return rules[-1][0]
1 change: 0 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ idna>=2.8
requests>=2.22.0
click>=7.0
urllib3>=1.25.6
reppy==0.4.14
six==1.15.0
PyOpenSSL==19.1.0; python_full_version < '3.6.0'

Expand Down
14 changes: 6 additions & 8 deletions tests/features/tests_robots.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,20 +11,15 @@
# -- Imports -------------------------------------------------------------------

import pytest
from flaky import flaky

from copy import deepcopy as copy
from typing import (Optional, Dict)
from typing import Dict

from ..utils import Page

from deadlinks import (Settings, Crawler)
from deadlinks import user_agent

from deadlinks import (
DeadlinksIgnoredURL,
DeadlinksSettingsBase,
)
from deadlinks import DeadlinksIgnoredURL

server_pages = {
'^/$': Page("".join(["<a href='/link-%s'>%s</a>" % (x, x) for x in range(1, 101)])).exists(),
Expand Down Expand Up @@ -97,13 +92,16 @@ def test_failed_domain():
from random import choice
from string import ascii_lowercase

domain = "http://%s.com/" % ''.join([choice(ascii_lowercase) for x in range(42)])
domain = "http://%s.com/" % ''.join(choice(ascii_lowercase) for x in range(42))
c = Crawler(Settings(domain))
c.start()

assert len(c.failed) == 1


# Allow is deeper than Disallow.
# https://www.contentkingapp.com/blog/implications-of-new-robots-txt-rfc/
# https://tools.ietf.org/html/draft-koster-rep-04
def test_failed_google():

c = Crawler(
Expand Down

0 comments on commit 162d02e

Please sign in to comment.