Skip to content

Commit

Permalink
feature: Removing reppy dependency
Browse files Browse the repository at this point in the history
Because reppy isn't supported anymore
(see github.com/seomoz/reppy#122),
its functionality is replaced by the standard Python module
urllib.robotparser.RobotFileParser with a small Google-oriented
extension.
  • Loading branch information
butuzov committed Apr 11, 2021
1 parent decb06f commit 162d02e
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 18 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ deps: venv ## install requiements


dev-env: deps clean ## Install Development Version
$(PYTHON) -m pip uninstall deadlinks -y
pip uninstall deadlinks -y
pip install -e .


Expand Down
65 changes: 57 additions & 8 deletions deadlinks/robots_txt.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,9 @@
"""

# -- Imports -------------------------------------------------------------------
from typing import (Any)
from typing import (Any, List, Tuple)

from reppy.robots import Robots
from reppy.exceptions import ReppyException
from urllib.robotparser import RobotFileParser

from .request import user_agent
from .url import URL
Expand All @@ -46,18 +45,68 @@ def allowed(self, url: URL) -> bool:

# We actually can't find out whether robots.txt exists or not,
# so we are going to allow everything in this case.
if self.state is False:
if self.state is False or self.state.allow_all:
return True

return bool(self.state.allowed(str(url), user_agent))
if not self.state.last_checked and self.state.disallow_all:
return False

# find entry
return allowed(matched_rules(self._entry(), url))

def request(self, url: str) -> None:
""" Perform robots.txt request """
if not (self.state is None):
if self.state is not None:
return

try:
self.state = Robots.fetch(url)
self.state = RobotFileParser()
self.state.set_url(url)
self.state.read()

except ReppyException:
except Exception:
self.state = False

# This is mostly logic transferred from robotparser.py,
# but we try to follow the 2019 extension of Google's robots.txt
# protocol, letting a more specific Allow rule override a Disallow path.
# https://www.contentkingapp.com/blog/implications-of-new-robots-txt-rfc/
# https://tools.ietf.org/html/draft-koster-rep-04

def _entry(self) -> Any:

for entry in self.state.entries:
if entry.applies_to(user_agent):
return entry

return self.state.default_entry


def matched_rules(entry: Any, url: URL) -> List[Tuple[bool, str]]:
result: List[Tuple[bool, str]] = []

path = url.path
if not path:
path = "/"

for line in entry.rulelines:
if not line.applies_to(path):
continue

if len(line.path) > len(path):
continue

result.append((
line.allowance,
line.path,
))

return sorted(result, key=lambda x: x[1])


def allowed(rules: List[Tuple[bool, str]]) -> bool:

if not rules:
return True

return rules[-1][0]
1 change: 0 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ idna>=2.8
requests>=2.22.0
click>=7.0
urllib3>=1.25.6
reppy==0.4.14
six==1.15.0
PyOpenSSL==19.1.0; python_full_version < '3.6.0'

Expand Down
14 changes: 6 additions & 8 deletions tests/features/tests_robots.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,20 +11,15 @@
# -- Imports -------------------------------------------------------------------

import pytest
from flaky import flaky

from copy import deepcopy as copy
from typing import (Optional, Dict)
from typing import Dict

from ..utils import Page

from deadlinks import (Settings, Crawler)
from deadlinks import user_agent

from deadlinks import (
DeadlinksIgnoredURL,
DeadlinksSettingsBase,
)
from deadlinks import DeadlinksIgnoredURL

server_pages = {
'^/$': Page("".join(["<a href='/link-%s'>%s</a>" % (x, x) for x in range(1, 101)])).exists(),
Expand Down Expand Up @@ -97,13 +92,16 @@ def test_failed_domain():
from random import choice
from string import ascii_lowercase

domain = "http://%s.com/" % ''.join([choice(ascii_lowercase) for x in range(42)])
domain = "http://%s.com/" % ''.join(choice(ascii_lowercase) for x in range(42))
c = Crawler(Settings(domain))
c.start()

assert len(c.failed) == 1


# Allow is deeper than Disallow.
# https://www.contentkingapp.com/blog/implications-of-new-robots-txt-rfc/
# https://tools.ietf.org/html/draft-koster-rep-04
def test_failed_google():

c = Crawler(
Expand Down

0 comments on commit 162d02e

Please sign in to comment.