Skip to content

Commit

Permalink
Merge pull request #12 from reese-allison/close-issues
Browse files Browse the repository at this point in the history
Flake8 and resolve issue #7 and #9
  • Loading branch information
moskrc authored Nov 14, 2024
2 parents ed10700 + e21ee59 commit 9b1267e
Show file tree
Hide file tree
Showing 7 changed files with 113 additions and 34 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
## About CrawlerDetect

This is a Python wrapper for [CrawlerDetect](https://github.com/JayBizzle/Crawler-Detect) - the web crawler detection library
It helps to detect bots/crawlers/spiders via the user agent and other HTTP-headers. Currently able to detect > 1,000's of bots/spiders/crawlers.
It helps detect bots/crawlers/spiders via the user agent and other HTTP headers. It can currently detect thousands of bots/spiders/crawlers.

### Installation
Run `pip install crawlerdetect`
Expand Down
4 changes: 2 additions & 2 deletions crawlerdetect/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,6 @@
"""

from .src import providers
from .src.crawlerdetect import CrawlerDetect
from .src.crawlerdetect import CrawlerDetect, get_crawlerdetect_version

__all__ = ("CrawlerDetect", "providers")
__all__ = ("CrawlerDetect", "providers", "get_crawlerdetect_version")
14 changes: 1 addition & 13 deletions crawlerdetect/__main__.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,6 @@
import configparser
import os
import sys


def get_crawlerdetect_version(config_file_path=None):
    """Return the crawlerdetect version string.

    The version is read from the ``version`` key of the ``[crawlerdetect]``
    section of ``setup.cfg``. If that lookup fails (for example because the
    file does not exist next to the running code), the version recorded in
    the installed ``crawlerdetect`` distribution metadata is used instead.

    Args:
        config_file_path: Optional path of the config file to read. Defaults
            to ``setup.cfg`` in the parent directory of this module's
            directory.

    Returns:
        The version string.

    Raises:
        KeyError: If the version can be found neither in the config file nor
            in the installed package metadata.
    """
    if config_file_path is None:
        current_directory = os.path.dirname(os.path.abspath(__file__))
        parent_directory = os.path.abspath(os.path.join(current_directory, os.pardir))
        config_file_path = os.path.join(parent_directory, "setup.cfg")

    config = configparser.ConfigParser()
    # ConfigParser.read() silently skips files it cannot open, so a missing
    # setup.cfg only surfaces as a missing section in the lookup below.
    config.read(config_file_path)
    try:
        return config["crawlerdetect"]["version"]
    except KeyError:
        pass

    # Fall back to the installed distribution's metadata. The import is local
    # and guarded because importlib.metadata only exists on Python 3.8+.
    try:
        from importlib import metadata
    except ImportError:
        metadata = None
    if metadata is not None:
        try:
            return metadata.version("crawlerdetect")
        except metadata.PackageNotFoundError:
            pass

    raise KeyError(
        "crawlerdetect version not found in {!r} or in installed package metadata".format(config_file_path)
    )
from crawlerdetect import get_crawlerdetect_version


if __name__ == "__main__":
Expand Down
21 changes: 21 additions & 0 deletions crawlerdetect/src/crawlerdetect.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,25 @@
import configparser
import os
import re

from .providers import Crawlers, Exclusions, Headers


def get_crawlerdetect_version(config_file_path=None):
    """Return the crawlerdetect version string.

    The version is read from the ``version`` key of the ``[crawlerdetect]``
    section of ``setup.cfg``. If that lookup fails (for example because the
    file does not exist next to the running code), the version recorded in
    the installed ``crawlerdetect`` distribution metadata is used instead.

    Args:
        config_file_path: Optional path of the config file to read. Defaults
            to ``setup.cfg`` two directory levels above this module's
            directory.

    Returns:
        The version string.

    Raises:
        KeyError: If the version can be found neither in the config file nor
            in the installed package metadata.
    """
    if config_file_path is None:
        current_directory = os.path.dirname(os.path.abspath(__file__))
        grandparent_directory = os.path.abspath(os.path.join(current_directory, os.pardir, os.pardir))
        config_file_path = os.path.join(grandparent_directory, "setup.cfg")

    config = configparser.ConfigParser()
    # ConfigParser.read() silently skips files it cannot open, so a missing
    # setup.cfg only surfaces as a missing section in the lookup below.
    config.read(config_file_path)
    try:
        return config["crawlerdetect"]["version"]
    except KeyError:
        pass

    # Fall back to the installed distribution's metadata. The import is local
    # and guarded because importlib.metadata only exists on Python 3.8+.
    try:
        from importlib import metadata
    except ImportError:
        metadata = None
    if metadata is not None:
        try:
            return metadata.version("crawlerdetect")
        except metadata.PackageNotFoundError:
            pass

    raise KeyError(
        "crawlerdetect version not found in {!r} or in installed package metadata".format(config_file_path)
    )


version = get_crawlerdetect_version()


class CrawlerDetect(object):
def __init__(self, headers=None, user_agent=""):
self.crawlers = Crawlers()
Expand All @@ -16,6 +33,10 @@ def __init__(self, headers=None, user_agent=""):
self.setHttpHeaders(headers)
self.setUserAgent(user_agent)

@property
def version(self):
return version

def setHttpHeaders(self, http_headers):
self.httpHeaders = {}

Expand Down
66 changes: 66 additions & 0 deletions tests/fixtures/headers.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
{
"test_current_visitor": {
"DOCUMENT_ROOT": "\/home\/test\/public_html",
"GATEWAY_INTERFACE": "CGI\/1.1",
"HTTP_ACCEPT": "*\/*",
"HTTP_ACCEPT_ENCODING": "gzip, deflate",
"HTTP_CACHE_CONTROL": "no-cache",
"HTTP_CONNECTION": "Keep-Alive",
"HTTP_FROM": "bingbot(at)microsoft.com",
"HTTP_HOST": "www.test.com",
"HTTP_PRAGMA": "no-cache",
"HTTP_USER_AGENT": "Mozilla\/5.0 (compatible; bingbot\/2.0; +http:\/\/www.bing.com\/bingbot.htm)",
"PATH": "\/bin:\/usr\/bin",
"QUERY_STRING": "order=closingDate",
"REDIRECT_STATUS": "200",
"REMOTE_ADDR": "127.0.0.1",
"REMOTE_PORT": "3360",
"REQUEST_METHOD": "GET",
"REQUEST_URI": "\/?test=testing",
"SCRIPT_FILENAME": "\/home\/test\/public_html\/index.php",
"SCRIPT_NAME": "\/index.php",
"SERVER_ADDR": "127.0.0.1",
"SERVER_ADMIN": "webmaster@test.com",
"SERVER_NAME": "www.test.com",
"SERVER_PORT": "80",
"SERVER_PROTOCOL": "HTTP\/1.1",
"SERVER_SIGNATURE": "",
"SERVER_SOFTWARE": "Apache",
"UNIQUE_ID": "Vx6MENRxerBUSDEQgFLAAAAAS",
"PHP_SELF": "\/index.php",
"REQUEST_TIME_FLOAT": 1461619728.0705,
"REQUEST_TIME": 1461619728
},
"test_http_from_header": {
"DOCUMENT_ROOT": "\/home\/test\/public_html",
"GATEWAY_INTERFACE": "CGI\/1.1",
"HTTP_ACCEPT": "*\/*",
"HTTP_ACCEPT_ENCODING": "gzip, deflate",
"HTTP_CACHE_CONTROL": "no-cache",
"HTTP_CONNECTION": "Keep-Alive",
"HTTP_FROM": "googlebot(at)googlebot.com",
"HTTP_HOST": "www.test.com",
"HTTP_PRAGMA": "no-cache",
"HTTP_USER_AGENT": "Mozilla\/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/28.0.1500.71 Safari\/537.36",
"PATH": "\/bin:\/usr\/bin",
"QUERY_STRING": "order=closingDate",
"REDIRECT_STATUS": "200",
"REMOTE_ADDR": "127.0.0.1",
"REMOTE_PORT": "3360",
"REQUEST_METHOD": "GET",
"REQUEST_URI": "\/?test=testing",
"SCRIPT_FILENAME": "\/home\/test\/public_html\/index.php",
"SCRIPT_NAME": "\/index.php",
"SERVER_ADDR": "127.0.0.1",
"SERVER_ADMIN": "webmaster@test.com",
"SERVER_NAME": "www.test.com",
"SERVER_PORT": "80",
"SERVER_PROTOCOL": "HTTP\/1.1",
"SERVER_SIGNATURE": "",
"SERVER_SOFTWARE": "Apache",
"UNIQUE_ID": "Vx6MENRxerBUSDEQgFLAAAAAS",
"PHP_SELF": "\/index.php",
"REQUEST_TIME_FLOAT": 1461619728.0705,
"REQUEST_TIME": 1461619728
}
}
3 changes: 2 additions & 1 deletion tests/fixtures/user_agent/crawlers.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3675,4 +3675,5 @@ Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GoogleOther) Chro
Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.5938.132 Mobile Safari/537.36 (compatible; GoogleOther)
Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.6261.94 Mobile Safari/537.36 (compatible; GoogleOther)
Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)
Mozilla/5.0 (compatible; YandexImages/3.0; +http://yandex.com/bots)
Mozilla/5.0 (compatible; YandexImages/3.0; +http://yandex.com/bots)
Mozilla/5.0+(compatible; MonSpark/1.0; http://www.monspark.com/)
37 changes: 20 additions & 17 deletions tests/test_crawlerdetect.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,25 +2,29 @@
import os
import re

from crawlerdetect import CrawlerDetect
from crawlerdetect import __main__ as main
from crawlerdetect import providers
from crawlerdetect import CrawlerDetect, get_crawlerdetect_version, providers

from .base_case import CrawlerDetectTestCase


with open(os.path.join(os.path.dirname(__file__), "fixtures/headers.json")) as f:
test_headers = json.load(f)


class CrawlerDetectTests(CrawlerDetectTestCase):
def test_get_crawlerdetect_version(self):
version = main.get_crawlerdetect_version()
version = get_crawlerdetect_version()
version_parts = version.split(".")
self.assertEqual(len(version_parts), 3)
self.assertTrue(version_parts[0].isdigit())
self.assertTrue(version_parts[1].isdigit())

def test_is_crawler(self):
res = self.cd.isCrawler(
"Mozilla/5.0 (iPhone; CPU iPhone OS 7_1 like Mac OS X) AppleWebKit (KHTML, like Gecko) Mobile (compatible; Yahoo Ad monitoring; https://help.yahoo.com/kb/yahoo-ad-monitoring-SLN24857.html)"
ua = (
"Mozilla/5.0 (iPhone; CPU iPhone OS 7_1 like Mac OS X) AppleWebKit (KHTML, like Gecko) Mobile "
"(compatible; Yahoo Ad monitoring; https://help.yahoo.com/kb/yahoo-ad-monitoring-SLN24857.html)"
)
res = self.cd.isCrawler(ua)
self.assertTrue(res)

def test_user_agents_are_bots(self):
Expand Down Expand Up @@ -56,9 +60,11 @@ def test_sec_ch_ua_are_devices(self):
self.assertFalse(test, line)

def test_it_returns_correct_matched_bot_name(self):
self.cd.isCrawler(
"Mozilla/5.0 (iPhone; CPU iPhone OS 7_1 like Mac OS X) AppleWebKit (KHTML, like Gecko) Mobile (compatible; Yahoo Ad monitoring; https://help.yahoo.com/kb/yahoo-ad-monitoring-SLN24857.html)"
ua = (
"Mozilla/5.0 (iPhone; CPU iPhone OS 7_1 like Mac OS X) AppleWebKit (KHTML, like Gecko) "
"Mobile (compatible; Yahoo Ad monitoring; https://help.yahoo.com/kb/yahoo-ad-monitoring-SLN24857.html)"
)
self.cd.isCrawler(ua)
matches = self.cd.getMatches()
self.assertEqual(self.cd.getMatches(), "monitoring", matches)

Expand All @@ -72,23 +78,20 @@ def test_empty_user_agent(self):
self.assertFalse(test)

def test_current_visitor(self):
headers = json.loads(
'{"DOCUMENT_ROOT":"\/home\/test\/public_html","GATEWAY_INTERFACE":"CGI\/1.1","HTTP_ACCEPT":"*\/*","HTTP_ACCEPT_ENCODING":"gzip, deflate","HTTP_CACHE_CONTROL":"no-cache","HTTP_CONNECTION":"Keep-Alive","HTTP_FROM":"bingbot(at)microsoft.com","HTTP_HOST":"www.test.com","HTTP_PRAGMA":"no-cache","HTTP_USER_AGENT":"Mozilla\/5.0 (compatible; bingbot\/2.0; +http:\/\/www.bing.com\/bingbot.htm)","PATH":"\/bin:\/usr\/bin","QUERY_STRING":"order=closingDate","REDIRECT_STATUS":"200","REMOTE_ADDR":"127.0.0.1","REMOTE_PORT":"3360","REQUEST_METHOD":"GET","REQUEST_URI":"\/?test=testing","SCRIPT_FILENAME":"\/home\/test\/public_html\/index.php","SCRIPT_NAME":"\/index.php","SERVER_ADDR":"127.0.0.1","SERVER_ADMIN":"webmaster@test.com","SERVER_NAME":"www.test.com","SERVER_PORT":"80","SERVER_PROTOCOL":"HTTP\/1.1","SERVER_SIGNATURE":"","SERVER_SOFTWARE":"Apache","UNIQUE_ID":"Vx6MENRxerBUSDEQgFLAAAAAS","PHP_SELF":"\/index.php","REQUEST_TIME_FLOAT":1461619728.0705,"REQUEST_TIME":1461619728}'
)
headers = test_headers["test_current_visitor"]
cd = CrawlerDetect(headers=headers)
self.assertTrue(cd.isCrawler())

def test_user_agent_passed_via_contructor(self):
cd = CrawlerDetect(
user_agent="Mozilla/5.0 (iPhone; CPU iPhone OS 7_1 like Mac OS X) AppleWebKit (KHTML, like Gecko) Mobile (compatible; Yahoo Ad monitoring; https://help.yahoo.com/kb/yahoo-ad-monitoring-SLN24857.html)"
ua = (
"Mozilla/5.0 (iPhone; CPU iPhone OS 7_1 like Mac OS X) AppleWebKit (KHTML, like Gecko) Mobile (compatible; "
"Yahoo Ad monitoring; https://help.yahoo.com/kb/yahoo-ad-monitoring-SLN24857.html)"
)
cd = CrawlerDetect(user_agent=ua)
self.assertTrue(cd.isCrawler())

def test_http_from_header(self):
headers = json.loads(
'{"DOCUMENT_ROOT":"\/home\/test\/public_html","GATEWAY_INTERFACE":"CGI\/1.1","HTTP_ACCEPT":"*\/*","HTTP_ACCEPT_ENCODING":"gzip, deflate","HTTP_CACHE_CONTROL":"no-cache","HTTP_CONNECTION":"Keep-Alive","HTTP_FROM":"googlebot(at)googlebot.com","HTTP_HOST":"www.test.com","HTTP_PRAGMA":"no-cache","HTTP_USER_AGENT":"Mozilla\/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/28.0.1500.71 Safari\/537.36","PATH":"\/bin:\/usr\/bin","QUERY_STRING":"order=closingDate","REDIRECT_STATUS":"200","REMOTE_ADDR":"127.0.0.1","REMOTE_PORT":"3360","REQUEST_METHOD":"GET","REQUEST_URI":"\/?test=testing","SCRIPT_FILENAME":"\/home\/test\/public_html\/index.php","SCRIPT_NAME":"\/index.php","SERVER_ADDR":"127.0.0.1","SERVER_ADMIN":"webmaster@test.com","SERVER_NAME":"www.test.com","SERVER_PORT":"80","SERVER_PROTOCOL":"HTTP\/1.1","SERVER_SIGNATURE":"","SERVER_SOFTWARE":"Apache","UNIQUE_ID":"Vx6MENRxerBUSDEQgFLAAAAAS","PHP_SELF":"\/index.php","REQUEST_TIME_FLOAT":1461619728.0705,"REQUEST_TIME":1461619728}'
)
print(headers)
headers = test_headers["test_http_from_header"]
cd = CrawlerDetect(headers=headers)
self.assertTrue(cd.isCrawler())

Expand Down

0 comments on commit 9b1267e

Please sign in to comment.