From f751b04e6bfb80b0034b99ad65d7246457a0cdcf Mon Sep 17 00:00:00 2001 From: Saksham Date: Wed, 18 Dec 2024 16:01:44 +0100 Subject: [PATCH] Update robots list file and script --- counter_robots/data/robot.txt | 152 +++++++++++++++++++++++++++++++--- scripts/update-lists.py | 4 +- tests/test_robots.py | 2 +- 3 files changed, 142 insertions(+), 16 deletions(-) diff --git a/counter_robots/data/robot.txt b/counter_robots/data/robot.txt index 362716a..da3f8bb 100644 --- a/counter_robots/data/robot.txt +++ b/counter_robots/data/robot.txt @@ -1,42 +1,63 @@ bot +^Buck\/[0-9] spider crawl +^.?$ [^a]fish -^voyager\/ +^IDA$ +^ruby$ +^@ozilla\/\d +^脝脝陆芒潞贸碌脛$ +^破解后的$ +AddThis +A6-Indexer ADmantX alexa Alexandria(\s|\+)prototype(\s|\+)project AllenTrack almaden +AntBot appie API[\+\s]scraper +Arachni Arachmo architext ArchiveTeam +aria2\/\d arks +^Array$ asterias atomz +axios\/\d BDFetch +Betsie baidu biglotron BingPreview binlar +bjaaland Blackboard[\+\s]Safeassign -blaiz\-bee +blaiz-bee bloglines blogpulse -boitho\.com\-dc -bookmark\-manager +boitho\.com-dc +bookmark-manager Brutus\/AET BUbiNG bwh3_user_agent +CakePHP celestial +centuryb cfnetwork checklink checkprivacy China\sLocal\sBrowse\s2\.6 +Citoid +ClaudeBot cloakDetect coccoc\/1\.0 +Code\sSample\sWeb\sClient +ColdFusion collection@infegy.com com\.plumanalytics combine @@ -44,56 +65,99 @@ contentmatch ContentSmartz convera core +Cortana CoverScout +crusty\/\d +curl\/ cursor custo DataCha0s\/2\.0 -daumoa +daum(oa)? +^\%?default\%?$ DeuSu\/ +Dispatch\/\d Docoloc docomo +Download\+Master +Drupal DSurf DTS Agent +EasyBib[\+\s]AutoCite[\+\s] easydl +EBSCO\sEJS\sContent\sServer +EcoSearch +ELinks\/ EmailSiphon EmailWolf Embedly EThOS\+\(British\+Library\) facebookexternalhit\/ +favorg +Faveeo\/\d +FDM(\s|\+)\d +Feedbin feedburner FeedFetcher feedreader ferret +Fetch(\s|\+)API(\s|\+)Request findlinks +findthatfile +^FileDown$ +^Filter$ +^firefox$ +^FOCA +^FreshpingBot\/1.0 \(\+https:\/\/freshping\.io\/\)$ Fulltext Funnelback +Genieo +GetRight +geturl +GigablastOpenSource G-i-g-a-b-o-t +GLMSLinkAnalysis Goldfire(\s|\+)Server google Grammarly +GroupHigh\/\d grub gulliver +gvfs\/ harvest heritrix holmes htdig htmlparser +HeadlessChrome +HttpComponents\/1.1 HTTPFetcher +http.?client +httpget +httpx httrack ia_archiver ichiro iktomi ilse +Indy Library +insomnia ^integrity\/\d internetseer intute iSiloX iskanie +^java\/\d{1,2}.\d jeeves +Jersey\/\d jobo +Koha kyluka larbin +libcurl +libhttp +libwww lilina +^LinkAnalyser link.?check LinkLint-checkonly ^LinkParser\/ @@ -104,57 +168,100 @@ linkwalker lipperhey livejournal\.com LOCKSS +LongURL.API ltx71 -lycos[\_\+] -mail.ru -mediapartners\-google +lwp +lycos[_+] +MaCoCu +mail\.ru +MarcEdit +mediapartners-google megite +MetaInspector MetaURI[\+\s]API\/\d\.\d +Microsoft(\s|\+)URL(\s|\+)Control +Microsoft Office Existence Discovery +Microsoft Office Protocol Discovery +Microsoft-WebDAV-MiniRedir mimas mnogosearch moget motor +^Mozilla$ +^Mozilla.4\.0$ +^Mozilla\/4\.0\+\(compatible;\)$ +^Mozilla\/4\.0\+\(compatible;\+ICS\)$ +^Mozilla\/4\.5\+\[en]\+\(Win98;\+I\)$ +^Mozilla.5\.0$ +^Mozilla\/5.0\+\(compatible;\+MSIE\+6\.0;\+Windows\+NT\+5\.0\)$ +^Mozilla\/5\.0(\s|\+)\(compatible;\s\+centuryb\.o\.t9\[at\]gmail\.com\)$ +^Mozilla\/5\.0(\s|\+)\(compatible;\sVelenPublicWebCrawler\/1\.0;\s\+https:\/\/velen\.io\)$ +^Mozilla\/5\.0\+like\+Gecko$ +^Mozilla\/5.0(\s|\+)Gecko\/20100115(\s|\+)Firefox\/3.6$ +^MSIE MuscatFerre myweb nagios ^NetAnts\/\d netcraft netluchs -Ning +nettle +newspaper\/\d +ng\/2\. +^Ning\/\d +no_user_agent nomad nutch ^oaDOI$ ocelli Offline(\s|\+)Navigator +OgScrper +okhttp onetszukaj +^Opera\/4$ OurBrowser panscient parsijoo -EasyBib[\+\s]AutoCite[\+\s] +^Pattern\/\d +Pcore-HTTP +pear\.php\.net perman +PHP\/ +pidcheck pioneer playmusic\.com playstarmusic\.com ^Postgenomic(\s|\+)v2 powermarks proximic +PycURL +python Qwantify +rambler +ReactorNetty\/\d Readpaper redalert +RestSharp Riddler robozilla +rss scan4mail scientificcommons scirus scooter Scrapy\/\d +ScoutJet ^scrutiny\/\d SearchBloxIntra +sfFeedReader\/0\.9 shoutcast +Site24x7 SkypeUriPreview slurp sogou speedy +sqlmap +SrceDAMP Strider summify sunrise @@ -163,19 +270,30 @@ T\-H\-U\-N\-D\-E\-R\-S\-T\-O\-N\-E tailrank Teleport(\s|\+)Pro Teoma +The[\+\s]Knowledge[\+\s]AI titan ^Traackr\.com$ +Trello Trove +Turnitin twiceler +Typhoeus ucsd ultraseek +^undefined$ +^unknown$ +Unpaywall +URL2File urlaliasbuilder +urllib +^user.?agent$ +^User-Agent validator virus.detector voila ^voltron$ voyager\/ -w3af.org +w3af\.org Wanadoo Web(\s|\+)Downloader WebCloner @@ -186,16 +304,24 @@ weblayers Webmetrics webmirror webmon +weborama-fetcher webreaper WebStripper WebZIP +Wget +WhatsApp +wordpress worm -www.gnip.com -WWW\-Mechanize +www\.gnip\.com +WWW-Mechanize xenu y!j yacy yahoo yandex +Yeti\/\d +Zabbix +ZoteroTranslationServer zeus zyborg +7siters diff --git a/scripts/update-lists.py b/scripts/update-lists.py index 03904dd..e4926fa 100644 --- a/scripts/update-lists.py +++ b/scripts/update-lists.py @@ -31,8 +31,8 @@ 'https://raw.githubusercontent.com/CDLUC3/Make-Data-Count/' 'master/user-agents/lists/machine.txt'), ('robot.txt', - 'https://raw.githubusercontent.com/CDLUC3/Make-Data-Count/' - 'master/user-agents/lists/robot.txt'), + 'https://raw.githubusercontent.com/atmire/COUNTER-Robots/' + 'master/generated/COUNTER_Robots_list.txt'), ] def _get_package_path(filename): diff --git a/tests/test_robots.py b/tests/test_robots.py index 7d6b2fa..f5b81f9 100644 --- a/tests/test_robots.py +++ b/tests/test_robots.py @@ -20,7 +20,7 @@ def test_version(): def test_is_robot(): - machine_ua = 'Wget/1.14 (linux-gnu)' + machine_ua = 'PostmanRuntime/7.30.0' robot_ua = 'AdsBot-Google (+http://www.google.com/adsbot.html)' assert is_robot(machine_ua) is not True assert is_robot(robot_ua) is True