Skip to content

Commit

Permalink
Update robots list file and script
Browse files Browse the repository at this point in the history
  • Loading branch information
sakshamarora1 authored and ntarocco committed Dec 18, 2024
1 parent 5cd5e89 commit f751b04
Show file tree
Hide file tree
Showing 3 changed files with 142 additions and 16 deletions.
152 changes: 139 additions & 13 deletions counter_robots/data/robot.txt
Original file line number Diff line number Diff line change
@@ -1,99 +1,163 @@
bot
^Buck\/[0-9]
spider
crawl
^.?$
[^a]fish
^voyager\/
^IDA$
^ruby$
^@ozilla\/\d
^脝脝陆芒潞贸碌脛$
^破解后的$
AddThis
A6-Indexer
ADmantX
alexa
Alexandria(\s|\+)prototype(\s|\+)project
AllenTrack
almaden
AntBot
appie
API[\+\s]scraper
Arachni
Arachmo
architext
ArchiveTeam
aria2\/\d
arks
^Array$
asterias
atomz
axios\/\d
BDFetch
Betsie
baidu
biglotron
BingPreview
binlar
bjaaland
Blackboard[\+\s]Safeassign
blaiz\-bee
blaiz-bee
bloglines
blogpulse
boitho\.com\-dc
bookmark\-manager
boitho\.com-dc
bookmark-manager
Brutus\/AET
BUbiNG
bwh3_user_agent
CakePHP
celestial
centuryb
cfnetwork
checklink
checkprivacy
China\sLocal\sBrowse\s2\.6
Citoid
ClaudeBot
cloakDetect
coccoc\/1\.0
Code\sSample\sWeb\sClient
ColdFusion
collection@infegy.com
com\.plumanalytics
combine
contentmatch
ContentSmartz
convera
core
Cortana
CoverScout
crusty\/\d
curl\/
cursor
custo
DataCha0s\/2\.0
daumoa
daum(oa)?
^\%?default\%?$
DeuSu\/
Dispatch\/\d
Docoloc
docomo
Download\+Master
Drupal
DSurf
DTS Agent
EasyBib[\+\s]AutoCite[\+\s]
easydl
EBSCO\sEJS\sContent\sServer
EcoSearch
ELinks\/
EmailSiphon
EmailWolf
Embedly
EThOS\+\(British\+Library\)
facebookexternalhit\/
favorg
Faveeo\/\d
FDM(\s|\+)\d
Feedbin
feedburner
FeedFetcher
feedreader
ferret
Fetch(\s|\+)API(\s|\+)Request
findlinks
findthatfile
^FileDown$
^Filter$
^firefox$
^FOCA
^FreshpingBot\/1.0 \(\+https:\/\/freshping\.io\/\)$
Fulltext
Funnelback
Genieo
GetRight
geturl
GigablastOpenSource
G-i-g-a-b-o-t
GLMSLinkAnalysis
Goldfire(\s|\+)Server
google
Grammarly
GroupHigh\/\d
grub
gulliver
gvfs\/
harvest
heritrix
holmes
htdig
htmlparser
HeadlessChrome
HttpComponents\/1.1
HTTPFetcher
http.?client
httpget
httpx
httrack
ia_archiver
ichiro
iktomi
ilse
Indy Library
insomnia
^integrity\/\d
internetseer
intute
iSiloX
iskanie
^java\/\d{1,2}.\d
jeeves
Jersey\/\d
jobo
Koha
kyluka
larbin
libcurl
libhttp
libwww
lilina
^LinkAnalyser
link.?check
LinkLint-checkonly
^LinkParser\/
Expand All @@ -104,57 +168,100 @@ linkwalker
lipperhey
livejournal\.com
LOCKSS
LongURL.API
ltx71
lycos[\_\+]
mail.ru
mediapartners\-google
lwp
lycos[_+]
MaCoCu
mail\.ru
MarcEdit
mediapartners-google
megite
MetaInspector
MetaURI[\+\s]API\/\d\.\d
Microsoft(\s|\+)URL(\s|\+)Control
Microsoft Office Existence Discovery
Microsoft Office Protocol Discovery
Microsoft-WebDAV-MiniRedir
mimas
mnogosearch
moget
motor
^Mozilla$
^Mozilla.4\.0$
^Mozilla\/4\.0\+\(compatible;\)$
^Mozilla\/4\.0\+\(compatible;\+ICS\)$
^Mozilla\/4\.5\+\[en]\+\(Win98;\+I\)$
^Mozilla.5\.0$
^Mozilla\/5.0\+\(compatible;\+MSIE\+6\.0;\+Windows\+NT\+5\.0\)$
^Mozilla\/5\.0(\s|\+)\(compatible;\s\+centuryb\.o\.t9\[at\]gmail\.com\)$
^Mozilla\/5\.0(\s|\+)\(compatible;\sVelenPublicWebCrawler\/1\.0;\s\+https:\/\/velen\.io\)$
^Mozilla\/5\.0\+like\+Gecko$
^Mozilla\/5.0(\s|\+)Gecko\/20100115(\s|\+)Firefox\/3.6$
^MSIE
MuscatFerre
myweb
nagios
^NetAnts\/\d
netcraft
netluchs
Ning
nettle
newspaper\/\d
ng\/2\.
^Ning\/\d
no_user_agent
nomad
nutch
^oaDOI$
ocelli
Offline(\s|\+)Navigator
OgScrper
okhttp
onetszukaj
^Opera\/4$
OurBrowser
panscient
parsijoo
EasyBib[\+\s]AutoCite[\+\s]
^Pattern\/\d
Pcore-HTTP
pear\.php\.net
perman
PHP\/
pidcheck
pioneer
playmusic\.com
playstarmusic\.com
^Postgenomic(\s|\+)v2
powermarks
proximic
PycURL
python
Qwantify
rambler
ReactorNetty\/\d
Readpaper
redalert
RestSharp
Riddler
robozilla
rss
scan4mail
scientificcommons
scirus
scooter
Scrapy\/\d
ScoutJet
^scrutiny\/\d
SearchBloxIntra
sfFeedReader\/0\.9
shoutcast
Site24x7
SkypeUriPreview
slurp
sogou
speedy
sqlmap
SrceDAMP
Strider
summify
sunrise
Expand All @@ -163,19 +270,30 @@ T\-H\-U\-N\-D\-E\-R\-S\-T\-O\-N\-E
tailrank
Teleport(\s|\+)Pro
Teoma
The[\+\s]Knowledge[\+\s]AI
titan
^Traackr\.com$
Trello
Trove
Turnitin
twiceler
Typhoeus
ucsd
ultraseek
^undefined$
^unknown$
Unpaywall
URL2File
urlaliasbuilder
urllib
^user.?agent$
^User-Agent
validator
virus.detector
voila
^voltron$
voyager\/
w3af.org
w3af\.org
Wanadoo
Web(\s|\+)Downloader
WebCloner
Expand All @@ -186,16 +304,24 @@ weblayers
Webmetrics
webmirror
webmon
weborama-fetcher
webreaper
WebStripper
WebZIP
Wget
WhatsApp
wordpress
worm
www.gnip.com
WWW\-Mechanize
www\.gnip\.com
WWW-Mechanize
xenu
y!j
yacy
yahoo
yandex
Yeti\/\d
Zabbix
ZoteroTranslationServer
zeus
zyborg
7siters
4 changes: 2 additions & 2 deletions scripts/update-lists.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@
'https://raw.githubusercontent.com/CDLUC3/Make-Data-Count/'
'master/user-agents/lists/machine.txt'),
('robot.txt',
'https://raw.githubusercontent.com/CDLUC3/Make-Data-Count/'
'master/user-agents/lists/robot.txt'),
'https://raw.githubusercontent.com/atmire/COUNTER-Robots/'
'master/generated/COUNTER_Robots_list.txt'),
]

def _get_package_path(filename):
Expand Down
2 changes: 1 addition & 1 deletion tests/test_robots.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def test_version():


def test_is_robot():
machine_ua = 'Wget/1.14 (linux-gnu)'
machine_ua = 'PostmanRuntime/7.30.0'
robot_ua = 'AdsBot-Google (+http://www.google.com/adsbot.html)'
assert is_robot(machine_ua) is not True
assert is_robot(robot_ua) is True
Expand Down

0 comments on commit f751b04

Please sign in to comment.