From b1211209799d8aec743b2ed53abab673dd57fee7 Mon Sep 17 00:00:00 2001 From: Graham Plumb Date: Tue, 30 Apr 2019 17:32:12 +0100 Subject: [PATCH] Bot additions and removal of false positives Up to https://github.com/JayBizzle/Crawler-Detect/pull/325 --- .../NetCrawlerDetect.Tests.csproj | 2 +- .../NetCrawlerDetect.Tests/crawlers.txt | 35 +++++++++++++++++- .../NetCrawlerDetect.Tests/devices.txt | 6 +-- NetCrawlerDetect/NetCrawlerDetect.sln | 5 ++- .../NetCrawlerDetect/Fixtures/Crawlers.cs | 37 +++++++++++++++++-- .../NetCrawlerDetect/Fixtures/Exclusions.cs | 1 + .../NetCrawlerDetect/NetCrawlerDetect.csproj | 6 +-- 7 files changed, 78 insertions(+), 14 deletions(-) diff --git a/NetCrawlerDetect/NetCrawlerDetect.Tests/NetCrawlerDetect.Tests.csproj b/NetCrawlerDetect/NetCrawlerDetect.Tests/NetCrawlerDetect.Tests.csproj index 58f4e6e..ab31da3 100644 --- a/NetCrawlerDetect/NetCrawlerDetect.Tests/NetCrawlerDetect.Tests.csproj +++ b/NetCrawlerDetect/NetCrawlerDetect.Tests/NetCrawlerDetect.Tests.csproj @@ -4,7 +4,7 @@ netcoreapp2.0 false - 0.1.9 + 0.2.0 diff --git a/NetCrawlerDetect/NetCrawlerDetect.Tests/crawlers.txt b/NetCrawlerDetect/NetCrawlerDetect.Tests/crawlers.txt index 113b75f..4670322 100644 --- a/NetCrawlerDetect/NetCrawlerDetect.Tests/crawlers.txt +++ b/NetCrawlerDetect/NetCrawlerDetect.Tests/crawlers.txt @@ -3410,4 +3410,37 @@ Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0 ; BacklinkHttpStatus) SilverReader/1.0; http://silverreader.com ExtractorPro WebsiteExtractor -Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.202 Safari/535.1 google_partner_monitoring FWSzVTDDBz14547302713138T \ No newline at end of file +Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.202 Safari/535.1 google_partner_monitoring FWSzVTDDBz14547302713138T +Outlook-iOS/696.1188109.prod.iphone (2.102.0) +Polymail/1.47 (ctrlla.Polymail; build:64; OS X 10.12.6) Alamofire/4.5.1 +nyawc/1.8.1 CPython/3.6.5 Linux/3.10.0-862.9.1.el7.x86_64 +commonscan.org cralwer v1.01 +AdminLabs +WebGazer/1.0 (+https://www.webgazer.io) +HappyApps-WebCheck/1.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6; +feeder.co) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36 +Mozilla/5.0 (compatible; TestURI; +http://testuri.org/) +WebSniffer/1.0 (+http://websniffer.cc/) +Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1 SeoSiteCheckup (https://seositecheckup.co +Adstxtaggregator.com/1.0 +Discourse Forum Onebox v2.0.0.beta10 +faviconarchive.org +GreatNews/1.0 +MemGator:1.0-rc2 <@WebSciDL> +URLTester/1 CFNetwork/974.1 Darwin/18.0.0 (x86_64) +URLTester/1 CFNetwork/975.0.3 Darwin/18.2.0 (x86_64) +Urlcheckr/2.0 +SimpleChecker +reqwest/0.9.9 +ReactorNetty/0.7.10.RELEASE +Mozilla/5.0 (compatible; woorankreview/2.0; +https://www.woorank.com/) +https://www.nominet.uk/privacy-notice + UnChaos From Chaos To Order Hybrid Web Search Engine.(vadim_gonchar@unchaos.com) +Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36(KHTML, like Gecko) Chrome/69.0.3464.0 Safari/537.36 Chrome-Lighthouse +Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5 Build/MRA58N) AppleWebKit/537.36(KHTML, like Gecko) Chrome/61.0.3116.0 Mobile Safari/537.36 Chrome-Lighthouse +Trackuity (+http://trackuity.com) +deeris/1.0 (+http://www.deeris.com) +Mediametric +TextRazor Downloader (https://www.textrazor.com) +ddline.cn rank history +Web spyder \ No newline at end of file diff --git a/NetCrawlerDetect/NetCrawlerDetect.Tests/devices.txt b/NetCrawlerDetect/NetCrawlerDetect.Tests/devices.txt index c82550b..0fae60b 100644 --- a/NetCrawlerDetect/NetCrawlerDetect.Tests/devices.txt +++ b/NetCrawlerDetect/NetCrawlerDetect.Tests/devices.txt @@ -30260,7 +30260,6 @@ Mozilla/5.0 (Linux; Android 4.4.4; GT-I9300 Build/KTU84Q) AppleWebKit/537.36 (KH Mozilla/5.0 (Linux; Android 4.4.4; G7-L01 Build/HuaweiG7-L01) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.89 Mobile Safari/537.36 Mozilla/5.0 (iPhone; CPU iPhone OS 10_2 like Mac OS X) AppleWebKit/602.3.12 (KHTML, like Gecko) Mobile/14C92 (5298492416) Mozilla/5.0 (Linux; Android 6.0.1; Nexus 6 Build/MMB29V) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.91 Mobile Safari/537.36 -Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.110 Safari/537.36 Mediametric Mozilla/5.0 (Linux; U; Android 5.1; en-US; SLIDE2 Build/LMY47I) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 UCBrowser/10.9.0.731 U3/0.8.0 Mobile Safari/534.30 Mozilla/5.0 (Linux; U; Android 4.0.3; en-us; Sony Tablet S Build/TISU0143) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Safari/534.30 Opera/9.80 (SpreadTrum; Opera Mini/4.4.31492/37.8678; U; en) Presto/2.12.423 Version/12.16 @@ -122332,7 +122331,6 @@ Mozilla/5.0 (iPhone; CPU iPhone OS 11_1_2 like Mac OS X) AppleWebKit/604.3.5 (KH Mozilla/5.0 (iPhone; CPU iPhone OS 11_1 like Mac OS X) AppleWebKit/604.3.5 (KHTML, like Gecko) Mobile/15B93 (4347460352) Mozilla/5.0 (iPhone; CPU iPhone OS 11_0_1 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Mobile/15A402 (4310753168) Mozilla/5.0 (iPhone; CPU iPhone OS 11_2_2 like Mac OS X) AppleWebKit/604.4.7 (KHTML, like Gecko) Mobile/15C202 (4695841712) -Polymail/1.47 (ctrlla.Polymail; build:64; OS X 10.12.6) Alamofire/4.5.1 Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_3 like Mac OS X) AppleWebKit/603.3.8 (KHTML, like Gecko) Mobile/14G60 (4300266224) Mozilla/5.0 (iPhone; CPU iPhone OS 11_0_3 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Mobile/15A432 (4761665200) Mozilla/5.0 (Linux; Android 6.0.1; SM-G900V Build/MMB29M; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/52.0.2743.98 Mobile Safari/537.36 @@ -156257,7 +156255,6 @@ Mozilla/5.0 (X11; CrOS x86_64 10032.86.0) AppleWebKit/537.36 (KHTML, like Gecko) Mozilla/5.0 (iPhone; CPU iPhone OS 11_2_6 like Mac OS X) AppleWebKit/604.5.6 (KHTML, like Gecko) Mobile/15D100 [FBAN/FBIOS;FBAV/170.1.0.80.91;FBBV/106613464;FBDV/iPhone10,3;FBMD/iPhone;FBSN/iOS;FBSV/11.2.6;FBSS/3;FBCR/giffgaff;FBID/phone;FBLC/en_GB;FBOP/5;FBRV/107080238] Mozilla/5.0 (Linux; Android 7.1.2; Swift 2 X Build/N2G47Z) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.109 Mobile Safari/537.36 Mozilla/5.0 (Linux; Android 6.0; XT1700 Build/MRA58K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.109 Mobile Safari/537.36 -Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5 Build/MRA58N) AppleWebKit/537.36(KHTML, like Gecko) Chrome/61.0.3116.0 Mobile Safari/537.36 Chrome-Lighthouse Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3278.0 Safari/537.36 Mozilla/5.0 (iPhone; CPU iPhone OS 11_2_1 like Mac OS X) AppleWebKit/604.4.7 (KHTML, like Gecko) Mobile/15C153 [FBAN/FBIOS;FBAV/164.0.0.56.96;FBBV/98434650;FBDV/iPhone9,3;FBMD/iPhone;FBSN/iOS;FBSV/11.2.1;FBSS/2;FBCR/O2;FBID/phone;FBLC/en_GB;FBOP/5;FBRV/100018292] Mozilla/5.0 (X11; Linux i686) AppleWebKit/538.1 (KHTML, like Gecko) QupZilla/1.8.6 Safari/538.1 @@ -161339,7 +161336,6 @@ Mozilla/5.0 (Linux; Android 5.0.2; A0001 Build/LRX22G) AppleWebKit/537.36 (KHTML Mozilla/5.0 (iPhone; CPU iPhone OS 11_2_6 like Mac OS X) AppleWebKit/604.5.6 (KHTML, like Gecko) Mobile/15D100 [FBAN/FBIOS;FBAV/162.0.0.47.94;FBBV/95649710;FBDV/iPhone8,4;FBMD/iPhone;FBSN/iOS;FBSV/11.2.6;FBSS/2;FBCR/Virgin;FBID/phone;FBLC/en_GB;FBOP/5;FBRV/97201607] Mozilla/5.0 (iPhone; CPU iPhone OS 11_2_6 like Mac OS X) AppleWebKit/604.5.6 (KHTML, like Gecko) Mobile/15D100 [FBAN/FBIOS;FBAV/164.0.0.56.96;FBBV/98434650;FBDV/iPhone8,4;FBMD/iPhone;FBSN/iOS;FBSV/11.2.6;FBSS/2;FBCR/giffgaff;FBID/phone;FBLC/en_GB;FBOP/5;FBRV/100018292] Mozilla/5.0 (X11; U; Linux i686 (x86_64); en-US; rv:1.9.1b3) Gecko/20090305 Firefox/3.1b3 -Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36(KHTML, like Gecko) Chrome/61.0.3116.0 Safari/537.36 Chrome-Lighthouse Mozilla/5.0 (Linux; Android 7.1.1; ONEPLUS A3003 Build/NMF26F; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/65.0.3325.109 Mobile Safari/537.36 [FB_IAB/FB4A;FBAV/164.0.0.44.95;] Mozilla/5.0 (Linux; Android 5.1.1; SM-G361F Build/LMY48B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Mobile Safari/537.36 Mozilla/5.0 (Linux; Android 8.0.0; F8331 Build/41.3.A.0.401; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/64.0.3282.137 Mobile Safari/537.36 @@ -165981,3 +165977,5 @@ Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/604.4.7 (KHTML, like Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Mozilla/5.0 (Linux; Android 8.1.0; Pixel Build/OPM4.171019.016.B1; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/66.0.3359.158 Mobile Safari/537.36 [FB_IAB/FB4A;FBAV/171.0.0.49.92;] Mozilla/5.0 (iPhone; CPU iPhone OS 11_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E302 [FBAN/FBIOS;FBAV/171.0.0.49.95;FBBV/107251038;FBDV/iPhone7,2;FBMD/iPhone;FBSN/iOS;FBSV/11.3.1;FBSS/2;FBCR/EE;FBID/phone;FBLC/en_GB;FBOP/5;FBRV/108659124] +Mozilla/5.0 (Linux; Android 5.1; CRONO 54 Build/LMY47I) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.111 Mobile Safari/537.36 +Mozilla/5.0 (Linux; Android 4.2.1; MAJESTIC Zeus21 Build/JOP40D) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.72 Mobile Safari/537.36 \ No newline at end of file diff --git a/NetCrawlerDetect/NetCrawlerDetect.sln b/NetCrawlerDetect/NetCrawlerDetect.sln index ca3966d..791d203 100644 --- a/NetCrawlerDetect/NetCrawlerDetect.sln +++ b/NetCrawlerDetect/NetCrawlerDetect.sln @@ -22,8 +22,11 @@ Global EndGlobalSection GlobalSection(MonoDevelopProperties) = preSolution description = A .net standard port of JayBizzle's CrawlerDetect project (https://github.com/JayBizzle/Crawler-Detect). - version = 0.1.9 + version = 0.2.0 Policies = $0 $0.VersionControlPolicy = $1 + $0.DotNetNamingPolicy = $2 + $2.DirectoryNamespaceAssociation = PrefixedHierarchical + $0.StandardHeader = $3 EndGlobalSection EndGlobal diff --git a/NetCrawlerDetect/NetCrawlerDetect/Fixtures/Crawlers.cs b/NetCrawlerDetect/NetCrawlerDetect/Fixtures/Crawlers.cs index dd10c59..3ae017b 100644 --- a/NetCrawlerDetect/NetCrawlerDetect/Fixtures/Crawlers.cs +++ b/NetCrawlerDetect/NetCrawlerDetect/Fixtures/Crawlers.cs @@ -15,7 +15,7 @@ public Crawlers() // Collection of user-agent regex snippets _data = new List() { - @"outbrain", + @".*Java.*outbrain", @" YLT", @"^b0t$", @"^bluefish ", @@ -64,8 +64,10 @@ public Crawlers() @"adbeat", @"AddThis", @"ADmantX", + @"AdminLabs", @"adressendeutschland", @"adscanner", + @"Adstxtaggregator", @"agentslug", @"AHC", @"aihit", @@ -214,6 +216,7 @@ public Crawlers() @"colly -", @"CommaFeed", @"Commons-HttpClient", + @"commonscan", @"contactbigdatafr", @"contentkingapp", @"convera", @@ -242,6 +245,8 @@ public Crawlers() @"dataprovider", @"DataXu", @"Daum(oa)?[ \/][0-9]", + @"ddline", + @"deeris", @"Demon", @"DeuSu", @"developers\.google\.com\/\+\/web\/snippet\/", @@ -250,6 +255,7 @@ public Crawlers() @"Digincore", @"DigitalPebble", @"Dirbuster", + @"Discourse Forum Onebox", @"Disqus\/", @"Dispatch\/", @"DittoSpyder", @@ -314,6 +320,7 @@ public Crawlers() @"Faveeo", @"Favicon downloader", @"faviconkit", + @"faviconarchive", @"FavOrg", @"Feed Wrangler", @"Feedable\/", @@ -322,6 +329,7 @@ public Crawlers() @"FeedBucket", @"FeedBunch\/", @"FeedBurner", + @"feeder", @"Feedly", @"FeedshowOnline", @"Feedspot", @@ -346,7 +354,7 @@ public Crawlers() @"forensiq", @"FoundSeoTool", - // 'Francis [Bot]' + //'Francis [Bot]' @"http:\/\/www.neomo.de\/", @"free thumbnails", @"Freeuploader", @@ -416,6 +424,7 @@ public Crawlers() @"Grafula", @"Grammarly", @"GrapeFX", + @"GreatNews", @"Gregarius", @"GRequests", @"grokkit", @@ -430,6 +439,7 @@ public Crawlers() @"Haansoft", @"hackney\/", @"Hadi Agent", + @"HappyApps-WebCheck", @"Hatena", @"Havij", @"HeadlessChrome", @@ -476,7 +486,6 @@ public Crawlers() @"huaweisymantec", @"HubSpot ", @"Humanlinks", - @"HyperZbozi\.cz Feeder", @"i2kconnect\/", @"Iblog", @"ichiro", @@ -577,6 +586,7 @@ public Crawlers() @"Licorne", @"Liferea\/", @"Lightspeedsystems", + @"Lighthouse", @"Likse", @"Link Valet", @"link_thumbnailer", @@ -620,11 +630,13 @@ public Crawlers() @"Mass Downloader", @"masscan\/", @"Mata Hari", + @"Mediametric", @"Mediapartners-Google", @"mediawords", @"MegaIndex\.ru", @"MeltwaterNews", @"Melvil Rawi", + @"MemGator", @"Metaspinner", @"MetaURI", @"MFC_Tear_Sample", @@ -708,6 +720,7 @@ public Crawlers() @"Nodemeter", @"NodePing", @"nominet\.org\.uk", + @"nominet\.uk", @"Norton-Safeweb", @"Notifixious", @"notifyninja", @@ -715,6 +728,7 @@ public Crawlers() @"nutch", @"Nuzzel", @"nWormFeedFinder", + @"nyawc\/", @"Nymesis", @"NYU", @"Ocelli\/", @@ -737,6 +751,7 @@ public Crawlers() @"OrgProbe\/", @"orion-semantics", @"Outlook-Express", + @"Outlook-iOS", @"ow\.ly", @"Owler", @"ownCloud News", @@ -789,6 +804,7 @@ public Crawlers() @"PocketParser", @"Pockey", @"POE-Component-Client-HTTP", + @"Polymail\/", @"Pompos", @"Porkbun", @"Port Monitor", @@ -830,6 +846,7 @@ public Crawlers() @"RankFlex", @"RankSonicSiteAuditor", @"Re-re Studio", + @"ReactorNetty", @"Readability", @"RealDownload", @"RealPlayer%20Downloader", @@ -841,6 +858,7 @@ public Crawlers() @"ReGet", @"RepoMonkey", @"request\.js", + @"reqwest\/", @"ResponseCodeTest", @"RestSharp", @"Riddler", @@ -894,6 +912,7 @@ public Crawlers() @"SEOprofiler", @"SEOsearch", @"seoscanners", + @"seositecheckup", @"SEOstats", @"servernfo", @"sexsearcher", @@ -1001,7 +1020,9 @@ public Crawlers() @"teoma", @"terrainformatica", @"Test Certificate Info", + @"testuri", @"Tetrahedron", + @"TextRazor Downloader", @"The Drop Reaper", @"The Expert HTML Source Viewer", @"The Knowledge AI", @@ -1020,6 +1041,7 @@ public Crawlers() @"touche\.com", @"Traackr\.com", @"tracemyfile", + @"Trackuity", @"TrapitAgent", @"Trendiction", @"Trendsmap", @@ -1041,6 +1063,7 @@ public Crawlers() @"ubermetrics-technologies", @"uclassify", @"UdmSearch", + @"unchaos", @"unirest-java", @"UniversalFeedParser", @"Unshorten\.It", @@ -1050,10 +1073,12 @@ public Crawlers() @"updown\.io daemon", @"Upflow", @"Uptimia", + @"Urlcheckr", @"URL Verifier", @"URLitor", @"urlresolver", @"Urlstat", + @"URLTester", @"UrlTrends Ranking Updater", @"URLy Warning", @"URLy\.Warning", @@ -1096,6 +1121,7 @@ public Crawlers() @"Web Fuck", @"Web Pix", @"Web Sauger", + @"Web spyder", @"Web Sucker", @"Webalta", @"Webauskunft", @@ -1111,6 +1137,7 @@ public Crawlers() @"WebEnhancer", @"WebFetch", @"WebFuck", + @"WebGazer", @"WebGo IS", @"WebImageCollector", @"WebImages", @@ -1129,6 +1156,7 @@ public Crawlers() @"websitepulse agent", @"WebsiteQuester", @"Websnapr", + @"WebSniffer", @"Webster", @"WebStripper", @"WebSucker", @@ -1154,6 +1182,7 @@ public Crawlers() @"wkhtmlto", @"wmtips", @"Woko", + @"woorankreview", @"Word\/", @"WordPress\/", @"WordupinfoSearch", @@ -1203,7 +1232,7 @@ public Crawlers() @"Zend_Http_Client", @"Zend\\\\Http\\\\Client", @"Zermelo", - @"Zeus", + @"Zeus ", @"zgrab", @"ZnajdzFoto", @"Zombie\.js", diff --git a/NetCrawlerDetect/NetCrawlerDetect/Fixtures/Exclusions.cs b/NetCrawlerDetect/NetCrawlerDetect/Fixtures/Exclusions.cs index 344ac4c..6ab8248 100644 --- a/NetCrawlerDetect/NetCrawlerDetect/Fixtures/Exclusions.cs +++ b/NetCrawlerDetect/NetCrawlerDetect/Fixtures/Exclusions.cs @@ -58,6 +58,7 @@ public Exclusions() @" \.NET[\d\.]*", @"cubot", @"; M bot", + @"; CRONO", @"; B bot", @"; IDbot", @"; ID bot", diff --git a/NetCrawlerDetect/NetCrawlerDetect/NetCrawlerDetect.csproj b/NetCrawlerDetect/NetCrawlerDetect/NetCrawlerDetect.csproj index bf0be32..3307dd6 100644 --- a/NetCrawlerDetect/NetCrawlerDetect/NetCrawlerDetect.csproj +++ b/NetCrawlerDetect/NetCrawlerDetect/NetCrawlerDetect.csproj @@ -2,8 +2,8 @@ netstandard2.0 - 0.1.9 - 0.1.9 + 0.2.0 + 0.2.0 Graham "Gee" Plumb https://github.com/gplumb/NetCrawlerDetect/blob/master/LICENSE Graham "Gee" Plumb @@ -12,7 +12,7 @@ A .net standard port of JayBizzle's CrawlerDetect project (https://github.com/JayBizzle/Crawler-Detect). true NetCrawlerDetect - Performance and general clean up (https://github.com/JayBizzle/Crawler-Detect/pull/312) + Bot additions and removal of false positives (up to https://github.com/JayBizzle/Crawler-Detect/pull/325) A .net standard port of JayBizzle's CrawlerDetect project (https://github.com/JayBizzle/Crawler-Detect).