From 0319fcf00bfd8ae3e6c434df3eb6fd64bacf4293 Mon Sep 17 00:00:00 2001 From: crackcomm Date: Sun, 17 Jan 2016 11:59:56 +0100 Subject: [PATCH 1/2] bot: check for bot keywords in user agent --- bot.go | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/bot.go b/bot.go index df0b6cc..8780bf4 100644 --- a/bot.go +++ b/bot.go @@ -75,6 +75,8 @@ func (p *UserAgent) fixOther(sections []section) { } } +var botRegex = regexp.MustCompile("(?i)(bot|crawler|sp(i|y)der|search|worm|fetch|nutch)") + // Check if we're dealing with a bot or with some weird browser. If that is the // case, the receiver will be modified accordingly. func (p *UserAgent) checkBot(sections []section) { @@ -83,9 +85,8 @@ func (p *UserAgent) checkBot(sections []section) { if len(sections) == 1 && sections[0].name != "Mozilla" { p.mozilla = "" - // Check whether the name has some suspicious "bot" in his name. - reg, _ := regexp.Compile("(?i)bot") - if reg.Match([]byte(sections[0].name)) { + // Check whether the name contains a suspicious keyword such as "bot" or "crawler". 
+ if botRegex.Match([]byte(sections[0].name)) { p.setSimple(sections[0].name, "", true) return } From e25e612b37a4206633f33fe4a6810d3f9a17c745 Mon Sep 17 00:00:00 2001 From: crackcomm Date: Sun, 17 Jan 2016 12:38:43 +0100 Subject: [PATCH 2/2] tests: more robots --- all_test.go | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/all_test.go b/all_test.go index 2844d48..ae1b5c8 100644 --- a/all_test.go +++ b/all_test.go @@ -62,6 +62,26 @@ var uastrings = []struct { ua: "Facebot", expected: "Browser:Facebot Bot:true Mobile:false", }, + { + title: "NutchCVS", + ua: "NutchCVS/0.8-dev (Nutch; http://lucene.apache.org/nutch/bot.html; nutch-agent@lucene.apache.org)", + expected: "Browser:NutchCVS Bot:true Mobile:false", + }, + { + title: "MJ12bot", + ua: "Mozilla/5.0 (compatible; MJ12bot/v1.2.4; http://www.majestic12.co.uk/bot.php?+)", + expected: "Mozilla:5.0 Browser:MJ12bot-v1.2.4 Bot:true Mobile:false", + }, + { + title: "MJ12bot", + ua: "MJ12bot/v1.0.8 (http://majestic12.co.uk/bot.php?+)", + expected: "Browser:MJ12bot Bot:true Mobile:false", + }, + { + title: "AhrefsBot", + ua: "Mozilla/5.0 (compatible; AhrefsBot/4.0; +http://ahrefs.com/robot/)", + expected: "Mozilla:5.0 Browser:AhrefsBot-4.0 Bot:true Mobile:false", + }, // Internet Explorer {