[safety] Support the malwaredomains.com blacklist

malwaredomains.com is a blacklist of known malware domains. This commit adds support for this blacklist, and a new mode for the module to preform local lookups only - for private channels where privacy is important.
maxpowa · Sep 7, 2014 · cead760 · cead760
1 parent 0168406
commit cead760
Showing 1 changed file with 69 additions and 31 deletions.
diff --git a/safety.py b/safety.py
@@ -6,8 +6,8 @@
 
 This module uses virustotal.com
 """
-#TODO malwaredomains.com support
 from __future__ import unicode_literals
+from __future__ import print_function
 import willie.web as web
 from willie.config import ConfigurationError
 from willie.formatting import color, bold
@@ -16,11 +16,18 @@
 import sys
 import json
 import time
+import os.path
 
 if sys.version_info.major > 2:
     unicode = str
+    from urllib.request import urlretrieve
+    from urllib.parse import urlparse
+else:
+    from urllib import urlretrieve
+    from urlparse import urlparse
 
 vt_base_api_url = 'https://www.virustotal.com/vtapi/v2/url/'
+malware_domains = []
 
 
 def configure(config):
@@ -31,7 +38,7 @@ def configure(config):
     | enabled_by_default | True | Enable safety on implicity on channels |
     | vt_api_key | ea4ca709a686edfcc96a144c224935776e2ba46b77 | VirusTotal API key |
     """
-    if config.option('Configure malicious URL protection'):
+    if config.option('Configure malicious URL protection?'):
         config.add_section('safety')
         config.add_option('safety', 'enabled_by_default', 'Enable malicious URL checking for channels by default?', True)
         config.interactive_add('safety', 'vt_api_key', 'VirusTotal API Key (not mandatory)', None)
@@ -43,15 +50,32 @@ def setup(bot):
     if bot.db and not bot.db.preferences.has_columns('safety'):
         bot.db.preferences.add_columns(['safety'])
     bot.memory['safety_cache'] = willie.tools.WillieMemory()
+    loc = os.path.join(bot.config.homedir, 'malwaredomains.txt')
+    if os.path.isfile(loc):
+        if os.path.getmtime(loc) < time.time() - 24 * 60 * 60 * 7:
+            # File exists but older than one week, update
+            _download_malwaredomains_db(loc)
+    else:
+        _download_malwaredomains_db(loc)
+    with open(loc, 'r') as f:
+        for line in f:
+            malware_domains.append(unicode(line).strip())
+
+
+def _download_malwaredomains_db(path):
+    print('Downloading malwaredomains db…')
+    urlretrieve('http://mirror1.malwaredomains.com/files/justdomains', path)
 
 
 @willie.module.rule('(?u).*(https?://\S+).*')
+@willie.module.priority('high')
 def url_handler(bot, trigger):
     """ Check for malicious URLs """
-    check = True  # Enable URL checking
+    check = True    # Enable URL checking
     strict = False  # Strict mode: kick on malicious URL
-    positives = 0  # Number of engines saying it's malicious
-    total = 0  # Number of total engines
+    positives = 0   # Number of engines saying it's malicious
+    total = 0       # Number of total engines
+    use_vt = True   # Use VirusTotal
     if bot.config.has_section('safety'):
         check = bot.config.safety.enabled_by_default
         if check is None:
@@ -64,41 +88,55 @@ def url_handler(bot, trigger):
         setting = bot.db.preferences.get(trigger.sender.lower(), 'safety')
         if setting == 'off':
             return  # Not checking
-        elif setting in ['on', 'strict']:
+        elif setting in ['on', 'strict', 'local', 'local strict']:
             check = True
-        if setting == 'strict':
+        if setting == 'strict' or setting == 'local strict':
             strict = True
+        if setting == 'local' or setting == 'local strict':
+            use_vt = False
 
     if not check:
         return  # Not overriden by DB, configured default off
 
     apikey = bot.config.safety.vt_api_key
-    if apikey is not None:
-        payload = {'resource': unicode(trigger), 'apikey': apikey, 'scan': '1'}
-
-        if trigger not in bot.memory['safety_cache']:
-            result = web.post(vt_base_api_url+'report', payload)
-            if sys.version_info.major > 2:
-                result = result.decode('utf-8')
-            result = json.loads(result)
-            age = time.time()
-            bot.memory['safety_cache'][trigger] = {'positives':
-                                                   result['positives'],
-                                                   'total': result['total'],
-                                                   'age': age}
-            if len(bot.memory['safety_cache']) > 1024:
-                _clean_cache(bot)
-        else:
-            print('using cache')
-            result = bot.memory['safety_cache'][trigger]
-        positives = result['positives']
-        total = result['total']
-        perecent_sure = '{}%'.format(round((positives / total) * 100))
+    try:
+        if apikey is not None and use_vt:
+            payload = {'resource': unicode(trigger),
+                       'apikey': apikey,
+                       'scan': '1'}
+
+            if trigger not in bot.memory['safety_cache']:
+                result = web.post(vt_base_api_url+'report', payload)
+                if sys.version_info.major > 2:
+                    result = result.decode('utf-8')
+                result = json.loads(result)
+                age = time.time()
+                data = {'positives': result['positives'],
+                        'total': result['total'],
+                        'age': age}
+                bot.memory['safety_cache'][trigger] = data
+                if len(bot.memory['safety_cache']) > 1024:
+                    _clean_cache(bot)
+            else:
+                print('using cache')
+                result = bot.memory['safety_cache'][trigger]
+            positives = result['positives']
+            total = result['total']
+    except Exception as e:
+        bot.debug('[safety]', e, 'debug')
+        pass  # Ignoring exceptions with VT so MalwareDomains will always work
+
+    if unicode(urlparse(trigger).netloc) in malware_domains:
+        # malwaredomains is more trustworthy than some VT engines
+        # therefor it gets a weight of 10 engines when calculating confidence
+        positives += 10
+        total += 10
 
     if positives > 1:
         # Possibly malicious URL detected!
+        confidence = '{}%'.format(round((positives / total) * 100))
         msg = 'link posted by %s is possibliy malicious ' % bold(trigger.nick)
-        msg += '(confidence %s - %s/%s)' % (perecent_sure, positives, total)
+        msg += '(confidence %s - %s/%s)' % (confidence, positives, total)
         bot.say('[' + bold(color('WARNING', 'red')) + '] ' + msg)
         if strict:
             bot.write(['KICK', trigger.sender, trigger.nick,
@@ -107,8 +145,8 @@ def url_handler(bot, trigger):
 
 @willie.module.commands('safety')
 def toggle_safety(bot, trigger):
-    """ Toggle safety setting for channel """
-    allowed_states = ['strict', 'on', 'off']
+    """ Set safety setting for channel """
+    allowed_states = ['strict', 'on', 'off', 'local', 'local strict']
     if not trigger.group(2) or trigger.group(2).lower() not in allowed_states:
         bot.reply('Available options: strict / on /off')
         return