From fc719ffb198bf3bcd0c59bd837e05f7bd71e852a Mon Sep 17 00:00:00 2001 From: damoklov Date: Sun, 4 Oct 2020 17:08:23 +0300 Subject: [PATCH 1/4] Added .gitignore file for Python --- .gitignore | 141 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 141 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4bf2a22 --- /dev/null +++ b/.gitignore @@ -0,0 +1,141 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
+# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +.idea/ \ No newline at end of file From 2a29c31cb1d88dad50cc8aa722987c8b3e6df064 Mon Sep 17 00:00:00 2001 From: damoklov Date: Sun, 4 Oct 2020 17:08:47 +0300 Subject: [PATCH 2/4] Added initial adaptation of yabin for Python3 --- yabin3.py | 302 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 302 insertions(+) create mode 100644 yabin3.py diff --git a/yabin3.py b/yabin3.py new file mode 100644 index 0000000..66560a0 --- /dev/null +++ b/yabin3.py @@ -0,0 +1,302 @@ +''' + + YaraBin (Yara + Binary) + + This generates Yara rules from function prologs, for matching and hunting + + Questions or comments? Hit me up @chrisdoman + +''' + +import binascii +import re +import os +import argparse +import math +import sqlite3 +import hashlib + +# What percent overlap required for two malware samples considered to be the same family? +# From 0 (0%) to 1 (100%).
A large number means a tighter yara signature +# will be created +percent_tight_match = 0.8 + +seen_patterns = {} +conn = sqlite3.connect('db.db') +db = conn.cursor() +db.execute('PRAGMA synchronous=OFF') + + +def parseArguments(): + + parser = argparse.ArgumentParser( + description='Yabin - Signatures and searches malware') + parser.add_argument('-y', '--yara', help='Generate yara rule for the file or folder', required=False) + parser.add_argument('-yh', '--yaraHunt', help='Generate wide yara rule (any of, not all of).\r\n Useful for hunting for related samples or potentially malicious files that share any of the code - but liable to false positive', required=False) + parser.add_argument('-d', '--deleteDatabase',help='Empty the whitelist and malware database', action='store_true') + parser.add_argument('-w', '--addToWhitelist',help='Add a file or folder to the whitelist', required=False) + parser.add_argument('-f', '--fuzzyHash', help='Generate a fuzzy hash for the file', required=False) + parser.add_argument('-m', '--malwareAdd', help='Add malware file or folder to malware database to be searched', required=False) + parser.add_argument('-s', '--malwareSearch', help='Search for samples related to this file', required=False) + + args = vars(parser.parse_args()) + + if args['yara']: + yara(args['yara']) + if args['yaraHunt']: + yara(args['yaraHunt'], False) + if args['deleteDatabase']: + deleteDatabase() + if args['addToWhitelist']: + addToWhitelist(args['addToWhitelist']) + if args['fuzzyHash']: + fuzzyHash(args['fuzzyHash']) + if args['malwareAdd']: + addMalware(args['malwareAdd']) + if args['malwareSearch']: + malwareSearch(args['malwareSearch']) + + +def getBytePatterns(filename, ignore_whitelist=False): + with open(filename, 'rb') as f: + content = f.read() + hex = binascii.hexlify(content).decode('utf-8') + # Add - every two characters so we match -xx- not x-x + hex = 'x'.join([hex[i:i + 2] for i in range(0, len(hex), 2)]) + seen = {} + for match in 
re.findall(prolog_regex, hex): + bit = match[0].replace('x', '') + if bit not in seen: + if ignore_whitelist or not whitelisted(bit): + # Only include high entropy patterns, ie) avoid 0000000 or + # 1111111 etc. + # if entropy(bit) > 0: + seen[bit] = entropy(bit) + + return seen + + +def loadProlog(): + prolog_regex = '(' + with open('regex.txt') as file: + for l in file.readlines(): + line = l.strip() + if not line.startswith('#'): + if len(line) > 3: + prolog_regex += line + '|' + prolog_regex += ')' + prolog_regex = prolog_regex.replace('|)', ')') + return prolog_regex + +# Get the shannon entropy of a string + + +def entropy(string): + prob = [float(string.count(c)) / len(string) + for c in dict.fromkeys(list(string))] + entropy = - sum([p * math.log(p) / math.log(2.0) for p in prob]) + return entropy + + +def generateFuzzyHash(filename): + # Print out those that aren't in the whitelist + byte_patterns = getBytePatterns(filename) + patterns = [] + for s in byte_patterns: + patterns.append(s) + + patterns.sort() + # Just print the first sorted pattern... 
a vey poor mans fuzzy hash + for s in patterns: + print(filename + ',' + s) + return + + +def generateYara(filename, singleFile, tight=True, max_lines=3000, min_patterns=0): + global seen_patterns + global percent_tight_match + + # Print out those that aren't in the whitelist + byte_patterns = getBytePatterns(filename) + + if tight: + # Dont print the same rule twice + if str(byte_patterns) not in seen_patterns: + seen_patterns[str(byte_patterns)] = 1 + # If we have no, or only one pattern, it probably won't be a tight + # enough signature + if len(byte_patterns) > min_patterns: + print('rule tight_' + filename.replace('/', '_').replace('.', '') + ' {') + print(' strings:') + + count = 1 + for s in byte_patterns: + if count < max_lines: + count += 1 + print(' $a_' + str(count) + ' = { ' + s + ' }') + + print(' condition:') + tight_decimal = int(round(count * percent_tight_match)) + print(' ' + str(tight_decimal) + ' of them') + print('}') + print('\r\n\r\n') + + if not tight: + # Dont print the same rule twice + if str(byte_patterns) not in seen_patterns: + seen_patterns[str(byte_patterns)] = 1 + # If we have no, or only one pattern, it probably won't be a tight + # enough signature + if len(byte_patterns) > min_patterns: + print('rule tight_' + filename.replace('/', '_').replace('.', '') + ' {') + print(' strings:') + + count = 1 + for s in byte_patterns: + if count < max_lines: + count += 1 + print(' $a_' + str(count) + ' = { ' + s + ' }') + + print(' condition:') + tight_decimal = int(round(count * percent_tight_match)) + print(' any of them') + print('}') + print('\r\n\r\n') + + +def fuzzyHash(filename, tight=True): + if os.path.isdir(filename): + for f in os.listdir(filename): + generateFuzzyHash('./' + filename + '/' + f) + else: + if os.path.isfile(filename): + generateFuzzyHash(filename) + + +def yara(filename, tight=True): + if os.path.isdir(filename): + for f in os.listdir(filename): + generateYara('./' + filename + '/' + f, False, tight) + else: + if 
os.path.isfile(filename): + generateYara(filename, True, tight) + +# Returns true if a pattern is whitelisted + + +def whitelisted(pattern): + db.execute('SELECT * FROM whitelist WHERE pattern ="' + pattern + '"') + result = db.fetchone() + if result is None: + return False + return True + + +def addToWhitelist(folder): + # Minimum number of samples a pattern must be in + min_seen = 1 + count = 0 + + # If we dont care how often it's been seen, just insert it + if min_seen == 0: + for f in os.listdir(folder): + count = count + 1 + print('Processed ' + str(count) + ' file(s)') + print('Processing ' + f) + new_seen = getBytePatterns('./' + folder + '/' + f, True) + for pattern in new_seen: + db.execute( + 'insert or ignore into whitelist (pattern) values ("' + pattern + '")') + conn.commit() + + # Otherwise actually keep track of how many samples a pattern has been in + else: + seen = {} + # Built a count of how often every pattern was seen + for f in os.listdir(folder): + count = count + 1 + print('Processed ' + str(count) + ' file(s)') + new_seen = getBytePatterns('./' + folder + '/' + f, True) + for pattern in new_seen: + if pattern not in seen: + seen[pattern] = 1 + else: + seen[pattern] = seen[pattern] + 1 + + total = 0 + # Insert every pattern seen > x times into the whtelist + for pattern, count in seen.items(): + if count > min_seen: + total = total + 1 + db.execute('insert or ignore into whitelist (pattern) values ("' + pattern + '")') + + conn.commit() + + +def generateSample(filename): + md5 = hashlib.md5(open(filename, 'rb').read()).hexdigest() + # Print out those that aren't in the whitelist + byte_patterns = getBytePatterns(filename) + for pattern in byte_patterns: + db.execute('insert or ignore into malware (pattern, md5) values ("' + + pattern + '", "' + md5 + '")') + + +def deleteDatabase(): + db.execute('DROP TABLE IF EXISTS whitelist') + db.execute('DROP TABLE IF EXISTS malware') + db.execute('CREATE TABLE whitelist (pattern text)') + 
db.execute('CREATE UNIQUE INDEX whitelist_index on whitelist (pattern)') + db.execute('CREATE TABLE malware (pattern text, md5 text)') + db.execute('CREATE UNIQUE INDEX malware_index on malware (pattern, md5)') + + +# Add a file or folder to malware db +def addMalware(filename): + print('Adding samples to malware database') + if os.path.isdir(filename): + for f in os.listdir(filename): + generateSample('./' + filename + '/' + f) + else: + if os.path.isfile(filename): + generateSample(filename) + conn.commit() + print('Added samples') + + +# For every pattern in file, find related +def malwareSearch(filename): + md5 = hashlib.md5(open(filename,'rb').read()).hexdigest() + pattern_lookups = {} + found_samples = set() + + # Print out those that aren't in the whitelist + byte_patterns = getBytePatterns(filename) + for pattern in byte_patterns: + related_samples = findRelated(pattern) + + for sample in related_samples: + if sample not in found_samples and sample != md5: + found_samples.add(sample) + pattern_lookups[sample] = pattern + + if len(found_samples) > 0: + print('Found related samples:') + for sample in found_samples: + print(sample + ' matched via ' + pattern_lookups[sample]) + else: + print('No related samples found') + +def findRelated(pattern): + db.execute('SELECT md5 FROM malware WHERE pattern ="' + pattern + '"') + rows = db.fetchall() + toReturn = [] + for row in rows: + toReturn.append(row[0]) + + return toReturn + +# This regex decides what patterns we will extract +prolog_regex = loadProlog() + +parseArguments() From 483d55fd8e48c368f33c18d5afa3fe3c4306cf4f Mon Sep 17 00:00:00 2001 From: damoklov Date: Fri, 9 Oct 2020 14:38:23 +0300 Subject: [PATCH 3/4] Added .gitignore --- .gitignore | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 4bf2a22..e06f756 100644 --- a/.gitignore +++ b/.gitignore @@ -138,4 +138,14 @@ dmypy.json cython_debug/ # PyCharm -.idea/ \ No newline at end of file +.idea/ + +# 
VSCode +.vscode/ + +# Database file +db.db + +# Directories with samples +clean/ +malware/ From 5257cea3da9b485b343ee9a5245a220dcddba968 Mon Sep 17 00:00:00 2001 From: damoklov Date: Fri, 9 Oct 2020 14:51:05 +0300 Subject: [PATCH 4/4] Updated entropy and database entries --- yabin3.py | 106 ++++++++++++++++++++++++------------------------------ 1 file changed, 47 insertions(+), 59 deletions(-) diff --git a/yabin3.py b/yabin3.py index 66560a0..8758cdd 100644 --- a/yabin3.py +++ b/yabin3.py @@ -38,6 +38,7 @@ def parseArguments(): parser.add_argument('-f', '--fuzzyHash', help='Generate a fuzzy hash for the file', required=False) parser.add_argument('-m', '--malwareAdd', help='Add malware file or folder to malware database to be searched', required=False) parser.add_argument('-s', '--malwareSearch', help='Search for samples related to this file', required=False) + parser.add_argument('-mf', '--malwareFamily', help='Name of malware family to be added', required=False, action='store', type=str, const="no_family", nargs='?') args = vars(parser.parse_args()) @@ -52,7 +53,7 @@ def parseArguments(): if args['fuzzyHash']: fuzzyHash(args['fuzzyHash']) if args['malwareAdd']: - addMalware(args['malwareAdd']) + addMalware(args['malwareAdd'], args['malwareFamily']) if args['malwareSearch']: malwareSearch(args['malwareSearch']) @@ -63,16 +64,15 @@ def getBytePatterns(filename, ignore_whitelist=False): hex = binascii.hexlify(content).decode('utf-8') # Add - every two characters so we match -xx- not x-x hex = 'x'.join([hex[i:i + 2] for i in range(0, len(hex), 2)]) - seen = {} - for match in re.findall(prolog_regex, hex): + seen = {} # stores values {"opcode_sequence": float(entropy)} + for match in re.findall(prolog_regex, hex): # prolog_regex here is in form of OR statement (|||...) bit = match[0].replace('x', '') if bit not in seen: if ignore_whitelist or not whitelisted(bit): # Only include high entropy patterns, ie) avoid 0000000 or # 1111111 etc. 
- # if entropy(bit) > 0: - seen[bit] = entropy(bit) - + if entropy(bit) > 0: + seen[bit] = entropy(bit) return seen @@ -112,56 +112,39 @@ def generateFuzzyHash(filename): return -def generateYara(filename, singleFile, tight=True, max_lines=3000, min_patterns=0): +def generateYara(filename, singleFile, tight=True, max_lines=75, min_patterns=0): global seen_patterns global percent_tight_match # Print out those that aren't in the whitelist byte_patterns = getBytePatterns(filename) - if tight: + def form_rule(tight=True): # Dont print the same rule twice if str(byte_patterns) not in seen_patterns: seen_patterns[str(byte_patterns)] = 1 # If we have no, or only one pattern, it probably won't be a tight # enough signature - if len(byte_patterns) > min_patterns: - print('rule tight_' + filename.replace('/', '_').replace('.', '') + ' {') + if len(byte_patterns.keys()) > min_patterns: + print('rule tight_' + os.path.basename(filename).replace('/', '_').replace('.', '') + ' {') print(' strings:') count = 1 for s in byte_patterns: - if count < max_lines: - count += 1 + if count < max_lines and entropy(s) > 3.25: # added entropy here print(' $a_' + str(count) + ' = { ' + s + ' }') - - print(' condition:') - tight_decimal = int(round(count * percent_tight_match)) - print(' ' + str(tight_decimal) + ' of them') - print('}') - print('\r\n\r\n') - - if not tight: - # Dont print the same rule twice - if str(byte_patterns) not in seen_patterns: - seen_patterns[str(byte_patterns)] = 1 - # If we have no, or only one pattern, it probably won't be a tight - # enough signature - if len(byte_patterns) > min_patterns: - print('rule tight_' + filename.replace('/', '_').replace('.', '') + ' {') - print(' strings:') - - count = 1 - for s in byte_patterns: - if count < max_lines: count += 1 - print(' $a_' + str(count) + ' = { ' + s + ' }') print(' condition:') - tight_decimal = int(round(count * percent_tight_match)) - print(' any of them') + tight_decimal = int(math.floor(count * 
percent_tight_match)) + if tight: + print(' ' + str(tight_decimal) + ' of them') + else: + print(' any of them') print('}') print('\r\n\r\n') + + form_rule(tight=tight) def fuzzyHash(filename, tight=True): @@ -194,20 +177,23 @@ def whitelisted(pattern): def addToWhitelist(folder): # Minimum number of samples a pattern must be in - min_seen = 1 + min_seen = 0 count = 0 # If we dont care how often it's been seen, just insert it if min_seen == 0: for f in os.listdir(folder): count = count + 1 - print('Processed ' + str(count) + ' file(s)') - print('Processing ' + f) - new_seen = getBytePatterns('./' + folder + '/' + f, True) - for pattern in new_seen: - db.execute( - 'insert or ignore into whitelist (pattern) values ("' + pattern + '")') - conn.commit() + if int(os.path.getsize('./' + folder + '/' + f)) < 35000000: #bytes + print('Processed ' + str(count) + ' file(s)') + print('Processing ' + f) + new_seen = getBytePatterns('./' + folder + '/' + f, True) + for pattern in new_seen: + db.execute( + 'insert or ignore into whitelist (pattern) values ("' + pattern + '")') # will not insert similar patterns, ok + conn.commit() + else: + continue # Otherwise actually keep track of how many samples a pattern has been in else: @@ -215,13 +201,16 @@ def addToWhitelist(folder): # Built a count of how often every pattern was seen for f in os.listdir(folder): count = count + 1 - print('Processed ' + str(count) + ' file(s)') - new_seen = getBytePatterns('./' + folder + '/' + f, True) - for pattern in new_seen: - if pattern not in seen: - seen[pattern] = 1 - else: - seen[pattern] = seen[pattern] + 1 + if int(os.path.getsize('./' + folder + '/' + f)) < 35000000: #bytes + print('Processed ' + str(count) + ' file(s)') + new_seen = getBytePatterns('./' + folder + '/' + f, True) + for pattern in new_seen: + if pattern not in seen: + seen[pattern] = 1 + else: + seen[pattern] = seen[pattern] + 1 + else: + continue total = 0 # Insert every pattern seen > x times into the whtelist @@ -233,13 
+222,13 @@ def addToWhitelist(folder): conn.commit() -def generateSample(filename): +def generateSample(filename, family): md5 = hashlib.md5(open(filename, 'rb').read()).hexdigest() # Print out those that aren't in the whitelist byte_patterns = getBytePatterns(filename) for pattern in byte_patterns: - db.execute('insert or ignore into malware (pattern, md5) values ("' + - pattern + '", "' + md5 + '")') + db.execute('insert or ignore into malware (pattern, md5, family) values ("' + + pattern + '", "' + md5 + '", "' + family + '")') def deleteDatabase(): @@ -247,19 +236,18 @@ def deleteDatabase(): db.execute('DROP TABLE IF EXISTS malware') db.execute('CREATE TABLE whitelist (pattern text)') db.execute('CREATE UNIQUE INDEX whitelist_index on whitelist (pattern)') - db.execute('CREATE TABLE malware (pattern text, md5 text)') - db.execute('CREATE UNIQUE INDEX malware_index on malware (pattern, md5)') - + db.execute('CREATE TABLE malware (pattern text, md5 text, family text)') + db.execute('CREATE UNIQUE INDEX malware_index on malware (pattern, md5, family)') # Add a file or folder to malware db -def addMalware(filename): +def addMalware(filename, family): print('Adding samples to malware database') if os.path.isdir(filename): for f in os.listdir(filename): - generateSample('./' + filename + '/' + f) + generateSample('./' + filename + '/' + f, family) else: if os.path.isfile(filename): - generateSample(filename) + generateSample(filename, family) conn.commit() print('Added samples')