From fc719ffb198bf3bcd0c59bd837e05f7bd71e852a Mon Sep 17 00:00:00 2001 From: damoklov Date: Sun, 4 Oct 2020 17:08:23 +0300 Subject: [PATCH 1/4] Added .gitignore file for Python --- .gitignore | 141 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 141 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4bf2a22 --- /dev/null +++ b/.gitignore @@ -0,0 +1,141 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
+# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +.idea/ \ No newline at end of file From 2a29c31cb1d88dad50cc8aa722987c8b3e6df064 Mon Sep 17 00:00:00 2001 From: damoklov Date: Sun, 4 Oct 2020 17:08:47 +0300 Subject: [PATCH 2/4] Added initial adaptation of yabin for Python3 --- yabin3.py | 302 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 302 insertions(+) create mode 100644 yabin3.py diff --git a/yabin3.py b/yabin3.py new file mode 100644 index 0000000..66560a0 --- /dev/null +++ b/yabin3.py @@ -0,0 +1,302 @@ +''' + + YaraBin (Yara + Binary) + + This generates Yara rules from function prologs, for matching and hunting + + Questions or comments? Hit me up @chrisdoman + +''' + +import binascii +import re +import os +import argparse +import math +import sqlite3 +import hashlib + +# What percent overlap required for two malware samples considered to be the same family? +# From 0 (0%) to 1 (100%).
A large number means a tighter yara signature +# will be created +percent_tight_match = 0.8 + +seen_patterns = {} +conn = sqlite3.connect('db.db') +db = conn.cursor() +db.execute('PRAGMA synchronous=OFF') + + +def parseArguments(): + + parser = argparse.ArgumentParser( + description='Yabin - Signatures and searches malware') + parser.add_argument('-y', '--yara', help='Generate yara rule for the file or folder', required=False) + parser.add_argument('-yh', '--yaraHunt', help='Generate wide yara rule (any of, not all of).\r\n Useful for hunting for related samples or potentially malicious files that share any of the code - but liable to false positive', required=False) + parser.add_argument('-d', '--deleteDatabase',help='Empty the whitelist and malware database', action='store_true') + parser.add_argument('-w', '--addToWhitelist',help='Add a file or folder to the whitelist', required=False) + parser.add_argument('-f', '--fuzzyHash', help='Generate a fuzzy hash for the file', required=False) + parser.add_argument('-m', '--malwareAdd', help='Add malware file or folder to malware database to be searched', required=False) + parser.add_argument('-s', '--malwareSearch', help='Search for samples related to this file', required=False) + + args = vars(parser.parse_args()) + + if args['yara']: + yara(args['yara']) + if args['yaraHunt']: + yara(args['yaraHunt'], False) + if args['deleteDatabase']: + deleteDatabase() + if args['addToWhitelist']: + addToWhitelist(args['addToWhitelist']) + if args['fuzzyHash']: + fuzzyHash(args['fuzzyHash']) + if args['malwareAdd']: + addMalware(args['malwareAdd']) + if args['malwareSearch']: + malwareSearch(args['malwareSearch']) + + +def getBytePatterns(filename, ignore_whitelist=False): + with open(filename, 'rb') as f: + content = f.read() + hex = binascii.hexlify(content).decode('utf-8') + # Add - every two characters so we match -xx- not x-x + hex = 'x'.join([hex[i:i + 2] for i in range(0, len(hex), 2)]) + seen = {} + for match in 
re.findall(prolog_regex, hex): + bit = match[0].replace('x', '') + if bit not in seen: + if ignore_whitelist or not whitelisted(bit): + # Only include high entropy patterns, ie) avoid 0000000 or + # 1111111 etc. + # if entropy(bit) > 0: + seen[bit] = entropy(bit) + + return seen + + +def loadProlog(): + prolog_regex = '(' + with open('regex.txt') as file: + for l in file.readlines(): + line = l.strip() + if not line.startswith('#'): + if len(line) > 3: + prolog_regex += line + '|' + prolog_regex += ')' + prolog_regex = prolog_regex.replace('|)', ')') + return prolog_regex + +# Get the shannon entropy of a string + + +def entropy(string): + prob = [float(string.count(c)) / len(string) + for c in dict.fromkeys(list(string))] + entropy = - sum([p * math.log(p) / math.log(2.0) for p in prob]) + return entropy + + +def generateFuzzyHash(filename): + # Print out those that aren't in the whitelist + byte_patterns = getBytePatterns(filename) + patterns = [] + for s in byte_patterns: + patterns.append(s) + + patterns.sort() + # Just print the first sorted pattern... 
a vey poor mans fuzzy hash + for s in patterns: + print(filename + ',' + s) + return + + +def generateYara(filename, singleFile, tight=True, max_lines=3000, min_patterns=0): + global seen_patterns + global percent_tight_match + + # Print out those that aren't in the whitelist + byte_patterns = getBytePatterns(filename) + + if tight: + # Dont print the same rule twice + if str(byte_patterns) not in seen_patterns: + seen_patterns[str(byte_patterns)] = 1 + # If we have no, or only one pattern, it probably won't be a tight + # enough signature + if len(byte_patterns) > min_patterns: + print('rule tight_' + filename.replace('/', '_').replace('.', '') + ' {') + print(' strings:') + + count = 1 + for s in byte_patterns: + if count < max_lines: + count += 1 + print(' $a_' + str(count) + ' = { ' + s + ' }') + + print(' condition:') + tight_decimal = int(round(count * percent_tight_match)) + print(' ' + str(tight_decimal) + ' of them') + print('}') + print('\r\n\r\n') + + if not tight: + # Dont print the same rule twice + if str(byte_patterns) not in seen_patterns: + seen_patterns[str(byte_patterns)] = 1 + # If we have no, or only one pattern, it probably won't be a tight + # enough signature + if len(byte_patterns) > min_patterns: + print('rule tight_' + filename.replace('/', '_').replace('.', '') + ' {') + print(' strings:') + + count = 1 + for s in byte_patterns: + if count < max_lines: + count += 1 + print(' $a_' + str(count) + ' = { ' + s + ' }') + + print(' condition:') + tight_decimal = int(round(count * percent_tight_match)) + print(' any of them') + print('}') + print('\r\n\r\n') + + +def fuzzyHash(filename, tight=True): + if os.path.isdir(filename): + for f in os.listdir(filename): + generateFuzzyHash('./' + filename + '/' + f) + else: + if os.path.isfile(filename): + generateFuzzyHash(filename) + + +def yara(filename, tight=True): + if os.path.isdir(filename): + for f in os.listdir(filename): + generateYara('./' + filename + '/' + f, False, tight) + else: + if 
os.path.isfile(filename): + generateYara(filename, True, tight) + +# Returns true if a pattern is whitelisted + + +def whitelisted(pattern): + db.execute('SELECT * FROM whitelist WHERE pattern ="' + pattern + '"') + result = db.fetchone() + if result is None: + return False + return True + + +def addToWhitelist(folder): + # Minimum number of samples a pattern must be in + min_seen = 1 + count = 0 + + # If we dont care how often it's been seen, just insert it + if min_seen == 0: + for f in os.listdir(folder): + count = count + 1 + print('Processed ' + str(count) + ' file(s)') + print('Processing ' + f) + new_seen = getBytePatterns('./' + folder + '/' + f, True) + for pattern in new_seen: + db.execute( + 'insert or ignore into whitelist (pattern) values ("' + pattern + '")') + conn.commit() + + # Otherwise actually keep track of how many samples a pattern has been in + else: + seen = {} + # Built a count of how often every pattern was seen + for f in os.listdir(folder): + count = count + 1 + print('Processed ' + str(count) + ' file(s)') + new_seen = getBytePatterns('./' + folder + '/' + f, True) + for pattern in new_seen: + if pattern not in seen: + seen[pattern] = 1 + else: + seen[pattern] = seen[pattern] + 1 + + total = 0 + # Insert every pattern seen > x times into the whtelist + for pattern, count in seen.items(): + if count > min_seen: + total = total + 1 + db.execute('insert or ignore into whitelist (pattern) values ("' + pattern + '")') + + conn.commit() + + +def generateSample(filename): + md5 = hashlib.md5(open(filename, 'rb').read()).hexdigest() + # Print out those that aren't in the whitelist + byte_patterns = getBytePatterns(filename) + for pattern in byte_patterns: + db.execute('insert or ignore into malware (pattern, md5) values ("' + + pattern + '", "' + md5 + '")') + + +def deleteDatabase(): + db.execute('DROP TABLE IF EXISTS whitelist') + db.execute('DROP TABLE IF EXISTS malware') + db.execute('CREATE TABLE whitelist (pattern text)') + 
db.execute('CREATE UNIQUE INDEX whitelist_index on whitelist (pattern)') + db.execute('CREATE TABLE malware (pattern text, md5 text)') + db.execute('CREATE UNIQUE INDEX malware_index on malware (pattern, md5)') + + +# Add a file or folder to malware db +def addMalware(filename): + print('Adding samples to malware database') + if os.path.isdir(filename): + for f in os.listdir(filename): + generateSample('./' + filename + '/' + f) + else: + if os.path.isfile(filename): + generateSample(filename) + conn.commit() + print('Added samples') + + +# For every pattern in file, find related +def malwareSearch(filename): + md5 = hashlib.md5(open(filename,'rb').read()).hexdigest() + pattern_lookups = {} + found_samples = set() + + # Print out those that aren't in the whitelist + byte_patterns = getBytePatterns(filename) + for pattern in byte_patterns: + related_samples = findRelated(pattern) + + for sample in related_samples: + if sample not in found_samples and sample != md5: + found_samples.add(sample) + pattern_lookups[sample] = pattern + + if len(found_samples) > 0: + print('Found related samples:') + for sample in found_samples: + print(sample + ' matched via ' + pattern_lookups[sample]) + else: + print('No related samples found') + +def findRelated(pattern): + db.execute('SELECT md5 FROM malware WHERE pattern ="' + pattern + '"') + rows = db.fetchall() + toReturn = [] + for row in rows: + toReturn.append(row[0]) + + return toReturn + +# This regex decides what patterns we will extract +prolog_regex = loadProlog() + +parseArguments() From 483d55fd8e48c368f33c18d5afa3fe3c4306cf4f Mon Sep 17 00:00:00 2001 From: damoklov Date: Fri, 9 Oct 2020 14:38:23 +0300 Subject: [PATCH 3/4] Added .gitignore --- .gitignore | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 4bf2a22..e06f756 100644 --- a/.gitignore +++ b/.gitignore @@ -138,4 +138,14 @@ dmypy.json cython_debug/ # PyCharm -.idea/ \ No newline at end of file +.idea/ + +# 
VSCode +.vscode/ + +# Database file +db.db + +# Directories with samples +clean/ +malware/ From 5257cea3da9b485b343ee9a5245a220dcddba968 Mon Sep 17 00:00:00 2001 From: damoklov Date: Fri, 9 Oct 2020 14:51:05 +0300 Subject: [PATCH 4/4] Updated entropy and database entries --- yabin3.py | 106 ++++++++++++++++++++++++------------------------------ 1 file changed, 47 insertions(+), 59 deletions(-) diff --git a/yabin3.py b/yabin3.py index 66560a0..8758cdd 100644 --- a/yabin3.py +++ b/yabin3.py @@ -38,6 +38,7 @@ def parseArguments(): parser.add_argument('-f', '--fuzzyHash', help='Generate a fuzzy hash for the file', required=False) parser.add_argument('-m', '--malwareAdd', help='Add malware file or folder to malware database to be searched', required=False) parser.add_argument('-s', '--malwareSearch', help='Search for samples related to this file', required=False) + parser.add_argument('-mf', '--malwareFamily', help='Name of malware family to be added', required=False, action='store', type=str, const="no_family", nargs='?') args = vars(parser.parse_args()) @@ -52,7 +53,7 @@ def parseArguments(): if args['fuzzyHash']: fuzzyHash(args['fuzzyHash']) if args['malwareAdd']: - addMalware(args['malwareAdd']) + addMalware(args['malwareAdd'], args['malwareFamily']) if args['malwareSearch']: malwareSearch(args['malwareSearch']) @@ -63,16 +64,15 @@ def getBytePatterns(filename, ignore_whitelist=False): hex = binascii.hexlify(content).decode('utf-8') # Add - every two characters so we match -xx- not x-x hex = 'x'.join([hex[i:i + 2] for i in range(0, len(hex), 2)]) - seen = {} - for match in re.findall(prolog_regex, hex): + seen = {} # stores values {"opcode_sequence": float(entropy)} + for match in re.findall(prolog_regex, hex): # prolog_regex here is in form of OR statement (|||...) bit = match[0].replace('x', '') if bit not in seen: if ignore_whitelist or not whitelisted(bit): # Only include high entropy patterns, ie) avoid 0000000 or # 1111111 etc. 
- # if entropy(bit) > 0: - seen[bit] = entropy(bit) - + if entropy(bit) > 0: + seen[bit] = entropy(bit) return seen @@ -112,56 +112,39 @@ def generateFuzzyHash(filename): return -def generateYara(filename, singleFile, tight=True, max_lines=3000, min_patterns=0): +def generateYara(filename, singleFile, tight=True, max_lines=75, min_patterns=0): global seen_patterns global percent_tight_match # Print out those that aren't in the whitelist byte_patterns = getBytePatterns(filename) - if tight: + def form_rule(tight=True): # Dont print the same rule twice if str(byte_patterns) not in seen_patterns: seen_patterns[str(byte_patterns)] = 1 # If we have no, or only one pattern, it probably won't be a tight # enough signature - if len(byte_patterns) > min_patterns: - print('rule tight_' + filename.replace('/', '_').replace('.', '') + ' {') + if len(byte_patterns.keys()) > min_patterns: + print('rule tight_' + os.path.basename(filename).replace('/', '_').replace('.', '') + ' {') print(' strings:') count = 1 for s in byte_patterns: - if count < max_lines: - count += 1 + if count < max_lines and entropy(s) > 3.25: # added entropy here print(' $a_' + str(count) + ' = { ' + s + ' }') - - print(' condition:') - tight_decimal = int(round(count * percent_tight_match)) - print(' ' + str(tight_decimal) + ' of them') - print('}') - print('\r\n\r\n') - - if not tight: - # Dont print the same rule twice - if str(byte_patterns) not in seen_patterns: - seen_patterns[str(byte_patterns)] = 1 - # If we have no, or only one pattern, it probably won't be a tight - # enough signature - if len(byte_patterns) > min_patterns: - print('rule tight_' + filename.replace('/', '_').replace('.', '') + ' {') - print(' strings:') - - count = 1 - for s in byte_patterns: - if count < max_lines: count += 1 - print(' $a_' + str(count) + ' = { ' + s + ' }') print(' condition:') - tight_decimal = int(round(count * percent_tight_match)) - print(' any of them') + tight_decimal = int(math.floor(count * 
percent_tight_match)) + if tight: + print(' ' + str(tight_decimal) + ' of them') + else: + print(' any of them') print('}') print('\r\n\r\n') + + form_rule(tight=tight) def fuzzyHash(filename, tight=True): @@ -194,20 +177,23 @@ def whitelisted(pattern): def addToWhitelist(folder): # Minimum number of samples a pattern must be in - min_seen = 1 + min_seen = 0 count = 0 # If we dont care how often it's been seen, just insert it if min_seen == 0: for f in os.listdir(folder): count = count + 1 - print('Processed ' + str(count) + ' file(s)') - print('Processing ' + f) - new_seen = getBytePatterns('./' + folder + '/' + f, True) - for pattern in new_seen: - db.execute( - 'insert or ignore into whitelist (pattern) values ("' + pattern + '")') - conn.commit() + if int(os.path.getsize('./' + folder + '/' + f)) < 35000000: #bytes + print('Processed ' + str(count) + ' file(s)') + print('Processing ' + f) + new_seen = getBytePatterns('./' + folder + '/' + f, True) + for pattern in new_seen: + db.execute( + 'insert or ignore into whitelist (pattern) values ("' + pattern + '")') # will not insert similar patterns, ok + conn.commit() + else: + continue # Otherwise actually keep track of how many samples a pattern has been in else: @@ -215,13 +201,16 @@ def addToWhitelist(folder): # Built a count of how often every pattern was seen for f in os.listdir(folder): count = count + 1 - print('Processed ' + str(count) + ' file(s)') - new_seen = getBytePatterns('./' + folder + '/' + f, True) - for pattern in new_seen: - if pattern not in seen: - seen[pattern] = 1 - else: - seen[pattern] = seen[pattern] + 1 + if int(os.path.getsize('./' + folder + '/' + f)) < 35000000: #bytes + print('Processed ' + str(count) + ' file(s)') + new_seen = getBytePatterns('./' + folder + '/' + f, True) + for pattern in new_seen: + if pattern not in seen: + seen[pattern] = 1 + else: + seen[pattern] = seen[pattern] + 1 + else: + continue total = 0 # Insert every pattern seen > x times into the whtelist @@ -233,13 
+222,13 @@ def addToWhitelist(folder): conn.commit() -def generateSample(filename): +def generateSample(filename, family): md5 = hashlib.md5(open(filename, 'rb').read()).hexdigest() # Print out those that aren't in the whitelist byte_patterns = getBytePatterns(filename) for pattern in byte_patterns: - db.execute('insert or ignore into malware (pattern, md5) values ("' + - pattern + '", "' + md5 + '")') + db.execute('insert or ignore into malware (pattern, md5, family) values ("' + + pattern + '", "' + md5 + '", "' + family + '")') def deleteDatabase(): @@ -247,19 +236,18 @@ def deleteDatabase(): db.execute('DROP TABLE IF EXISTS malware') db.execute('CREATE TABLE whitelist (pattern text)') db.execute('CREATE UNIQUE INDEX whitelist_index on whitelist (pattern)') - db.execute('CREATE TABLE malware (pattern text, md5 text)') - db.execute('CREATE UNIQUE INDEX malware_index on malware (pattern, md5)') - + db.execute('CREATE TABLE malware (pattern text, md5 text, family text)') + db.execute('CREATE UNIQUE INDEX malware_index on malware (pattern, md5, family)') # Add a file or folder to malware db -def addMalware(filename): +def addMalware(filename, family): print('Adding samples to malware database') if os.path.isdir(filename): for f in os.listdir(filename): - generateSample('./' + filename + '/' + f) + generateSample('./' + filename + '/' + f, family) else: if os.path.isfile(filename): - generateSample(filename) + generateSample(filename, family) conn.commit() print('Added samples')