Skip to content

Commit

Permalink
cortex: Use of 'schema' module for input validation
Browse files Browse the repository at this point in the history
Previously, each attribute of the incoming report was (or was not) checked individually.

Now the 'schema' module is a new dependency.
It checks the reports against a template provided in the respective analysers.
This template has to be implemented by the analyser.
  • Loading branch information
Jack28 committed Jul 22, 2021
1 parent f0481f5 commit cfd84d6
Show file tree
Hide file tree
Showing 4 changed files with 181 additions and 77 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ See documentation for details.
domain, hash, and ip artifacts from within the Generic rules.
- FileInfoAnalyzerReport has new attributes md5sum, sha256sum, and ssdeepsum
(now don't get too excited, ssdeep hashes can only be used as strings)
- Input validation of reports adds a new pip requirement: schema

## 2.0

Expand Down
2 changes: 1 addition & 1 deletion docs/source/ruleset.rst
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ Attributes of cortexreport
CAPEv2FileReport.signatures
CAPEv2FileReport.malscore
and every cortexreport has these artifacts
and all analyser reports have these artifacts
.. code-block:: shell
Expand Down
137 changes: 99 additions & 38 deletions peekaboo/toolbox/cortex.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
import cortex4py.api
import cortex4py.exceptions
import requests.sessions
import schema
import urllib3.util.retry

from peekaboo.exceptions import PeekabooException
Expand All @@ -56,19 +57,30 @@ def __init__(self, analyzer):

class CortexAnalyzerReport:
""" Cortex analyzer report base class. """
def __init__(self, report):
if report is None:
self.report = {}
report_schema = schema.Schema(None, error='Subclass needs to provide a schema')

report_schema_artifacts = [schema.Schema({
"data": str,
"dataType": str,
# Possible future extensions
# "message": schema.Schema(schema.Or(str, None)),
# "tags": list,
# "tlp": int
}, ignore_extra_keys=True)]

if not isinstance(report, dict):
raise TypeError('report is expected to be a dict')
def __init__(self, unvalidated_report):
if unvalidated_report is None:
unvalidated_report = {}

# validate the report against subclass attribute report_schema
self.report = self.report_schema.validate(unvalidated_report)

self._domain_artifacts = self.get_filtered_elements_from_list_of_dicts(
report.get('artifacts', []), 'dataType', 'domain', 'data', str)
self.report.get('artifacts', []), 'dataType', 'domain', 'data', str)
self._hash_artifacts = self.get_filtered_elements_from_list_of_dicts(
report.get('artifacts', []), 'dataType', 'hash', 'data', str)
self.report.get('artifacts', []), 'dataType', 'hash', 'data', str)
self._ip_artifacts = self.get_filtered_elements_from_list_of_dicts(
report.get('artifacts', []), 'dataType', 'ip', 'data', str)
self.report.get('artifacts', []), 'dataType', 'ip', 'data', str)

@classmethod
def get_element_from_list_of_dicts(cls, list_, ident_key, ident_value, default={}):
Expand Down Expand Up @@ -158,40 +170,67 @@ def get_submit_parameters(self, sample, submit_original_filename=False):
class FileInfoAnalyzerReport(CortexAnalyzerReport):
""" Represents a Cortex FileInfo_7_0 analysis JSON report. """

def __init__(self, report=None):
report_schema = schema.Schema({
"summary": {
"taxonomies": [schema.Schema({
"level": schema.Or("info", "malicious", "safe"),
"namespace": "FileInfo",
# "predicate": str,
# "value": str
}, ignore_extra_keys=True)]
},
"full": {
"results": [
{
"submodule_name": "Basic properties",
"results": [
{
"submodule_section_header": "Hashes",
"submodule_section_content": {
"md5": schema.Regex(r'^[0-9a-z]{32}$'),
"sha1": schema.Regex(r'^[0-9a-z]{40}$'),
"sha256": schema.Regex(r'^[0-9a-z]{64}$'),
"ssdeep": schema.Regex(r'^[0-9A-Za-z:]*$'),
}
},
{
# We consume further structures submodule_sections and
# explicitly check the submodule_section_header to not
# be "Hashes" or it will accept "Hashes"-structures with
# malformed hashes.
"submodule_section_header": schema.And(str, lambda s: s != "Hashes"),
"submodule_section_content": schema.Schema({
}, ignore_extra_keys=True)
},
],
"summary": {
"taxonomies": [schema.Schema({
"level": schema.Or("info", "malicious", "safe"),
"namespace": "FileInfo",
# "predicate": str,
# "value": str
}, ignore_extra_keys=True)]
}
}
]
},
"success": bool,
"artifacts": CortexAnalyzerReport.report_schema_artifacts,
"operations": []
})

def __init__(self, unvalidated_report=None):
"""
@param unvalidated_report: dict with report data from Cortex FileInfo Analyzer
"""
super().__init__(report)
super().__init__(unvalidated_report)

basic_properties = self.get_element_from_list_of_dicts(
report.get('full', []).get('results', {}),
self.report.get('full', []).get('results', {}),
'submodule_name', 'Basic properties').get('results', [])
self._hashes = self.get_element_from_list_of_dicts(
basic_properties, 'submodule_section_header', 'Hashes').get(
'submodule_section_content', {})
if not isinstance(self._hashes, dict):
raise TypeError('hashes are expected to be a dict')

sha256sum = self._hashes.get('sha256', '')
if not isinstance(sha256sum, str):
raise TypeError('sha256 sum is expected to be a string')
if len(sha256sum) != 64:
raise TypeError('sha256 sum string is expected '
'to be 64 characters long')

md5sum = self._hashes.get('md5', '')
if not isinstance(md5sum, str):
raise TypeError('md5 sum is expected to be a string')
if len(md5sum) != 32:
raise TypeError('md5 sum string is expected to be 32 characters long')

ssdeepsum = self._hashes.get('ssdeep', '')
if not isinstance(ssdeepsum, str):
raise TypeError('ssdeep sum is expected to be a string')
if len(ssdeepsum) > 148:
raise TypeError('ssdeep sum string is expected to '
'be less or equal to 148 characters long')

@property
def sha256sum(self):
Expand Down Expand Up @@ -233,10 +272,32 @@ class HybridAnalysis(CortexFileAnalyzer):

class VirusTotalQueryReport(CortexAnalyzerReport):
""" Represents a Cortex VirusTotal_GetReport_3_0 analysis JSON report. """
def __init__(self, report):
super().__init__(report)
report_schema = schema.Schema({
"summary": {
"taxonomies": [
{
"level": schema.Or("info", "malicious", "safe"),
"namespace": "VT",
"predicate": str,
"value": schema.Regex(r'^[0-9/]*$')
}
]
},
"full": {
"response_code": int,
"resource": str,
"verbose_msg": str
},
"success": bool,
"artifacts": CortexAnalyzerReport.report_schema_artifacts,
"operations": []
})

def __init__(self, unvalidated_report):
super().__init__(unvalidated_report)

self.taxonomies_vt = self.get_element_from_list_of_dicts(
report.get('summary', {}).get('taxonomies'),
self.report.get('summary', {}).get('taxonomies'),
'namespace', 'VT', {}
)

Expand Down Expand Up @@ -607,8 +668,8 @@ def resubmit_with_analyzer_report(self, job_id):
# mark analysis as failed if we could not get the report e.g.
# because it was corrupted or the API connection failed.
job.sample.mark_cortex_failure()
except TypeError as error:
logger.warning('Report returned from Cortex conainted '
except schema.SchemaError as error:
logger.warning('Report returned from Cortex contained '
'invalid data: %s', error)
job.sample.mark_cortex_failure()

Expand Down
118 changes: 80 additions & 38 deletions tests/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@
import unittest
from datetime import datetime, timedelta

import schema


# Add Peekaboo to PYTHONPATH
TESTSDIR = os.path.dirname(os.path.abspath(__file__))
Expand Down Expand Up @@ -1502,51 +1504,88 @@ def test_rule_expressions_cortexreport_fileinfoanalyzerreport_context(self):
sample = factory.make_sample(os.path.join(
self.office_data_dir, 'blank.doc'))

taxonomies = [
{
"level": "info",
"namespace": "FileInfo",
"predicate": "Filetype",
"value": "JPEG"
}
]

report = {
"summary": {},
"summary": {
"taxonomies": taxonomies
},
"full": {
"results": [{
"submodule_name": "Basic properties",
"results": [
{
"submodule_section_header": "Hashes",
"submodule_section_content": {
"md5": "78576e618aff135f320601e49bd8fe7e",
"sha256": "A"*64,
}
},
]
}]
"results": [
{
"submodule_name": "Basic properties",
"results": [
{
"submodule_section_header": "Hashes",
"submodule_section_content": {
"md5": "78576e618aff135f320601e49bd8fe7e",
"sha1": "2520dcd603b851846fa27035807adc4df83a7519",
"sha256": "42690cc82dd1f56fd6ec315723b8e1f27fdd42e670c7752477e91afb62ea2c6b",
"ssdeep": "768:GaqFVZh8KI4mF0xJcXmwE6ONpHOhbPOobiSp3ug06GnzAjUaq:NK3bI4s0xJCmwE7HEbf7ozf"
}
},
],
"summary": {
"taxonomies": taxonomies
}
}
]
},
"success": False,
"artifacts": [],
}

artifact = {
"data": "8.8.8.8",
"dataType": "ip",
}
"success": True,
"artifacts": [
{
"data": "42690cc82dd1f56fd6ec315723b8e1f27fdd42e670c7752477e91afb62ea2c6b",
"dataType": "hash",
"message": None,
"tags": [],
"tlp": 2
},
{
"data": "2520dcd603b851846fa27035807adc4df83a7519",
"dataType": "hash",
"message": None,
"tags": [],
"tlp": 2
},
{
"data": "78576e618aff135f320601e49bd8fe7e",
"dataType": "hash",
"message": None,
"tags": [],
"tlp": 2
}
],
"operations": []
}

cortexreport = CortexReport()
cortexreport.register_report(FileInfoAnalyzer, report)
sample.register_cortex_report(cortexreport)
result = rule.evaluate(sample)
self.assertEqual(result.result, Result.good)

report["full"]["results"][0]["results"][0]["submodule_section_content"]["md5"] = "A"*32
report["artifacts"] = [artifact]
report["full"]["results"][0]["results"][0]["submodule_section_content"]["md5"] = "a"*32
report["artifacts"] = [{
"data": "8.8.8.8",
"dataType": "ip",
}]
cortexreport = CortexReport()
cortexreport.register_report(FileInfoAnalyzer, report)
sample.register_cortex_report(cortexreport)
result = rule.evaluate(sample)
self.assertEqual(result.result, Result.bad)

artifact = {
report["artifacts"] = [{
"data": "oh.my.day.um",
"dataType": "domain",
}

report["artifacts"] = [artifact]
}]
cortexreport = CortexReport()
cortexreport.register_report(FileInfoAnalyzer, report)
sample.register_cortex_report(cortexreport)
Expand All @@ -1555,7 +1594,7 @@ def test_rule_expressions_cortexreport_fileinfoanalyzerreport_context(self):

report["full"]["results"][0]["results"][0]["submodule_section_content"]["md5"] = ""
cortexreport = CortexReport()
with self.assertRaisesRegex(TypeError, r'md5 .* long'):
with self.assertRaises(schema.SchemaError):
cortexreport.register_report(FileInfoAnalyzer, report)

def test_rule_expressions_cortexreport_virustotalqueryreport_context(self):
Expand All @@ -1575,17 +1614,22 @@ def test_rule_expressions_cortexreport_virustotalqueryreport_context(self):
self.office_data_dir, 'blank.doc'))

tax = {
"level": "malicious",
"namespace": "VT",
"predicate": "GetReport",
"value": "37/68"
}
"level": "malicious",
"namespace": "VT",
"predicate": "GetReport",
"value": "37/68"
}
report = {
"summary": {
"taxonomies": [
tax
tax
]
},
"full": {
"response_code": 0,
"resource": "Foo",
"verbose_msg": "AAA"
},
"success": True,
"artifacts": [],
"operations": []
Expand All @@ -1612,10 +1656,8 @@ def test_rule_expressions_cortexreport_virustotalqueryreport_context(self):
self.assertEqual(result.result, Result.unknown)

report["summary"]["taxonomies"][0]["value"] = "NAN"
cortexreport.register_report(VirusTotalQuery, report)
sample.register_cortex_report(cortexreport)
with self.assertRaises(ValueError):
result = rule.evaluate(sample)
with self.assertRaises(schema.SchemaError):
cortexreport.register_report(VirusTotalQuery, report)

def test_rule_expressions_olereport_context(self):
""" Test generic rule olereport context """
Expand Down

0 comments on commit cfd84d6

Please sign in to comment.