cortex: Use of 'schema' module for input validation

Previously, the attributes of an incoming report were checked individually (or not at all). Now reports are validated with the 'schema' module, which becomes a new dependency. Each report is checked against a template (schema) that the respective analyser's report class has to provide.
scVENUS · Jul 22, 2021 · cfd84d6 · cfd84d6
1 parent f0481f5
commit cfd84d6
Show file tree

Hide file tree

Showing 4 changed files with 181 additions and 77 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -20,6 +20,7 @@ See documentation for details.
   domain, hash, and ip artifacts from within the Generic rules.
 - FileInfoAnalyzerReport has new attributes md5sum, sha256sum, and ssdeepsum
   (now don't get too excited, ssdeep hashes can only be used as strings)
+- Input validation of reports adds a new pip requirement: schema
 
 ## 2.0
 

diff --git a/docs/source/ruleset.rst b/docs/source/ruleset.rst
@@ -151,7 +151,7 @@ Attribues of cortexreport
     CAPEv2FileReport.signatures
     CAPEv2FileReport.malscore
 
-and every cortexreport has these artifacts
+and all analyser reports have these artifacts
 
 .. code-block:: shell
 

diff --git a/peekaboo/toolbox/cortex.py b/peekaboo/toolbox/cortex.py
@@ -34,6 +34,7 @@
 import cortex4py.api
 import cortex4py.exceptions
 import requests.sessions
+import schema
 import urllib3.util.retry
 
 from peekaboo.exceptions import PeekabooException
@@ -56,19 +57,30 @@ def __init__(self, analyzer):
 
 class CortexAnalyzerReport:
     """ Cortex analyzer report base class. """
-    def __init__(self, report):
-        if report is None:
-            self.report = {}
+    report_schema = schema.Schema(None, error='Subclass needs to provide a schema')
+
+    report_schema_artifacts = [schema.Schema({
+        "data": str,
+        "dataType": str,
+    # Possible future extensions
+    #    "message": schema.Schema(schema.Or(str, None)),
+    #    "tags": list,
+    #    "tlp": int
+    }, ignore_extra_keys=True)]
 
-        if not isinstance(report, dict):
-            raise TypeError('report is expected to be a dict')
+    def __init__(self, unvalidated_report):
+        if unvalidated_report is None:
+            unvalidated_report = {}
+
+        # validate the report against subclass attribute report_schema
+        self.report = self.report_schema.validate(unvalidated_report)
 
         self._domain_artifacts = self.get_filtered_elements_from_list_of_dicts(
-                report.get('artifacts', []), 'dataType', 'domain', 'data', str)
+                self.report.get('artifacts', []), 'dataType', 'domain', 'data', str)
         self._hash_artifacts = self.get_filtered_elements_from_list_of_dicts(
-                report.get('artifacts', []), 'dataType', 'hash', 'data', str)
+                self.report.get('artifacts', []), 'dataType', 'hash', 'data', str)
         self._ip_artifacts = self.get_filtered_elements_from_list_of_dicts(
-                report.get('artifacts', []), 'dataType', 'ip', 'data', str)
+                self.report.get('artifacts', []), 'dataType', 'ip', 'data', str)
 
     @classmethod
     def get_element_from_list_of_dicts(cls, list_, ident_key, ident_value, default={}):
@@ -158,40 +170,67 @@ def get_submit_parameters(self, sample, submit_original_filename=False):
 class FileInfoAnalyzerReport(CortexAnalyzerReport):
     """ Represents a Cortex FileInfo_7_0 analysis JSON report. """
 
-    def __init__(self, report=None):
+    report_schema = schema.Schema({
+        "summary": {
+            "taxonomies": [schema.Schema({
+                "level": schema.Or("info", "malicious", "safe"),
+                "namespace": "FileInfo",
+            #    "predicate": str,
+            #    "value": str
+            }, ignore_extra_keys=True)]
+        },
+        "full": {
+            "results": [
+            {
+                "submodule_name": "Basic properties",
+                "results": [
+                    {
+                        "submodule_section_header": "Hashes",
+                        "submodule_section_content": {
+                            "md5": schema.Regex(r'^[0-9a-z]{32}$'),
+                            "sha1": schema.Regex(r'^[0-9a-z]{40}$'),
+                            "sha256": schema.Regex(r'^[0-9a-z]{64}$'),
+                            "ssdeep": schema.Regex(r'^[0-9A-Za-z:]*$'),
+                        }
+                    },
+                    {
+                        # Accept any further submodule sections, but explicitly
+                        # require their submodule_section_header not to be
+                        # "Hashes". Otherwise a "Hashes" section with malformed
+                        # hashes would be accepted by this catch-all entry.
+                        "submodule_section_header": schema.And(str, lambda s: s != "Hashes"),
+                        "submodule_section_content": schema.Schema({
+                            }, ignore_extra_keys=True)
+                    },
+                ],
+                "summary": {
+                    "taxonomies": [schema.Schema({
+                        "level": schema.Or("info", "malicious", "safe"),
+                        "namespace": "FileInfo",
+                    #    "predicate": str,
+                    #    "value": str
+                    }, ignore_extra_keys=True)]
+                }
+            }
+            ]
+        },
+        "success": bool,
+        "artifacts": CortexAnalyzerReport.report_schema_artifacts,
+        "operations": []
+    })
+
+    def __init__(self, unvalidated_report=None):
         """
         @param unvalidated_report: dict with not yet validated report data from Cortex FileInfo Analyzer
         """
-        super().__init__(report)
+        super().__init__(unvalidated_report)
 
         basic_properties = self.get_element_from_list_of_dicts(
-                report.get('full', []).get('results', {}),
+                self.report.get('full', []).get('results', {}),
                 'submodule_name', 'Basic properties').get('results', [])
         self._hashes = self.get_element_from_list_of_dicts(
                 basic_properties, 'submodule_section_header', 'Hashes').get(
                     'submodule_section_content', {})
-        if not isinstance(self._hashes, dict):
-            raise TypeError('hashes are expected to be a dict')
-
-        sha256sum = self._hashes.get('sha256', '')
-        if not isinstance(sha256sum, str):
-            raise TypeError('sha256 sum is expected to be a string')
-        if len(sha256sum) != 64:
-            raise TypeError('sha256 sum string is expected '
-                            'to be 64 characters long')
-
-        md5sum = self._hashes.get('md5', '')
-        if not isinstance(md5sum, str):
-            raise TypeError('md5 sum is expected to be a string')
-        if len(md5sum) != 32:
-            raise TypeError('md5 sum string is expected to be 32 characters long')
-
-        ssdeepsum = self._hashes.get('ssdeep', '')
-        if not isinstance(ssdeepsum, str):
-            raise TypeError('ssdeep sum is expected to be a string')
-        if len(ssdeepsum) > 148:
-            raise TypeError('ssdeep sum string is expected to '
-                            'be less or equal to 148 characters long')
 
     @property
     def sha256sum(self):
@@ -233,10 +272,32 @@ class HybridAnalysis(CortexFileAnalyzer):
 
 class VirusTotalQueryReport(CortexAnalyzerReport):
     """ Represents a Cortex VirusTotal_GetReport_3_0 analysis JSON report. """
-    def __init__(self, report):
-        super().__init__(report)
+    report_schema = schema.Schema({
+        "summary": {
+            "taxonomies": [
+                {
+                    "level": schema.Or("info", "malicious", "safe"),
+                    "namespace": "VT",
+                    "predicate": str,
+                    "value": schema.Regex(r'^[0-9/]*$')
+                }
+            ]
+        },
+        "full": {
+            "response_code": int,
+            "resource": str,
+            "verbose_msg": str
+        },
+        "success": bool,
+        "artifacts": CortexAnalyzerReport.report_schema_artifacts,
+        "operations": []
+    })
+
+    def __init__(self, unvalidated_report):
+        super().__init__(unvalidated_report)
+
         self.taxonomies_vt = self.get_element_from_list_of_dicts(
-                report.get('summary', {}).get('taxonomies'),
+                self.report.get('summary', {}).get('taxonomies'),
                 'namespace', 'VT', {}
             )
 
@@ -607,8 +668,8 @@ def resubmit_with_analyzer_report(self, job_id):
             # mark analysis as failed if we could not get the report e.g.
             # because it was corrupted or the API connection failed.
             job.sample.mark_cortex_failure()
-        except TypeError as error:
-            logger.warning('Report returned from Cortex conainted '
+        except schema.SchemaError as error:
+            logger.warning('Report returned from Cortex contained '
                            'invalid data: %s', error)
             job.sample.mark_cortex_failure()
 

diff --git a/tests/test.py b/tests/test.py
@@ -36,6 +36,8 @@
 import unittest
 from datetime import datetime, timedelta
 
+import schema
+
 
 # Add Peekaboo to PYTHONPATH
 TESTSDIR = os.path.dirname(os.path.abspath(__file__))
@@ -1502,51 +1504,88 @@ def test_rule_expressions_cortexreport_fileinfoanalyzerreport_context(self):
         sample = factory.make_sample(os.path.join(
             self.office_data_dir, 'blank.doc'))
 
+        taxonomies = [
+            {
+            "level": "info",
+            "namespace": "FileInfo",
+            "predicate": "Filetype",
+            "value": "JPEG"
+            }
+        ]
+
         report = {
-            "summary": {},
+            "summary": {
+                "taxonomies": taxonomies
+            },
             "full": {
-                "results": [{
-                    "submodule_name": "Basic properties",
-                    "results": [
-                        {
-                            "submodule_section_header": "Hashes",
-                            "submodule_section_content": {
-                                "md5": "78576e618aff135f320601e49bd8fe7e",
-                                "sha256": "A"*64,
-                            }
-                        },
-                    ]
-                }]
+                "results": [
+                    {
+                        "submodule_name": "Basic properties",
+                        "results": [
+                            {
+                                "submodule_section_header": "Hashes",
+                                "submodule_section_content": {
+                                    "md5": "78576e618aff135f320601e49bd8fe7e",
+                                    "sha1": "2520dcd603b851846fa27035807adc4df83a7519",
+                                    "sha256": "42690cc82dd1f56fd6ec315723b8e1f27fdd42e670c7752477e91afb62ea2c6b",
+                                    "ssdeep": "768:GaqFVZh8KI4mF0xJcXmwE6ONpHOhbPOobiSp3ug06GnzAjUaq:NK3bI4s0xJCmwE7HEbf7ozf"
+                                }
+                            },
+                        ],
+                        "summary": {
+                            "taxonomies": taxonomies
+                        }
+                    }
+                ]
             },
-            "success": False,
-            "artifacts": [],
-        }
-
-        artifact = {
-            "data": "8.8.8.8",
-            "dataType": "ip",
-        }
+            "success": True,
+            "artifacts": [
+                {
+                "data": "42690cc82dd1f56fd6ec315723b8e1f27fdd42e670c7752477e91afb62ea2c6b",
+                "dataType": "hash",
+                "message": None,
+                "tags": [],
+                "tlp": 2
+                },
+                {
+                "data": "2520dcd603b851846fa27035807adc4df83a7519",
+                "dataType": "hash",
+                "message": None,
+                "tags": [],
+                "tlp": 2
+                },
+                {
+                "data": "78576e618aff135f320601e49bd8fe7e",
+                "dataType": "hash",
+                "message": None,
+                "tags": [],
+                "tlp": 2
+                }
+            ],
+            "operations": []
+            }
 
         cortexreport = CortexReport()
         cortexreport.register_report(FileInfoAnalyzer, report)
         sample.register_cortex_report(cortexreport)
         result = rule.evaluate(sample)
         self.assertEqual(result.result, Result.good)
 
-        report["full"]["results"][0]["results"][0]["submodule_section_content"]["md5"] = "A"*32
-        report["artifacts"] = [artifact]
+        report["full"]["results"][0]["results"][0]["submodule_section_content"]["md5"] = "a"*32
+        report["artifacts"] = [{
+            "data": "8.8.8.8",
+            "dataType": "ip",
+        }]
         cortexreport = CortexReport()
         cortexreport.register_report(FileInfoAnalyzer, report)
         sample.register_cortex_report(cortexreport)
         result = rule.evaluate(sample)
         self.assertEqual(result.result, Result.bad)
 
-        artifact = {
+        report["artifacts"] = [{
             "data": "oh.my.day.um",
             "dataType": "domain",
-        }
-
-        report["artifacts"] = [artifact]
+        }]
         cortexreport = CortexReport()
         cortexreport.register_report(FileInfoAnalyzer, report)
         sample.register_cortex_report(cortexreport)
@@ -1555,7 +1594,7 @@ def test_rule_expressions_cortexreport_fileinfoanalyzerreport_context(self):
 
         report["full"]["results"][0]["results"][0]["submodule_section_content"]["md5"] = ""
         cortexreport = CortexReport()
-        with self.assertRaisesRegex(TypeError, r'md5 .* long'):
+        with self.assertRaises(schema.SchemaError):
             cortexreport.register_report(FileInfoAnalyzer, report)
 
     def test_rule_expressions_cortexreport_virustotalqueryreport_context(self):
@@ -1575,17 +1614,22 @@ def test_rule_expressions_cortexreport_virustotalqueryreport_context(self):
             self.office_data_dir, 'blank.doc'))
 
         tax = {
-                "level": "malicious",
-                "namespace": "VT",
-                "predicate": "GetReport",
-                "value": "37/68"
-            }
+            "level": "malicious",
+            "namespace": "VT",
+            "predicate": "GetReport",
+            "value": "37/68"
+        }
         report = {
             "summary": {
                 "taxonomies": [
-                tax
+                     tax
                 ]
             },
+            "full": {
+                "response_code": 0,
+                "resource": "Foo",
+                "verbose_msg": "AAA"
+            },
             "success": True,
             "artifacts": [],
             "operations": []
@@ -1612,10 +1656,8 @@ def test_rule_expressions_cortexreport_virustotalqueryreport_context(self):
         self.assertEqual(result.result, Result.unknown)
 
         report["summary"]["taxonomies"][0]["value"] = "NAN"
-        cortexreport.register_report(VirusTotalQuery, report)
-        sample.register_cortex_report(cortexreport)
-        with self.assertRaises(ValueError):
-            result = rule.evaluate(sample)
+        with self.assertRaises(schema.SchemaError):
+            cortexreport.register_report(VirusTotalQuery, report)
 
     def test_rule_expressions_olereport_context(self):
         """ Test generic rule olereport context """