From 9e86dceec9cf3a8870c7056aa56c7eaa0f947d51 Mon Sep 17 00:00:00 2001
From: Sebastian Wagner <wagner@cert.at>
Date: Thu, 29 Apr 2021 14:46:06 +0200
Subject: [PATCH 1/2] ENH: harmonization: create and use other/malware

the classification
malicious code / malware
is replaced by
malicious code / infected system
malicious code / malware-distribution
other / malware for all other cases, i.e. malware itself

fixes certtools/intelmq#1752
related to certtools/intelmq#1409
---
 NEWS.md                                           | 15 ++++++++++++++-
 docs/dev/data-format.rst                          |  2 +-
 intelmq/bots/experts/idea/expert.py               |  1 +
 intelmq/bots/experts/taxonomy/expert.py           |  1 +
 intelmq/bots/parsers/github_feed/parser.py        |  4 ++--
 intelmq/lib/harmonization.py                      |  6 ++++--
 .../tests/bots/parsers/github_feed/test_parser.py | 10 +++-------
 7 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index d4184840a..164694abe 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -30,16 +30,24 @@ The classification scheme has been updated to better match the [Reference Securi
 | malicious code               |                    | malicious-code               |                                        |
 | malicious code               | c2server           | malicious-code               | c2-server                              |
 | malicious code               | malware            | malicious-code               | infected-system / malware-distribution |
+| malicious code               | malware            | other                        | malware                                |
 | malicious code               | ransomware         | malicious-code               | infected-system                        |
 | vulnerable                   | vulnerable client  | vulnerable                   | vulnerable-system                      |
 | vulnerable                   | vulnerable service | vulnerable                   | vulnerable-system                      |
 | other                        | unknown            | other                        | undetermined                           |
 
 - For the taxonomy 'availability', the type `misconfiguration` is new.
-- For the taxonomy 'other', the type `undetermined` is new.
+- For the taxonomy 'other', the types `malware` and `undetermined` are new.
 
 The old names can still be used in code, and they are automatically converted to the new names.
 
+#### "Malware"
+
+The previously existing classification type "malware" under the taxonomy "malicious code" was removed, as this type does not exist in the RSIT.
+Most of the usages were wrong anyway and should have been infected-system, malware-distribution or something else.
+There is only one usage in IntelMQ, which can not be changed.
+And that one is really about malware itself (or: the hashes of samples). For this purpose, the new type "malware" under the taxonomy "other" was created, *slightly* deviating from the RSIT in this respect, but "other" can be freely extended.
+
 ### Configuration
 
 The `defaults.conf` file was removed. Settings that should effect all the bots are not part of the runtime.conf file and are configured in the `global` section in that file.
@@ -93,6 +101,11 @@ UPDATE events
    SET "classification.type" = 'malware-distribution'
    WHERE "classification.taxonomy" = 'malicious-code' AND ("classification.type" = 'malware' OR "classification.type" = 'ransomware');
 ```
+or this:
+```sql
+UPDATE events
+   SET "classification.taxonomy" = 'other' WHERE "classification.type" = 'malware';
+```
 
 
 2.3.3 Bugfix release (unreleased)
diff --git a/docs/dev/data-format.rst b/docs/dev/data-format.rst
index fea5c6c71..088c9de7b 100644
--- a/docs/dev/data-format.rst
+++ b/docs/dev/data-format.rst
@@ -135,13 +135,13 @@ The taxonomy can be automatically added by the taxonomy expert bot based on the
    malicious-code                c2-server                                   This is a command and control server in charge of a given number of botnet drones.
    malicious-code                dga domain                                  DGA Domains are seen various families of malware that are used to periodically generate a large number of domain names that can be used as rendezvous points with their command and control servers. Not in ENISA eCSIRT-II taxonomy.
    malicious-code                infected-system                             This is a compromised machine, which has been observed to make a connection to a command and control server.
-   malicious-code                malware                                     A URL is the most common resource with reference to malware binary distribution. Not in ENISA eCSIRT-II taxonomy.
    malicious-code                malware-configuration                       This is a resource which updates botnet drones with a new configuration.
    malicious-code                malware-distribution                        URI used for malware distribution, e.g. a download URL included in fake invoice malware spam.
    other                         blacklist                                   Some sources provide blacklists, which clearly refer to abusive behavior, such as spamming, but fail to denote the exact reason why a given identity has been blacklisted. The reason may be that the justification is anecdotal or missing entirely. This type should only be used if the typing fits the definition of a blacklist, but an event specific denomination is not possible for one reason or another.
    other                         other                                       All incidents which don't fit in one of the given categories should be put into this class.
    other                         proxy                                       This refers to the use of proxies from inside your network. Not in ENISA eCSIRT-II taxonomy.
    other                         tor                                         This IOC refers to incidents related to TOR network infrastructure. Not in ENISA eCSIRT-II taxonomy.
+   other                         malware                                     An IoC referring to a malware (sample) itself. Not in RSIT.
    other                         undetermined                                The categorisation of the incident is unknown/undetermined.
    test                          test                                        Meant for testing.
    vulnerable                    ddos-amplifier                              Publicly accessible services that can be abused for conducting DDoS reflection/amplification attacks, e.g. DNS open-resolvers or NTP servers with monlist enabled.
diff --git a/intelmq/bots/experts/idea/expert.py b/intelmq/bots/experts/idea/expert.py
index 4e7b4912c..5ccfd4e66 100644
--- a/intelmq/bots/experts/idea/expert.py
+++ b/intelmq/bots/experts/idea/expert.py
@@ -76,6 +76,7 @@ class IdeaExpertBot(Bot):
         "copyright": "Fraud.Copyright",
         "misconfiguration": "Availability.Outage",  # outage includes human error
         "undetermined": "Other",
+        "malware": "Malware",
     }
 
     TYPE_TO_SOURCE_TYPE = {
diff --git a/intelmq/bots/experts/taxonomy/expert.py b/intelmq/bots/experts/taxonomy/expert.py
index b3daa3649..d6f2ded83 100644
--- a/intelmq/bots/experts/taxonomy/expert.py
+++ b/intelmq/bots/experts/taxonomy/expert.py
@@ -55,6 +55,7 @@
     "undetermined": "other",
     "proxy": "other",  # not in ENISA eCSIRT-II taxonomy
     "tor": "other",  # not in ENISA eCSIRT-II taxonomy
+    "malware": "other",  # intentionally not in RSIT
     "test": "test",
     "ddos-amplifier": "vulnerable",
     "information-disclosure": "vulnerable",
diff --git a/intelmq/bots/parsers/github_feed/parser.py b/intelmq/bots/parsers/github_feed/parser.py
index c29781f2a..5843ae2c5 100644
--- a/intelmq/bots/parsers/github_feed/parser.py
+++ b/intelmq/bots/parsers/github_feed/parser.py
@@ -79,7 +79,7 @@ class Next(Exception):
             for ioc in json_content:
                 event = clean_event.copy()
                 event.add('raw', str(ioc))
-                event.add('classification.type', 'unknown')
+                event.add('classification.type', 'malware')
                 event.add('classification.taxonomy', 'other')
                 event.add('event_description.text', ioc['Description'])
 
@@ -129,7 +129,7 @@ def parse_domain_indicator(event, ioc_indicator: str):
 
 def parse_hash_indicator(event, ioc_indicator: str, hash_type: str):
     event.add('malware.hash.{}'.format(hash_type), ioc_indicator)
-    event.change('classification.taxonomy', 'malicious code')
+    event.change('classification.taxonomy', 'other')
     event.change('classification.type', 'malware')
     return event
 
diff --git a/intelmq/lib/harmonization.py b/intelmq/lib/harmonization.py
index ef24cd5e7..ea173b035 100644
--- a/intelmq/lib/harmonization.py
+++ b/intelmq/lib/harmonization.py
@@ -259,8 +259,9 @@ class ClassificationType(String):
         'ransomware' -> 'infected-system'
         'unknown' -> 'undetermined'
 
-    These old values can not be automatically mapped as they are ambiguous:
-        'malware': Either 'infected-system' or 'malware-distribution'
+    These values changed their taxonomy:
+        'malware': Under the taxonomy 'malicious-code', use either 'infected-system' or 'malware-distribution' instead;
+            for malware itself (e.g. hashes of samples), the type 'malware' now belongs to the taxonomy 'other'
 
     Allowed values are:
      * """
@@ -286,6 +287,7 @@ class ClassificationType(String):
                       'infected-system',
                       'information-disclosure',
                       'data-leak',
+                      'malware',
                       'malware-configuration',
                       'malware-distribution',
                       'masquerade',
diff --git a/intelmq/tests/bots/parsers/github_feed/test_parser.py b/intelmq/tests/bots/parsers/github_feed/test_parser.py
index 68b5cb0b6..f866d8cf2 100644
--- a/intelmq/tests/bots/parsers/github_feed/test_parser.py
+++ b/intelmq/tests/bots/parsers/github_feed/test_parser.py
@@ -23,7 +23,7 @@
     "feed.name": "Strangereal Intel DailyIOC",
     "time.observation": "2019-03-01T01:01:01+00:00",
     "classification.taxonomy": "other",
-    "classification.type": "unknown",
+    "classification.type": "malware",
     "__type": "Event"
 }
 
@@ -49,8 +49,6 @@ def test_no_processing_is_executed_for_the_feed_is_unknown(self):
 
         self.assertRegexpMatchesLog("Unknown feed '{}'.".format(wrong_report['feed.url']))
 
-    # https://github.com/certtools/intelmq/issues/1752
-    @unittest.expectedFailure
     def test_extra_fields_are_present_in_generated_event(self):
         custom_report = EXAMPLE_STRANGEREALINTEL_REPORT.copy()
         custom_report['extra.file_metadata'] = {
@@ -64,8 +62,6 @@ def test_extra_fields_are_present_in_generated_event(self):
         for event in self.get_output_queue():
             assert 'extra.file_metadata.sha' in event and 'extra.file_metadata.size' in event
 
-    # https://github.com/certtools/intelmq/issues/1752
-    @unittest.expectedFailure
     def test_strangerealintel_feed_processing_is_successful(self):
         self.run_bot()
 
@@ -74,7 +70,7 @@ def test_strangerealintel_feed_processing_is_successful(self):
         sha256_event = EXAMPLE_STRANGEREALINTEL_EVENT.copy()
         sha256_event['malware.hash.sha256'] = EXAMPLE_STRANGERINTEL_FILE_JSON[0]['Indicator']
         sha256_event['event_description.text'] = EXAMPLE_STRANGERINTEL_FILE_JSON[0]['Description']
-        sha256_event['classification.taxonomy'] = 'malicious code'
+        sha256_event['classification.taxonomy'] = 'other'
         sha256_event['classification.type'] = 'malware'
         sha256_event['raw'] = utils.base64_encode(str(EXAMPLE_STRANGERINTEL_FILE_JSON[0]))
         self.assertMessageEqual(0, sha256_event)
@@ -82,7 +78,7 @@ def test_strangerealintel_feed_processing_is_successful(self):
         md5_event = EXAMPLE_STRANGEREALINTEL_EVENT.copy()
         md5_event['malware.hash.md5'] = EXAMPLE_STRANGERINTEL_FILE_JSON[1]['Indicator']
         md5_event['event_description.text'] = EXAMPLE_STRANGERINTEL_FILE_JSON[1]['Description']
-        md5_event['classification.taxonomy'] = 'malicious code'
+        md5_event['classification.taxonomy'] = 'other'
         md5_event['classification.type'] = 'malware'
         md5_event['raw'] = utils.base64_encode(str(EXAMPLE_STRANGERINTEL_FILE_JSON[1]))
         self.assertMessageEqual(1, md5_event)

From 177dce8524355ea4cbf9b45e9acc5df783ab02ea Mon Sep 17 00:00:00 2001
From: Sebastian Wagner <wagner@cert.at>
Date: Thu, 29 Apr 2021 14:47:56 +0200
Subject: [PATCH 2/2] DOC: fixes for "other" classification

---
 NEWS.md                                 |  2 +-
 docs/dev/data-format.rst                | 12 +++++++-----
 intelmq/bots/experts/taxonomy/expert.py |  6 +++---
 3 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 164694abe..296ce6e21 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -18,7 +18,7 @@ Both the XMPP collector bot and the XMPP output bot were removed. This [was eval
 and the XMPP bots were deprecated in 391d625.
 
 ### Harmonization
-The classification scheme has been updated to better match the [Reference Security Incident Taxonomy](https://github.com/enisaeu/Reference-Security-Incident-Taxonomy-Task-Force/). The following labels were renamed:
+The classification scheme has been updated to better match the [Reference Security Incident Taxonomy (RSIT)](https://github.com/enisaeu/Reference-Security-Incident-Taxonomy-Task-Force/). The following labels were renamed:
 
 | old taxonomy name | old type name | new taxonomy name | new type name |
 |-|-|-|-|
diff --git a/docs/dev/data-format.rst b/docs/dev/data-format.rst
index 088c9de7b..1e9554bfe 100644
--- a/docs/dev/data-format.rst
+++ b/docs/dev/data-format.rst
@@ -94,7 +94,7 @@ Classification
 
 IntelMQ classifies events using three labels: taxonomy, type and identifier. This tuple of three values can be used for deduplication of events and describes what happened.
 
-The taxonomy can be automatically added by the taxonomy expert bot based on the given type. The following classification scheme follow the `Reference Security Incident Taxonomy <https://github.com/enisaeu/Reference-Security-Incident-Taxonomy-Task-Force/>`_:
+The taxonomy can be automatically added by the taxonomy expert bot based on the given type. The following classification scheme follows the `Reference Security Incident Taxonomy (RSIT) <https://github.com/enisaeu/Reference-Security-Incident-Taxonomy-Task-Force/>`_:
 
 
 ===============================  ========================================= =============================================
@@ -137,13 +137,13 @@ The taxonomy can be automatically added by the taxonomy expert bot based on the
    malicious-code                infected-system                             This is a compromised machine, which has been observed to make a connection to a command and control server.
    malicious-code                malware-configuration                       This is a resource which updates botnet drones with a new configuration.
    malicious-code                malware-distribution                        URI used for malware distribution, e.g. a download URL included in fake invoice malware spam.
-   other                         blacklist                                   Some sources provide blacklists, which clearly refer to abusive behavior, such as spamming, but fail to denote the exact reason why a given identity has been blacklisted. The reason may be that the justification is anecdotal or missing entirely. This type should only be used if the typing fits the definition of a blacklist, but an event specific denomination is not possible for one reason or another.
+   other                         blacklist                                   Some sources provide blacklists, which clearly refer to abusive behavior, such as spamming, but fail to denote the exact reason why a given identity has been blacklisted. The reason may be that the justification is anecdotal or missing entirely. This type should only be used if the typing fits the definition of a blacklist, but an event specific denomination is not possible for one reason or another. Not in RSIT.
    other                         other                                       All incidents which don't fit in one of the given categories should be put into this class.
-   other                         proxy                                       This refers to the use of proxies from inside your network. Not in ENISA eCSIRT-II taxonomy.
-   other                         tor                                         This IOC refers to incidents related to TOR network infrastructure. Not in ENISA eCSIRT-II taxonomy.
    other                         malware                                     An IoC referring to a malware (sample) itself. Not in RSIT.
+   other                         proxy                                       This refers to the use of proxies from inside your network. Not in RSIT.
+   test                          test                                        Meant for testing. Not in RSIT.
+   other                         tor                                         This IOC refers to incidents related to TOR network infrastructure. Not in RSIT.
    other                         undetermined                                The categorisation of the incident is unknown/undetermined.
-   test                          test                                        Meant for testing.
    vulnerable                    ddos-amplifier                              Publicly accessible services that can be abused for conducting DDoS reflection/amplification attacks, e.g. DNS open-resolvers or NTP servers with monlist enabled.
    vulnerable                    information-disclosure                      Publicly accessible services potentially disclosing sensitive information, e.g. SNMP or Redis.
    vulnerable                    potentially-unwanted-accessible             Potentially unwanted publicly accessible services, e.g. Telnet, RDP or VNC.
@@ -151,6 +151,8 @@ The taxonomy can be automatically added by the taxonomy expert bot based on the
    vulnerable                    weak-crypto                                 Publicly accessible services offering weak crypto, e.g. web servers susceptible to POODLE/FREAK attacks.
 ===============================  ========================================= =============================================
 
+In the "other" taxonomy, several types are not in the RSIT, but this taxonomy is intentionally extensible.
+
 Meaning of source, destination and local values for each classification type and possible identifiers. The identifier is often a normalized malware name, grouping many variants.
 +Examples of the meaning of the *source* and *destination* fields for each classification type and possible identifiers are shown here. Usually the main information is in the *source* fields. The identifier is often a normalized malware name, grouping many variants.
 
diff --git a/intelmq/bots/experts/taxonomy/expert.py b/intelmq/bots/experts/taxonomy/expert.py
index d6f2ded83..3ac6af6bc 100644
--- a/intelmq/bots/experts/taxonomy/expert.py
+++ b/intelmq/bots/experts/taxonomy/expert.py
@@ -50,12 +50,12 @@
     "infected-system": "malicious-code",
     "malware-configuration": "malicious-code",
     "malware-distribution": "malicious-code",
-    "blacklist": "other",  # not in ENISA eCSIRT-II taxonomy
+    "blacklist": "other",  # intentionally not in RSIT
     "other": "other",
     "undetermined": "other",
-    "proxy": "other",  # not in ENISA eCSIRT-II taxonomy
-    "tor": "other",  # not in ENISA eCSIRT-II taxonomy
     "malware": "other",  # intentionally not in RSIT
+    "proxy": "other",  # intentionally not in RSIT
+    "tor": "other",  # intentionally not in RSIT
     "test": "test",
     "ddos-amplifier": "vulnerable",
     "information-disclosure": "vulnerable",