DataDog · github-actions · Nov 29, 2024
@@ -1,4 +1,5 @@
 import re
+import string
 
 from ddtrace.internal.logger import get_logger
 from ddtrace.settings.asm import config as asm_config
@@ -16,7 +17,15 @@
 
 log = get_logger(__name__)
 
-REDACTED_SOURCE_BUFFER = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
+REDACTED_SOURCE_BUFFER = string.ascii_letters + string.digits
+LEN_SOURCE_BUFFER = len(REDACTED_SOURCE_BUFFER)
+
+
+def get_redacted_source(length):
+    full_repeats = length // LEN_SOURCE_BUFFER
+    remainder = length % LEN_SOURCE_BUFFER
+    result = REDACTED_SOURCE_BUFFER * full_repeats + REDACTED_SOURCE_BUFFER[:remainder]
+    return result
 
 
 class SensitiveHandler:
@@ -218,7 +227,7 @@ def to_redacted_json(self, evidence_value, sensitive, tainted_ranges, sources):
                 if source_index < len(sources):
                     if not sources[source_index].redacted and self.is_sensible_source(sources[source_index]):
                         redacted_sources.append(source_index)
-                        sources[source_index].pattern = REDACTED_SOURCE_BUFFER[: len(sources[source_index].value)]
+                        sources[source_index].pattern = get_redacted_source(len(sources[source_index].value))
                         sources[source_index].redacted = True
 
                 if source_index in redacted_sources:
@@ -282,7 +291,7 @@ def redact_source(self, sources, redacted_sources, redacted_sources_context, sou
         if source_index is not None:
             if not sources[source_index].redacted:
                 redacted_sources.append(source_index)
-                sources[source_index].pattern = REDACTED_SOURCE_BUFFER[: len(sources[source_index].value)]
+                sources[source_index].pattern = get_redacted_source(len(sources[source_index].value))
                 sources[source_index].redacted = True
 
             if source_index not in redacted_sources_context.keys():
@@ -334,12 +343,12 @@ def write_redacted_value_part(
                         sensitive_start = 0
                     sensitive = _value[sensitive_start : _source_redaction_context["end"] - offset]
                     index_of_part_value_in_pattern = source.value.find(sensitive)
+
                     pattern = (
                         placeholder[index_of_part_value_in_pattern : index_of_part_value_in_pattern + len(sensitive)]
                         if index_of_part_value_in_pattern > -1
                         else placeholder[_source_redaction_context["start"] : _source_redaction_context["end"]]
                     )
-
                     value_parts.append({"redacted": True, "source": source_index, "pattern": pattern})
                     _value = _value[len(pattern) :]
                     offset += len(pattern)

@@ -4,31 +4,33 @@
 
 
 log = get_logger(__name__)
-AUTHORITY = r"^(?:[^:]+:)?//([^@]+)@"
-QUERY_FRAGMENT = r"[?#&]([^=&;]+)=([^?#&]+)"
-pattern = re.compile(f"({AUTHORITY})|({QUERY_FRAGMENT})", re.IGNORECASE | re.MULTILINE)
+AUTHORITY_PATTERN = re.compile(r"https?://([^@]+)(?=@)", re.IGNORECASE | re.MULTILINE)
+QUERY_FRAGMENT_PATTERN = re.compile(r"[?#&]([^=&;]+)=([^?#&]+)", re.IGNORECASE | re.MULTILINE)
 
 
-def url_sensitive_analyzer(evidence, name_pattern=None, value_pattern=None):
-    try:
-        ranges = []
-        regex_result = pattern.search(evidence.value)
+def find_authority(ranges, evidence):
+    regex_result = AUTHORITY_PATTERN.search(evidence.value)
+    while regex_result is not None:
+        if isinstance(regex_result.group(1), str):
+            start = regex_result.start(1)
+            end = regex_result.start(1) + (len(regex_result.group(1)))
+            ranges.append({"start": start, "end": end})
 
-        while regex_result is not None:
-            if isinstance(regex_result.group(1), str):
-                end = regex_result.start() + (len(regex_result.group(0)) - 1)
-                start = end - len(regex_result.group(1))
-                ranges.append({"start": start, "end": end})
+        regex_result = AUTHORITY_PATTERN.search(evidence.value, regex_result.end())
 
-            if isinstance(regex_result.group(3), str):
-                end = regex_result.start() + len(regex_result.group(0))
-                start = end - len(regex_result.group(3))
-                ranges.append({"start": start, "end": end})
 
-            regex_result = pattern.search(evidence.value, regex_result.end())
+def find_query_fragment(ranges, evidence):
+    regex_result = QUERY_FRAGMENT_PATTERN.search(evidence.value)
+    while regex_result is not None:
+        if isinstance(regex_result.group(2), str):
+            start = regex_result.start(2)
+            end = regex_result.start(2) + (len(regex_result.group(2)))
+            ranges.append({"start": start, "end": end})
+        regex_result = QUERY_FRAGMENT_PATTERN.search(evidence.value, regex_result.end())
 
-        return ranges
-    except Exception as e:
-        log.debug(e)
 
-    return []
+def url_sensitive_analyzer(evidence, name_pattern=None, value_pattern=None):
+    ranges = []
+    find_authority(ranges, evidence)
+    find_query_fragment(ranges, evidence)
+    return ranges
@@ -0,0 +1,4 @@
+---
+fixes:
+  - |
+    Code Security: Ensure IAST SSRF vulnerability redacts the url query parameters correctly.
@@ -40,6 +40,7 @@ def test_cmdi_redaction_suite(evidence_input, sources_expected, vulnerabilities_
     source["origin"] = origin_to_str(source["origin"])
 
     assert vulnerability["type"] == VULN_CMDI
+    assert vulnerability["evidence"] == vulnerabilities_expected["evidence"]
     assert source == sources_expected
 
 

@@ -123,4 +123,5 @@ def test_header_injection_redaction_suite(
     source["origin"] = origin_to_str(source["origin"])
 
     assert vulnerability["type"] == VULN_HEADER_INJECTION
+    assert vulnerability["evidence"] == vulnerabilities_expected["evidence"]
     assert source == sources_expected
@@ -19,7 +19,9 @@
 
 
 # FIXME: ideally all these should pass, through the key is that we don't leak any potential PII
-_ignore_list = {46, 47}
+_ignore_list = {
+    46,
+}
 
 
 @pytest.mark.parametrize(

@@ -21,10 +21,9 @@
 
 
 @pytest.mark.parametrize(
-    "evidence_input, sources_expected, vulnerabilities_expected", list(get_parametrize(VULN_SSRF))[0:2]
+    "evidence_input, sources_expected, vulnerabilities_expected", list(get_parametrize(VULN_SSRF, ignore_list={9, 10}))
 )
 def test_ssrf_redaction_suite(evidence_input, sources_expected, vulnerabilities_expected, iast_context_defaults):
-    # TODO: fix get_parametrize(VULN_SSRF)[2:] replacements doesn't work correctly with params of SSRF
     tainted_object = evidence_input_value = evidence_input.get("value", "")
     if evidence_input_value:
         tainted_object = _taint_pyobject_multiranges(
@@ -49,6 +48,7 @@ def test_ssrf_redaction_suite(evidence_input, sources_expected, vulnerabilities_
     source["origin"] = origin_to_str(source["origin"])
 
     assert vulnerability["type"] == VULN_SSRF
+    assert vulnerability["evidence"] == vulnerabilities_expected["evidence"]
     assert source == sources_expected
 
 
@@ -71,15 +71,44 @@ def test_ssrf_redact_param(iast_context_defaults):
     assert result["vulnerabilities"]
     for v in result["vulnerabilities"]:
         assert v["evidence"]["valueParts"] == [
-            {"value": "https://www.domain1.com/"},
-            {"redacted": True},
+            {"value": "https://www.domain1.com/?id="},
             {"pattern": "abcdefgh", "redacted": True, "source": 0},
+            {"value": "&param2="},
             {"redacted": True},
+            {"value": "&param3="},
             {"redacted": True},
+            {"value": "&param3="},
             {"redacted": True},
         ]
 
 
+def test_ssrf_redact_params_log_url(iast_context_defaults):
+    url = (
+        "http://dovahkiin.dovahkiin.naal.ok/zin/los/vahriin/wahdein-vokul-mahfaeraak.ast-vaal-4ee5-aa01-3de356fd2cd8/snapshots/"
+        "f7591b64-9839-4be8-a3a9-e7ccf5c0e17e?schemaVersion=c27f6149-36b6-4f3a-bacb-ba4b8db36892"
+    )
+    tainted_source = taint_pyobject(pyobject=url, source_name="encoded_forward_api", source_value=url)
+
+    ev = Evidence(value=tainted_source)
+
+    loc = Location(path="foobar.py", line=35, spanId=123)
+    v = Vulnerability(type=VULN_SSRF, evidence=ev, location=loc)
+    report = IastSpanReporter(vulnerabilities={v})
+    report.add_ranges_to_evidence_and_extract_sources(v)
+    result = report.build_and_scrub_value_parts()
+
+    assert result["vulnerabilities"]
+    for v in result["vulnerabilities"]:
+        assert v["evidence"]["valueParts"] == [
+            {
+                "source": 0,
+                "value": "http://dovahkiin.dovahkiin.naal.ok/zin/los/vahriin/wahdein-vokul-mahfaeraak.ast-vaal-4ee5-aa01-3de356fd"
+                "2cd8/snapshots/f7591b64-9839-4be8-a3a9-e7ccf5c0e17e?schemaVersion=",
+            },
+            {"pattern": "TUVWXYZ0123456789abcdefghijklmnopqrs", "redacted": True, "source": 0},
+        ]
+
+
 def test_cmdi_redact_user_password(iast_context_defaults):
     user_taint_range = taint_pyobject(pyobject="root", source_name="username", source_value="root")
     password_taint_range = taint_pyobject(
@@ -107,7 +136,12 @@ def test_cmdi_redact_user_password(iast_context_defaults):
         assert v["evidence"]["valueParts"] == [
             {"value": "https://"},
             {"pattern": "abcd", "redacted": True, "source": 0},
-            {"value": ":"},
+            {"redacted": True},
             {"pattern": "abcdefghijklmnopqrs", "redacted": True, "source": 1},
-            {"value": "@domain1.com/?id=&param2=value2&param3=value3&param3=value3"},
+            {"value": "@domain1.com/?id=&param2="},
+            {"redacted": True},
+            {"value": "&param3="},
+            {"redacted": True},
+            {"value": "&param3="},
+            {"redacted": True},
         ]