From f144fcb33870774519db04190ec9cae6ede7ba3e Mon Sep 17 00:00:00 2001 From: Roman Babenko Date: Mon, 9 Dec 2024 13:55:15 +0200 Subject: [PATCH] markup for %3D separator (#175) * 399221f4.java * 6aae1316 * 6aae1316 * 0f133e09 * upd * done * upd * --fix * rollback * fix miss * correction * correction2 * True --- .ci/benchmark.txt | 18 ++++++------ benchmark/__main__.py | 5 +++- benchmark/app.py | 8 ++++-- benchmark/scanner/credential_digger.py | 7 +++-- benchmark/scanner/credsweeper.py | 4 +-- benchmark/scanner/detect_secrets.py | 4 +-- benchmark/scanner/gitleaks.py | 6 ++-- benchmark/scanner/scanner.py | 39 +++++++++++++++++++++++++- benchmark/scanner/scanner_factory.py | 23 +++++++++------ benchmark/scanner/shhgit.py | 4 +-- benchmark/scanner/trufflehog.py | 4 +-- benchmark/scanner/trufflehog3.py | 9 +++--- benchmark/scanner/wraith.py | 11 ++++---- meta/ec138349.csv | 3 ++ 14 files changed, 100 insertions(+), 45 deletions(-) diff --git a/.ci/benchmark.txt b/.ci/benchmark.txt index 14ce3f0c5..d1092ee57 100644 --- a/.ci/benchmark.txt +++ b/.ci/benchmark.txt @@ -1,6 +1,6 @@ -META MD5 414228344bac7e55c5127be7b244e460 -DATA MD5 abd9c025d5c323af814fbeb33f469c90 -DATA: 16342283 interested lines. MARKUP: 62020 items +META MD5 5bb0a05fd77c2761b8414bba41103939 +DATA MD5 9e77a2d9f718f175264ab5a386ae86c4 +DATA: 16342283 interested lines. MARKUP: 62022 items FileType FileNumber ValidLines Positives Negatives Templates --------------- ------------ ------------ ----------- ----------- ----------- 194 28318 71 418 90 @@ -82,7 +82,7 @@ FileType FileNumber ValidLines Positives Negatives Templat .ipynb 1 134 5 .j 1 241 4 .j2 30 5530 6 186 10 -.java 621 134132 362 1365 171 +.java 621 134132 368 1365 171 .jenkinsfile 1 58 2 6 .jinja2 1 64 2 .js 659 536413 531 2497 331 @@ -222,7 +222,7 @@ FileType FileNumber ValidLines Positives Negatives Templat .yml 419 36169 559 889 376 .zsh 6 872 12 .zsh-theme 1 97 1 -TOTAL: 10232 16342283 12255 49692 5101 +TOTAL: 10232 16342283 12261 49692 5101 credsweeper result_cnt : 0, lost_cnt : 0, true_cnt : 0, false_cnt : 0 Rules Positives Negatives Templates Reported TP FP TN FN FPR FNR ACC PRC RCL F1 ------------------------------ ----------- ----------- ----------- ---------- ---- ---- ----- ----- -------- -------- -------- ----- -------- ---- @@ -231,7 +231,7 @@ AWS Client ID 168 21 0 AWS Multi 82 10 0 0 0 10 82 0.000000 1.000000 0.108696 0.000000 AWS S3 Bucket 67 23 0 0 0 23 67 0.000000 1.000000 0.255556 0.000000 Atlassian Old PAT token 27 308 3 0 0 311 27 0.000000 1.000000 0.920118 0.000000 -Auth 414 2739 82 0 0 2821 414 0.000000 1.000000 0.872025 0.000000 +Auth 417 2739 82 0 0 2821 417 0.000000 1.000000 0.871217 0.000000 Azure Access Token 19 0 0 0 0 0 19 1.000000 0.000000 0.000000 BASE64 Private Key 7 4 0 0 0 4 7 0.000000 1.000000 0.363636 0.000000 BASE64 encoded PEM Private Key 7 0 0 0 0 0 7 1.000000 0.000000 0.000000 @@ -258,7 +258,7 @@ JSON Web Token 170 61 0 Jira / Confluence PAT token 0 4 0 0 0 4 0 0.000000 1.000000 Jira 2FA 15 6 1 0 0 7 15 0.000000 1.000000 0.318182 0.000000 Key 3909 15717 485 0 0 16202 3909 0.000000 1.000000 0.805629 0.000000 -Nonce 91 49 0 0 0 49 91 0.000000 1.000000 0.350000 0.000000 +Nonce 93 49 0 0 0 49 93 0.000000 1.000000 0.345070 0.000000 Other 8 7445 1 0 0 7446 8 0.000000 1.000000 0.998927 0.000000 PEM Private Key 1019 1483 0 0 0 1483 1019 0.000000 1.000000 0.592726 0.000000 Password 1869 7536 2680 0 0 10216 1869 0.000000 1.000000 0.845345 0.000000 @@ -267,8 +267,8 @@ Secret 1297 1576 802 Seed 1 6 0 0 0 6 1 0.000000 1.000000 0.857143 0.000000 Slack Token 4 1 0 0 0 1 4 0.000000 1.000000 0.200000 0.000000 Tencent WeChat API App ID 6 0 0 0 0 0 6 1.000000 0.000000 0.000000 -Token 643 4170 454 0 0 4624 643 0.000000 1.000000 0.877919 0.000000 +Token 644 4170 454 0 0 4624 644 0.000000 1.000000 0.877752 0.000000 Twilio Credentials 30 39 0 0 0 39 30 0.000000 1.000000 0.565217 0.000000 URL Credentials 210 157 215 0 0 372 210 0.000000 1.000000 0.639175 0.000000 UUID 1069 265 0 0 0 265 1069 0.000000 1.000000 0.198651 0.000000 - 12255 49692 5101 0 0 0 49692 12255 0.000000 1.000000 0.802170 0.000000 + 12261 49692 5101 0 0 0 49692 12261 0.000000 1.000000 0.802092 0.000000 diff --git a/benchmark/__main__.py b/benchmark/__main__.py index 285d4cacd..7c9f4f20e 100644 --- a/benchmark/__main__.py +++ b/benchmark/__main__.py @@ -18,6 +18,9 @@ def get_arguments() -> Namespace: parser.add_argument("--load", help=f"skip scan and use prepared output", dest="load") + parser.add_argument("--fix", + help=f"add/update markup for unknown credetials", + action="store_true") return parser.parse_args() @@ -25,7 +28,7 @@ def main() -> None: args = get_arguments() benchmark = Benchmark() if args.scanner in SCANNER_LIST: - benchmark.run(args.scanner, args.load) + benchmark.run(args.scanner, args.load, args.fix) else: print(f"Please check scanner name (support: {SCANNER_LIST})") diff --git a/benchmark/app.py b/benchmark/app.py index 6691829e6..b2f688a01 100644 --- a/benchmark/app.py +++ b/benchmark/app.py @@ -35,9 +35,13 @@ def set_cred_data(self) -> str: subprocess.call(["./venv/bin/python", "download_data.py", "--data_dir", "data"], cwd=cred_data_path) return cred_data_path - def run(self, scanner_type: str, output: Optional[str] = None) -> None: + def run(self, scanner_type: str, output: Optional[str] = None, fix: Optional[bool] = None) -> None: if _scanner_type := getattr(ScannerType, scanner_type.strip().upper(), None): - scanner = ScannerFactory.create_scanner(_scanner_type, self.working_dir, self.cred_data_path, bool(output)) + scanner = ScannerFactory.create_scanner(_scanner_type, + self.working_dir, + self.cred_data_path, + bool(output), + bool(fix)) else: raise RuntimeError(f"Wrong scanner_type='{scanner_type}'") if output: diff --git a/benchmark/scanner/credential_digger.py b/benchmark/scanner/credential_digger.py index 63206ed13..736efc71f 100644 --- a/benchmark/scanner/credential_digger.py +++ b/benchmark/scanner/credential_digger.py @@ -8,8 +8,8 @@ class CredentialDigger(Scanner): - def __init__(self, working_dir: str, cred_data_dir: str, preload: bool) -> None: - super().__init__(ScannerType.CREDENTIAL_DIGGER, URL.CREDENTIAL_DIGGER, working_dir, cred_data_dir, preload) + def __init__(self, working_dir: str, cred_data_dir: str, preload: bool, fix: bool) -> None: + super().__init__(ScannerType.CREDENTIAL_DIGGER, URL.CREDENTIAL_DIGGER, working_dir, cred_data_dir, preload, fix) self.output_dir: str = f"{self.scanner_dir}/output.db" self.working_dir: str = working_dir @@ -64,4 +64,5 @@ def parse_result(self) -> None: line_data = {"file_name": data[1], "line_number": data[2]} if line_data["file_name"].split("/")[-1] == "LICENSE" or "COPYING" in line_data["file_name"].split("/")[-1]: continue - _, _, _ = self.check_line_from_meta(line_data["file_name"], line_data["line_number"], line_data["line_number"]) + _, _, _ = self.check_line_from_meta(line_data["file_name"], line_data["line_number"], + line_data["line_number"]) diff --git a/benchmark/scanner/credsweeper.py b/benchmark/scanner/credsweeper.py index be69bf7ac..5d34b824a 100644 --- a/benchmark/scanner/credsweeper.py +++ b/benchmark/scanner/credsweeper.py @@ -12,8 +12,8 @@ class CredSweeper(Scanner): LineStatus.NOT_IN_DB: 'N', LineStatus.CHECKED: 'C'} - def __init__(self, working_dir: str, cred_data_dir: str, preload: bool) -> None: - super().__init__(ScannerType.CREDSWEEPER, URL.CREDSWEEPER, working_dir, cred_data_dir, preload) + def __init__(self, working_dir: str, cred_data_dir: str, preload: bool, fix:bool) -> None: + super().__init__(ScannerType.CREDSWEEPER, URL.CREDSWEEPER, working_dir, cred_data_dir, preload, fix) self.output_dir: str = f"{self.scanner_dir}/output.json" @property diff --git a/benchmark/scanner/detect_secrets.py b/benchmark/scanner/detect_secrets.py index 617fde616..2afe88b29 100644 --- a/benchmark/scanner/detect_secrets.py +++ b/benchmark/scanner/detect_secrets.py @@ -9,8 +9,8 @@ class DetectSecrets(Scanner): - def __init__(self, working_dir, cred_data_dir, preload: bool): - super().__init__(ScannerType.DETECT_SECRETS, URL.DETECT_SECRETS, working_dir, cred_data_dir, preload) + def __init__(self, working_dir, cred_data_dir, preload: bool, fix: bool): + super().__init__(ScannerType.DETECT_SECRETS, URL.DETECT_SECRETS, working_dir, cred_data_dir, preload, fix) self.output_dir: str = f"{self.scanner_dir}/output.json" @property diff --git a/benchmark/scanner/gitleaks.py b/benchmark/scanner/gitleaks.py index 3fe1701f1..978ff9622 100644 --- a/benchmark/scanner/gitleaks.py +++ b/benchmark/scanner/gitleaks.py @@ -8,8 +8,8 @@ class Gitleaks(Scanner): - def __init__(self, working_dir, cred_data_dir, preload: bool): - super().__init__(ScannerType.GITLEAKS, URL.GITLEAKS, working_dir, cred_data_dir, preload) + def __init__(self, working_dir, cred_data_dir, preload: bool, fix: bool): + super().__init__(ScannerType.GITLEAKS, URL.GITLEAKS, working_dir, cred_data_dir, preload, fix) self.output_dir: str = f"{self.scanner_dir}/output.json" @property @@ -26,7 +26,7 @@ def init_scanner(self) -> None: def run_scanner(self) -> None: self.init_scanner() subprocess.call([self.gitleaks_path, "--no-git", "-p" - f"{self.cred_data_dir}/data", "-o", self.output_dir], + f"{self.cred_data_dir}/data", "-o", self.output_dir], cwd=self.scanner_dir) def parse_result(self) -> None: diff --git a/benchmark/scanner/scanner.py b/benchmark/scanner/scanner.py index 7728c1e1b..81c68c966 100644 --- a/benchmark/scanner/scanner.py +++ b/benchmark/scanner/scanner.py @@ -1,6 +1,7 @@ import binascii import hashlib import os +import subprocess from abc import ABC, abstractmethod from pathlib import Path from typing import Tuple, Dict, List, Any @@ -16,10 +17,11 @@ class Scanner(ABC): def __init__(self, scanner_type: ScannerType, scanner_url: str, working_dir: str, cred_data_dir: str, - preload: bool) -> None: + preload: bool, fix: bool) -> None: self.scanner_type = scanner_type self.scanner_dir: str = GitService.set_scanner_up_to_date(working_dir, scanner_url, preload) self.cred_data_dir: str = cred_data_dir + self.fix = fix self.line_checker: set = set() self.result_cnt: int = 0 self.lost_cnt: int = 0 @@ -263,6 +265,36 @@ def check_line_from_meta(self, self.lost_cnt += 1 self.meta_next_id += 1 print(f"NOT FOUND WITH KEY: {approximate}", flush=True) + if self.fix: + with open(f"{self.cred_data_dir}/meta/{project_id}.csv", "a") as f: + f.write(f"{str(approximate)}\n") + lost_meta = MetaRow({ + "Id": self.meta_next_id, + "FileID": file_id, + "Domain": "GitHub", + "RepoName": project_id, + "FilePath": data_path, + "LineStart": line_start, + "LineEnd": line_end, + "GroundTruth": 'F', + "WithWords": 'F', + "ValueStart": value_start, + "ValueEnd": value_end, + "InURL": 'F', + "InRuntimeParameter": 'F', + "CharacterSet": '', + "CryptographyKey": '', + "PredefinedPattern": '', + "VariableNameType": '', + "Entropy": 0.0, + "Length": 0, + "Base64Encode": 'F', + "HexEncode": 'F', + "URLEncode": 'F', + "Category": rule + }) + self.meta[MetaKey(data_path, line_start, line_end)] = [lost_meta] + return LineStatus.NOT_IN_DB, project_id, file_id suggestion = "LOST:" @@ -326,6 +358,11 @@ def check_line_from_meta(self, return LineStatus.TRUE, project_id, file_id else: print(f"WARNING: '{rule}' is not mentioned in {row}") + if self.fix: + subprocess.check_call( + ["sed", "-i", + f"s/{row.Id},\\(.*\\)/{row.Id},\\1:{rule}/", + f"{self.cred_data_dir}/meta/{row.RepoName}.csv"]) # meta has no markup for given credential self.lost_cnt += 1 print(f"{suggestion} {approximate}", flush=True) diff --git a/benchmark/scanner/scanner_factory.py b/benchmark/scanner/scanner_factory.py index 3f195c5e0..78a447aa7 100644 --- a/benchmark/scanner/scanner_factory.py +++ b/benchmark/scanner/scanner_factory.py @@ -4,28 +4,33 @@ class ScannerFactory: @classmethod - def create_scanner(cls, scanner_type: ScannerType, working_dir: str, cred_data_dir: str, preload: bool) -> Scanner: + def create_scanner(cls, + scanner_type: ScannerType, + working_dir: str, + cred_data_dir: str, + preload: bool, + fix: bool) -> Scanner: if scanner_type == ScannerType.CREDSWEEPER: from benchmark.scanner import CredSweeper - return CredSweeper(working_dir, cred_data_dir, preload) + return CredSweeper(working_dir, cred_data_dir, preload, fix) elif scanner_type == ScannerType.DETECT_SECRETS: from benchmark.scanner import DetectSecrets - return DetectSecrets(working_dir, cred_data_dir, preload) + return DetectSecrets(working_dir, cred_data_dir, preload, fix) elif scanner_type == ScannerType.GITLEAKS: from benchmark.scanner import Gitleaks - return Gitleaks(working_dir, cred_data_dir, preload) + return Gitleaks(working_dir, cred_data_dir, preload, fix) elif scanner_type == ScannerType.SHHGIT: from benchmark.scanner import Shhgit - return Shhgit(working_dir, cred_data_dir, preload) + return Shhgit(working_dir, cred_data_dir, preload, fix) elif scanner_type == ScannerType.CREDENTIAL_DIGGER: from benchmark.scanner import CredentialDigger - return CredentialDigger(working_dir, cred_data_dir, preload) + return CredentialDigger(working_dir, cred_data_dir, preload, fix) elif scanner_type == ScannerType.WRAITH: from benchmark.scanner import Wraith - return Wraith(working_dir, cred_data_dir, preload) + return Wraith(working_dir, cred_data_dir, preload, fix) elif scanner_type == ScannerType.TRUFFLEHOG3: from benchmark.scanner import TruffleHog3 - return TruffleHog3(working_dir, cred_data_dir, preload) + return TruffleHog3(working_dir, cred_data_dir, preload, fix) elif scanner_type == ScannerType.TRUFFLEHOG: from benchmark.scanner import TruffleHog - return TruffleHog(working_dir, cred_data_dir, preload) + return TruffleHog(working_dir, cred_data_dir, preload, fix) diff --git a/benchmark/scanner/shhgit.py b/benchmark/scanner/shhgit.py index c37a8989f..e1d6abc70 100644 --- a/benchmark/scanner/shhgit.py +++ b/benchmark/scanner/shhgit.py @@ -8,8 +8,8 @@ class Shhgit(Scanner): - def __init__(self, working_dir: str, cred_data_dir: str, preload: bool) -> None: - super().__init__(ScannerType.SHHGIT, URL.SHHGIT, working_dir, cred_data_dir, preload) + def __init__(self, working_dir: str, cred_data_dir: str, preload: bool, fix: bool) -> None: + super().__init__(ScannerType.SHHGIT, URL.SHHGIT, working_dir, cred_data_dir, preload, fix) self.output_dir = f"{self.scanner_dir}/output.csv" @property diff --git a/benchmark/scanner/trufflehog.py b/benchmark/scanner/trufflehog.py index a71d450ef..72af6c718 100644 --- a/benchmark/scanner/trufflehog.py +++ b/benchmark/scanner/trufflehog.py @@ -9,8 +9,8 @@ class TruffleHog(Scanner): - def __init__(self, working_dir, cred_data_dir, preload: bool): - super().__init__(ScannerType.TRUFFLEHOG, URL.TRUFFLEHOG, working_dir, cred_data_dir, preload) + def __init__(self, working_dir, cred_data_dir, preload: bool, fix: bool): + super().__init__(ScannerType.TRUFFLEHOG, URL.TRUFFLEHOG, working_dir, cred_data_dir, preload, fix) self.output_dir: str = f"{self.scanner_dir}/output.json" @property diff --git a/benchmark/scanner/trufflehog3.py b/benchmark/scanner/trufflehog3.py index d291cf0a7..5e4a3fc03 100644 --- a/benchmark/scanner/trufflehog3.py +++ b/benchmark/scanner/trufflehog3.py @@ -8,8 +8,8 @@ class TruffleHog3(Scanner): - def __init__(self, working_dir: str, cred_data_dir: str, preload: bool) -> None: - super().__init__(ScannerType.TRUFFLEHOG3, URL.TRUFFLEHOG3, working_dir, cred_data_dir, preload) + def __init__(self, working_dir: str, cred_data_dir: str, preload: bool, fix: bool) -> None: + super().__init__(ScannerType.TRUFFLEHOG3, URL.TRUFFLEHOG3, working_dir, cred_data_dir, preload, fix) self.output_dir = f"{self.scanner_dir}/output.json" if os.path.exists(self.output_dir): os.remove(self.output_dir) @@ -33,7 +33,7 @@ def run_scanner(self) -> None: "./venv/bin/trufflehog3", f"{self.cred_data_dir}/data/", "-o", self.output_dir, "-f", "json", "--line-numbers" ], - cwd=self.scanner_dir) + cwd=self.scanner_dir) def parse_result(self) -> None: with open(self.output_dir, "r") as f: @@ -44,4 +44,5 @@ def parse_result(self) -> None: line_data = {"path": data["path"], "line_number": int(line.split(" ")[0])} if line_data["path"].split("/")[-1] == "LICENSE": continue - _, _, _ = self.check_line_from_meta(line_data["path"], line_data["line_number"], line_data["line_number"]) + _, _, _ = self.check_line_from_meta(line_data["path"], line_data["line_number"], + line_data["line_number"]) diff --git a/benchmark/scanner/wraith.py b/benchmark/scanner/wraith.py index 245ad1add..ca1fd5da4 100644 --- a/benchmark/scanner/wraith.py +++ b/benchmark/scanner/wraith.py @@ -8,8 +8,8 @@ class Wraith(Scanner): - def __init__(self, working_dir: str, cred_data_dir: str, preload: bool) -> None: - super().__init__(ScannerType.WRAITH, URL.WRAITH, working_dir, cred_data_dir, preload) + def __init__(self, working_dir: str, cred_data_dir: str, preload: bool, fix: bool) -> None: + super().__init__(ScannerType.WRAITH, URL.WRAITH, working_dir, cred_data_dir, preload, fix) self.output_dir = f"{self.scanner_dir}/output.json" self.working_dir = working_dir @@ -40,8 +40,8 @@ def run_scanner(self) -> None: f"{self.cred_data_dir}/data/", "--scan-tests", "--json", "--num-threads", str(os.cpu_count() * 2) ], - cwd=self.scanner_dir, - universal_newlines=True) + cwd=self.scanner_dir, + universal_newlines=True) with open(self.output_dir, "w") as f: f.write(self.output_lines) @@ -53,4 +53,5 @@ def parse_result(self) -> None: if line_data["FilePath"].split("/")[-1] == "LICENSE": continue - _, _, _ = self.check_line_from_meta(line_data["FilePath"], int(line_data["LineNumber"]), int(line_data["LineNumber"])) + _, _, _ = self.check_line_from_meta(line_data["FilePath"], int(line_data["LineNumber"]), + int(line_data["LineNumber"])) diff --git a/meta/ec138349.csv b/meta/ec138349.csv index 6d71f39b2..acd04d719 100644 --- a/meta/ec138349.csv +++ b/meta/ec138349.csv @@ -184,3 +184,6 @@ Id,FileID,Domain,RepoName,FilePath,LineStart,LineEnd,GroundTruth,WithWords,Value 1338530,18b43943,GitHub,ec138349,data/ec138349/test/18b43943.java,47,47,F,F,6,101,F,F,,,,,0.0,0,F,F,F,JSON Web Token 1338573,2f9b15a9,GitHub,ec138349,data/ec138349/test/2f9b15a9.java,125,125,Template,F,58,65,F,F,,,,,0.0,0,F,F,F,Auth:Token 1338575,2f9b15a9,GitHub,ec138349,data/ec138349/test/2f9b15a9.java,158,158,F,F,58,68,F,F,,,,,0.0,0,F,F,F,Auth:Token +1480452,399221f4,GitHub,ec138349,data/ec138349/test/399221f4.java,52,52,T,F,135,171,F,F,,,,,0.0,0,F,F,F,Auth:Nonce +1480456,399221f4,GitHub,ec138349,data/ec138349/test/399221f4.java,64,64,T,F,148,167,F,F,,,,,0.0,0,F,F,F,Auth:Nonce +1480457,399221f4,GitHub,ec138349,data/ec138349/test/399221f4.java,52,52,T,F,256,292,F,F,,,,,0.0,0,F,F,F,Auth:Token