Skip to content

Commit

Permalink
markup for %3D separator (#175)
Browse files Browse the repository at this point in the history
* 399221f4.java

* 6aae1316

* 6aae1316

* 0f133e09

* upd

* done

* upd

* --fix

* rollback

* fix miss

* correction

* correction2

* True
  • Loading branch information
babenek authored Dec 9, 2024
1 parent 37d6577 commit f144fcb
Show file tree
Hide file tree
Showing 14 changed files with 100 additions and 45 deletions.
18 changes: 9 additions & 9 deletions .ci/benchmark.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
META MD5 414228344bac7e55c5127be7b244e460
DATA MD5 abd9c025d5c323af814fbeb33f469c90
DATA: 16342283 interested lines. MARKUP: 62020 items
META MD5 5bb0a05fd77c2761b8414bba41103939
DATA MD5 9e77a2d9f718f175264ab5a386ae86c4
DATA: 16342283 interested lines. MARKUP: 62022 items
FileType FileNumber ValidLines Positives Negatives Templates
--------------- ------------ ------------ ----------- ----------- -----------
194 28318 71 418 90
Expand Down Expand Up @@ -82,7 +82,7 @@ FileType FileNumber ValidLines Positives Negatives Templat
.ipynb 1 134 5
.j 1 241 4
.j2 30 5530 6 186 10
.java 621 134132 362 1365 171
.java 621 134132 368 1365 171
.jenkinsfile 1 58 2 6
.jinja2 1 64 2
.js 659 536413 531 2497 331
Expand Down Expand Up @@ -222,7 +222,7 @@ FileType FileNumber ValidLines Positives Negatives Templat
.yml 419 36169 559 889 376
.zsh 6 872 12
.zsh-theme 1 97 1
TOTAL: 10232 16342283 12255 49692 5101
TOTAL: 10232 16342283 12261 49692 5101
credsweeper result_cnt : 0, lost_cnt : 0, true_cnt : 0, false_cnt : 0
Rules Positives Negatives Templates Reported TP FP TN FN FPR FNR ACC PRC RCL F1
------------------------------ ----------- ----------- ----------- ---------- ---- ---- ----- ----- -------- -------- -------- ----- -------- ----
Expand All @@ -231,7 +231,7 @@ AWS Client ID 168 21 0
AWS Multi 82 10 0 0 0 10 82 0.000000 1.000000 0.108696 0.000000
AWS S3 Bucket 67 23 0 0 0 23 67 0.000000 1.000000 0.255556 0.000000
Atlassian Old PAT token 27 308 3 0 0 311 27 0.000000 1.000000 0.920118 0.000000
Auth 414 2739 82 0 0 2821 414 0.000000 1.000000 0.872025 0.000000
Auth 417 2739 82 0 0 2821 417 0.000000 1.000000 0.871217 0.000000
Azure Access Token 19 0 0 0 0 0 19 1.000000 0.000000 0.000000
BASE64 Private Key 7 4 0 0 0 4 7 0.000000 1.000000 0.363636 0.000000
BASE64 encoded PEM Private Key 7 0 0 0 0 0 7 1.000000 0.000000 0.000000
Expand All @@ -258,7 +258,7 @@ JSON Web Token 170 61 0
Jira / Confluence PAT token 0 4 0 0 0 4 0 0.000000 1.000000
Jira 2FA 15 6 1 0 0 7 15 0.000000 1.000000 0.318182 0.000000
Key 3909 15717 485 0 0 16202 3909 0.000000 1.000000 0.805629 0.000000
Nonce 91 49 0 0 0 49 91 0.000000 1.000000 0.350000 0.000000
Nonce 93 49 0 0 0 49 93 0.000000 1.000000 0.345070 0.000000
Other 8 7445 1 0 0 7446 8 0.000000 1.000000 0.998927 0.000000
PEM Private Key 1019 1483 0 0 0 1483 1019 0.000000 1.000000 0.592726 0.000000
Password 1869 7536 2680 0 0 10216 1869 0.000000 1.000000 0.845345 0.000000
Expand All @@ -267,8 +267,8 @@ Secret 1297 1576 802
Seed 1 6 0 0 0 6 1 0.000000 1.000000 0.857143 0.000000
Slack Token 4 1 0 0 0 1 4 0.000000 1.000000 0.200000 0.000000
Tencent WeChat API App ID 6 0 0 0 0 0 6 1.000000 0.000000 0.000000
Token 643 4170 454 0 0 4624 643 0.000000 1.000000 0.877919 0.000000
Token 644 4170 454 0 0 4624 644 0.000000 1.000000 0.877752 0.000000
Twilio Credentials 30 39 0 0 0 39 30 0.000000 1.000000 0.565217 0.000000
URL Credentials 210 157 215 0 0 372 210 0.000000 1.000000 0.639175 0.000000
UUID 1069 265 0 0 0 265 1069 0.000000 1.000000 0.198651 0.000000
12255 49692 5101 0 0 0 49692 12255 0.000000 1.000000 0.802170 0.000000
12261 49692 5101 0 0 0 49692 12261 0.000000 1.000000 0.802092 0.000000
5 changes: 4 additions & 1 deletion benchmark/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,17 @@ def get_arguments() -> Namespace:
parser.add_argument("--load",
help=f"skip scan and use prepared output",
dest="load")
parser.add_argument("--fix",
help=f"add/update markup for unknown credentials",
action="store_true")
return parser.parse_args()


def main() -> None:
args = get_arguments()
benchmark = Benchmark()
if args.scanner in SCANNER_LIST:
benchmark.run(args.scanner, args.load)
benchmark.run(args.scanner, args.load, args.fix)
else:
print(f"Please check scanner name (support: {SCANNER_LIST})")

Expand Down
8 changes: 6 additions & 2 deletions benchmark/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,13 @@ def set_cred_data(self) -> str:
subprocess.call(["./venv/bin/python", "download_data.py", "--data_dir", "data"], cwd=cred_data_path)
return cred_data_path

def run(self, scanner_type: str, output: Optional[str] = None) -> None:
def run(self, scanner_type: str, output: Optional[str] = None, fix: Optional[bool] = None) -> None:
if _scanner_type := getattr(ScannerType, scanner_type.strip().upper(), None):
scanner = ScannerFactory.create_scanner(_scanner_type, self.working_dir, self.cred_data_path, bool(output))
scanner = ScannerFactory.create_scanner(_scanner_type,
self.working_dir,
self.cred_data_path,
bool(output),
bool(fix))
else:
raise RuntimeError(f"Wrong scanner_type='{scanner_type}'")
if output:
Expand Down
7 changes: 4 additions & 3 deletions benchmark/scanner/credential_digger.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@

class CredentialDigger(Scanner):

def __init__(self, working_dir: str, cred_data_dir: str, preload: bool) -> None:
super().__init__(ScannerType.CREDENTIAL_DIGGER, URL.CREDENTIAL_DIGGER, working_dir, cred_data_dir, preload)
def __init__(self, working_dir: str, cred_data_dir: str, preload: bool, fix: bool) -> None:
super().__init__(ScannerType.CREDENTIAL_DIGGER, URL.CREDENTIAL_DIGGER, working_dir, cred_data_dir, preload, fix)
self.output_dir: str = f"{self.scanner_dir}/output.db"
self.working_dir: str = working_dir

Expand Down Expand Up @@ -64,4 +64,5 @@ def parse_result(self) -> None:
line_data = {"file_name": data[1], "line_number": data[2]}
if line_data["file_name"].split("/")[-1] == "LICENSE" or "COPYING" in line_data["file_name"].split("/")[-1]:
continue
_, _, _ = self.check_line_from_meta(line_data["file_name"], line_data["line_number"], line_data["line_number"])
_, _, _ = self.check_line_from_meta(line_data["file_name"], line_data["line_number"],
line_data["line_number"])
4 changes: 2 additions & 2 deletions benchmark/scanner/credsweeper.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ class CredSweeper(Scanner):
LineStatus.NOT_IN_DB: 'N',
LineStatus.CHECKED: 'C'}

def __init__(self, working_dir: str, cred_data_dir: str, preload: bool) -> None:
super().__init__(ScannerType.CREDSWEEPER, URL.CREDSWEEPER, working_dir, cred_data_dir, preload)
def __init__(self, working_dir: str, cred_data_dir: str, preload: bool, fix:bool) -> None:
super().__init__(ScannerType.CREDSWEEPER, URL.CREDSWEEPER, working_dir, cred_data_dir, preload, fix)
self.output_dir: str = f"{self.scanner_dir}/output.json"

@property
Expand Down
4 changes: 2 additions & 2 deletions benchmark/scanner/detect_secrets.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@


class DetectSecrets(Scanner):
def __init__(self, working_dir, cred_data_dir, preload: bool):
super().__init__(ScannerType.DETECT_SECRETS, URL.DETECT_SECRETS, working_dir, cred_data_dir, preload)
def __init__(self, working_dir, cred_data_dir, preload: bool, fix: bool):
super().__init__(ScannerType.DETECT_SECRETS, URL.DETECT_SECRETS, working_dir, cred_data_dir, preload, fix)
self.output_dir: str = f"{self.scanner_dir}/output.json"

@property
Expand Down
6 changes: 3 additions & 3 deletions benchmark/scanner/gitleaks.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@


class Gitleaks(Scanner):
def __init__(self, working_dir, cred_data_dir, preload: bool):
super().__init__(ScannerType.GITLEAKS, URL.GITLEAKS, working_dir, cred_data_dir, preload)
def __init__(self, working_dir, cred_data_dir, preload: bool, fix: bool):
super().__init__(ScannerType.GITLEAKS, URL.GITLEAKS, working_dir, cred_data_dir, preload, fix)
self.output_dir: str = f"{self.scanner_dir}/output.json"

@property
Expand All @@ -26,7 +26,7 @@ def init_scanner(self) -> None:
def run_scanner(self) -> None:
self.init_scanner()
subprocess.call([self.gitleaks_path, "--no-git", "-p"
f"{self.cred_data_dir}/data", "-o", self.output_dir],
f"{self.cred_data_dir}/data", "-o", self.output_dir],
cwd=self.scanner_dir)

def parse_result(self) -> None:
Expand Down
39 changes: 38 additions & 1 deletion benchmark/scanner/scanner.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import binascii
import hashlib
import os
import subprocess
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Tuple, Dict, List, Any
Expand All @@ -16,10 +17,11 @@

class Scanner(ABC):
def __init__(self, scanner_type: ScannerType, scanner_url: str, working_dir: str, cred_data_dir: str,
preload: bool) -> None:
preload: bool, fix: bool) -> None:
self.scanner_type = scanner_type
self.scanner_dir: str = GitService.set_scanner_up_to_date(working_dir, scanner_url, preload)
self.cred_data_dir: str = cred_data_dir
self.fix = fix
self.line_checker: set = set()
self.result_cnt: int = 0
self.lost_cnt: int = 0
Expand Down Expand Up @@ -263,6 +265,36 @@ def check_line_from_meta(self,
self.lost_cnt += 1
self.meta_next_id += 1
print(f"NOT FOUND WITH KEY: {approximate}", flush=True)
if self.fix:
with open(f"{self.cred_data_dir}/meta/{project_id}.csv", "a") as f:
f.write(f"{str(approximate)}\n")
lost_meta = MetaRow({
"Id": self.meta_next_id,
"FileID": file_id,
"Domain": "GitHub",
"RepoName": project_id,
"FilePath": data_path,
"LineStart": line_start,
"LineEnd": line_end,
"GroundTruth": 'F',
"WithWords": 'F',
"ValueStart": value_start,
"ValueEnd": value_end,
"InURL": 'F',
"InRuntimeParameter": 'F',
"CharacterSet": '',
"CryptographyKey": '',
"PredefinedPattern": '',
"VariableNameType": '',
"Entropy": 0.0,
"Length": 0,
"Base64Encode": 'F',
"HexEncode": 'F',
"URLEncode": 'F',
"Category": rule
})
self.meta[MetaKey(data_path, line_start, line_end)] = [lost_meta]

return LineStatus.NOT_IN_DB, project_id, file_id

suggestion = "LOST:"
Expand Down Expand Up @@ -326,6 +358,11 @@ def check_line_from_meta(self,
return LineStatus.TRUE, project_id, file_id
else:
print(f"WARNING: '{rule}' is not mentioned in {row}")
if self.fix:
subprocess.check_call(
["sed", "-i",
f"s/{row.Id},\\(.*\\)/{row.Id},\\1:{rule}/",
f"{self.cred_data_dir}/meta/{row.RepoName}.csv"])
# meta has no markup for given credential
self.lost_cnt += 1
print(f"{suggestion} {approximate}", flush=True)
Expand Down
23 changes: 14 additions & 9 deletions benchmark/scanner/scanner_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,28 +4,33 @@

class ScannerFactory:
@classmethod
def create_scanner(cls, scanner_type: ScannerType, working_dir: str, cred_data_dir: str, preload: bool) -> Scanner:
def create_scanner(cls,
scanner_type: ScannerType,
working_dir: str,
cred_data_dir: str,
preload: bool,
fix: bool) -> Scanner:
if scanner_type == ScannerType.CREDSWEEPER:
from benchmark.scanner import CredSweeper
return CredSweeper(working_dir, cred_data_dir, preload)
return CredSweeper(working_dir, cred_data_dir, preload, fix)
elif scanner_type == ScannerType.DETECT_SECRETS:
from benchmark.scanner import DetectSecrets
return DetectSecrets(working_dir, cred_data_dir, preload)
return DetectSecrets(working_dir, cred_data_dir, preload, fix)
elif scanner_type == ScannerType.GITLEAKS:
from benchmark.scanner import Gitleaks
return Gitleaks(working_dir, cred_data_dir, preload)
return Gitleaks(working_dir, cred_data_dir, preload, fix)
elif scanner_type == ScannerType.SHHGIT:
from benchmark.scanner import Shhgit
return Shhgit(working_dir, cred_data_dir, preload)
return Shhgit(working_dir, cred_data_dir, preload, fix)
elif scanner_type == ScannerType.CREDENTIAL_DIGGER:
from benchmark.scanner import CredentialDigger
return CredentialDigger(working_dir, cred_data_dir, preload)
return CredentialDigger(working_dir, cred_data_dir, preload, fix)
elif scanner_type == ScannerType.WRAITH:
from benchmark.scanner import Wraith
return Wraith(working_dir, cred_data_dir, preload)
return Wraith(working_dir, cred_data_dir, preload, fix)
elif scanner_type == ScannerType.TRUFFLEHOG3:
from benchmark.scanner import TruffleHog3
return TruffleHog3(working_dir, cred_data_dir, preload)
return TruffleHog3(working_dir, cred_data_dir, preload, fix)
elif scanner_type == ScannerType.TRUFFLEHOG:
from benchmark.scanner import TruffleHog
return TruffleHog(working_dir, cred_data_dir, preload)
return TruffleHog(working_dir, cred_data_dir, preload, fix)
4 changes: 2 additions & 2 deletions benchmark/scanner/shhgit.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@


class Shhgit(Scanner):
def __init__(self, working_dir: str, cred_data_dir: str, preload: bool) -> None:
super().__init__(ScannerType.SHHGIT, URL.SHHGIT, working_dir, cred_data_dir, preload)
def __init__(self, working_dir: str, cred_data_dir: str, preload: bool, fix: bool) -> None:
super().__init__(ScannerType.SHHGIT, URL.SHHGIT, working_dir, cred_data_dir, preload, fix)
self.output_dir = f"{self.scanner_dir}/output.csv"

@property
Expand Down
4 changes: 2 additions & 2 deletions benchmark/scanner/trufflehog.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@


class TruffleHog(Scanner):
def __init__(self, working_dir, cred_data_dir, preload: bool):
super().__init__(ScannerType.TRUFFLEHOG, URL.TRUFFLEHOG, working_dir, cred_data_dir, preload)
def __init__(self, working_dir, cred_data_dir, preload: bool, fix: bool):
super().__init__(ScannerType.TRUFFLEHOG, URL.TRUFFLEHOG, working_dir, cred_data_dir, preload, fix)
self.output_dir: str = f"{self.scanner_dir}/output.json"

@property
Expand Down
9 changes: 5 additions & 4 deletions benchmark/scanner/trufflehog3.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@


class TruffleHog3(Scanner):
def __init__(self, working_dir: str, cred_data_dir: str, preload: bool) -> None:
super().__init__(ScannerType.TRUFFLEHOG3, URL.TRUFFLEHOG3, working_dir, cred_data_dir, preload)
def __init__(self, working_dir: str, cred_data_dir: str, preload: bool, fix: bool) -> None:
super().__init__(ScannerType.TRUFFLEHOG3, URL.TRUFFLEHOG3, working_dir, cred_data_dir, preload, fix)
self.output_dir = f"{self.scanner_dir}/output.json"
if os.path.exists(self.output_dir):
os.remove(self.output_dir)
Expand All @@ -33,7 +33,7 @@ def run_scanner(self) -> None:
"./venv/bin/trufflehog3", f"{self.cred_data_dir}/data/", "-o", self.output_dir, "-f", "json",
"--line-numbers"
],
cwd=self.scanner_dir)
cwd=self.scanner_dir)

def parse_result(self) -> None:
with open(self.output_dir, "r") as f:
Expand All @@ -44,4 +44,5 @@ def parse_result(self) -> None:
line_data = {"path": data["path"], "line_number": int(line.split(" ")[0])}
if line_data["path"].split("/")[-1] == "LICENSE":
continue
_, _, _ = self.check_line_from_meta(line_data["path"], line_data["line_number"], line_data["line_number"])
_, _, _ = self.check_line_from_meta(line_data["path"], line_data["line_number"],
line_data["line_number"])
Loading

0 comments on commit f144fcb

Please sign in to comment.