Commit da84d62

Workarounds for bugs in pygments shell lexer and first steps in filetype guessing

ntoskernel committed Jun 26, 2023
1 parent cb7f1a2 commit da84d62
Showing 7 changed files with 115 additions and 4 deletions.
8 changes: 8 additions & 0 deletions deepsecrets/core/model/file.py
@@ -3,6 +3,7 @@

from deepsecrets import logger
from deepsecrets.core.utils.fs import get_abspath
from deepsecrets.core.utils.guess_filetype import FileTypeGuesser


class File:
@@ -14,6 +15,7 @@ class File:
line_contents_cache: Dict[int, str] = {}
empty: bool
extension: Optional[str]
guessed_extension: Optional[str]

def __init__(
self,
@@ -41,6 +43,9 @@ def __init__(
self.length = len(self.content)

self.extension = self._get_extension()
if self.extension is None:
self.guessed_extension = self._try_guess_extension()

self.empty = True if self.length == 0 else False

if offsets is not None:
@@ -55,6 +60,9 @@ def _get_extension(self) -> Optional[str]:
return None

return by_dot[-1]

def _try_guess_extension(self) -> Optional[str]:
return FileTypeGuesser().guess(self.content)

def _calc_offsets(self) -> None:
line_breaks = [i.start() for i in re.finditer('\n', self.content)]
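
For context, a minimal standalone sketch of the fallback order this change introduces: the dot-extension wins when present, otherwise the content itself is probed. The helper guess_extension and the sample inputs are illustrative only (not part of the codebase), and tomllib assumes Python 3.11+.

import json
import tomllib
from typing import Optional

def guess_extension(path: str, content: str) -> Optional[str]:
    # Roughly File._get_extension(): take the text after the last dot, if any.
    name = path.rsplit('/', 1)[-1]
    if '.' in name:
        return name.rsplit('.', 1)[-1]
    # Roughly the new FileTypeGuesser.guess(): probe the content itself.
    for ext, probe in (('json', json.loads), ('toml', tomllib.loads)):
        try:
            probe(content)
            return ext
        except Exception:
            continue
    return None

print(guess_extension('settings.py', ''))            # 'py'   — extension wins
print(guess_extension('settings', '{"a": 1}'))        # 'json' — guessed from content
print(guess_extension('notes', 'free-form text'))     # None   — no probe matched
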
2 changes: 2 additions & 0 deletions deepsecrets/core/tokenizers/helpers/spot_improvements.py
@@ -42,6 +42,8 @@ def _curl_argstring_breakdown(
return [current_token]

new_parts = current_token.content.split(':')
if new_parts[0] == '' or new_parts[1] == '':
return [current_token]

final = []
for part in new_parts:
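
The new guard covers credential strings where splitting on ':' yields an empty half (for example a value like ':password01' or 'login01:'); in that case the token is kept whole instead of being broken into login/password parts. A small illustration with made-up values, not project code:

for value in ('login01:password01', ':password01', 'login01:'):
    parts = value.split(':')
    skip = parts[0] == '' or parts[1] == ''
    print(value, parts, 'kept as one token' if skip else 'split into parts')
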
25 changes: 21 additions & 4 deletions deepsecrets/core/tokenizers/lexer.py
@@ -60,14 +60,31 @@ def sanitize(self, content: str) -> Union[str, bool]:

return content

def _find_lexer_for_file(self, file: File):
lexer = None
if file.extension is not None:
try:
lexer = get_lexer_for_filename(file.path)
return lexer
except ClassNotFound:
pass

if file.guessed_extension is not None:
try:
lexer = get_lexer_for_filename(f'{file.path}.{file.guessed_extension}')
return lexer
except ClassNotFound:
pass

return lexer



def tokenize(self, file: File, post_filter=True) -> List[Token]:
self.token_stream = ''
# TODO: don't trust the extension, use 'file' utility ?
try:
self.lexer = get_lexer_for_filename(file.path)
except ClassNotFound:
self.lexer = None

self.lexer = self._find_lexer_for_file(file)
if not self.lexer:
return self.tokens

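The effect of the new helper, sketched outside the class (the path and guessed extension below are illustrative): pygments is asked for a lexer by the real filename first, and only if that fails is the same name retried with the guessed extension appended.

from pygments.lexers import get_lexer_for_filename
from pygments.util import ClassNotFound

path = 'configs/settings'      # no dot-extension, so the first lookup fails
guessed_extension = 'json'     # what FileTypeGuesser might return for the content

try:
    lexer = get_lexer_for_filename(path)
except ClassNotFound:
    lexer = None

if lexer is None and guessed_extension is not None:
    try:
        lexer = get_lexer_for_filename(f'{path}.{guessed_extension}')
    except ClassNotFound:
        lexer = None

print(lexer)                   # a JSON lexer if the second lookup succeeded
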
49 changes: 49 additions & 0 deletions deepsecrets/core/utils/guess_filetype.py
@@ -0,0 +1,49 @@
import json
import tomllib
from typing import Optional


class FileTypeGuesser:

def __init__(self) -> None:
self.probes = {
'json': self._is_json,
'toml': self._is_toml,
}

def guess(self, content: str) -> Optional[str]:
for ext, probe in self.probes.items():
if probe(content):
return ext

# TODO: Guesslang
'''
ml_guesser = Guess()
guess = ml_guesser.language_name(content)
if not guess:
return None
for ext, name in ml_guesser._extension_map.items():
if name == guess:
return ext
'''
return None



def _is_json(self, content: str):
try:
json.loads(content)
except Exception as e:
return False

return True

def _is_toml(self, content: str):
try:
tomllib.loads(content)
except Exception as e:
return False

return True
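
A short usage sketch of the new class (the inputs are made up for illustration). Only JSON and TOML probes exist so far, so shell content still falls through to None and relies on the lexer-side changes above:

from deepsecrets.core.utils.guess_filetype import FileTypeGuesser

guesser = FileTypeGuesser()
print(guesser.guess('{"api_key": "value"}'))        # 'json'
print(guesser.guess('[server]\nport = 8080\n'))     # 'toml'
print(guesser.guess('#!/bin/bash\necho hi\n'))      # None — no shell probe yet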

3 changes: 3 additions & 0 deletions tests/core/engines/regex/test_regex.py
@@ -7,6 +7,7 @@
from deepsecrets.core.model.finding import Finding, FindingMerger, FindingResponse
from deepsecrets.core.rulesets.regex import RegexRulesetBuilder
from deepsecrets.core.tokenizers.full_content import FullContentTokenizer
from deepsecrets.core.tokenizers.lexer import LexerTokenizer
from deepsecrets.core.utils.fs import get_path_inside_package


@@ -61,6 +62,8 @@ def test_1(file: File, regex_engine: RegexEngine):
def test_extless(file_extless: File, regex_engine: RegexEngine):
findings: List[Finding] = []
tokens = FullContentTokenizer().tokenize(file_extless)
tokens_lex = LexerTokenizer(deep_token_inspection=True).tokenize(file_extless)

for token in tokens:
token_findings = regex_engine.search(token)
for finding in token_findings:
26 changes: 26 additions & 0 deletions tests/core/engines/semantic/test_semantic.py
@@ -1,7 +1,9 @@
from typing import List
import pytest

from deepsecrets.core.engines.semantic import SemanticEngine
from deepsecrets.core.model.file import File
from deepsecrets.core.model.finding import Finding, FindingMerger
from deepsecrets.core.model.token import SemanticType
from deepsecrets.core.tokenizers.lexer import LexerTokenizer

@@ -26,6 +28,10 @@ def file_toml_2() -> File:
path = 'tests/fixtures/2.toml'
return File(path=path, relative_path=path)

@pytest.fixture(scope='module')
def file_sh_2() -> File:
path = 'tests/fixtures/2.sh'
return File(path=path, relative_path=path)


def test_1_semantic_engine(file: File):
@@ -95,3 +101,23 @@ def test_4_semantic_engine(file_toml_2: File):

assert len(findings) == 1
assert findings[0].rules[0].name == 'Var naming'


def test_5_semantic_engine(file_sh_2: File):
tokens = LexerTokenizer(deep_token_inspection=True).tokenize(file_sh_2)
assert len(tokens) == 16

engine = SemanticEngine(subengine=None)

findings: List[Finding] = []
for token in tokens:
findings.extend(engine.search(token))

for finding in findings:
finding.map_on_file(file=file_sh_2, relative_start=finding.start_pos)
finding.choose_final_rule()


findings = FindingMerger(findings).merge()
assert len(findings) == 1
assert findings[0].final_rule.name == 'Dangerous condition'
6 changes: 6 additions & 0 deletions tests/fixtures/2.sh
@@ -0,0 +1,6 @@
#!/bin/bash

curl -u 'login01:password01' -s "https://example.com/test"
# should not be matched as a variable
curl -u 'login01:$password_var' -s "https://example.com/test"
curl -u "qauser:$password" $URL > $FILENAME
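
To see what the lexer-side workarounds have to cope with, a fixture line can be fed straight to the pygments shell lexer; the exact token stream varies between pygments versions, and the commit title refers to quirks in how this lexer breaks such lines apart. A small inspection snippet (not part of the commit):

from pygments.lexers import BashLexer

line = 'curl -u "qauser:$password" $URL > $FILENAME'
for token_type, value in BashLexer().get_tokens(line):
    print(token_type, repr(value))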
