From f394255f9e198fa59487042ee19f543e6017cf3e Mon Sep 17 00:00:00 2001 From: Sander Maijers <3374183+sanmai-NL@users.noreply.github.com> Date: Thu, 18 May 2023 12:57:04 +0200 Subject: [PATCH 1/2] Only check for `@generated`-like markers in file header Check a large but limited buffer from the start of the file. Do not assume UTF-8 encoding or decode the content as text at all; search for the marker byte strings instead. First check for the marker with the highest priority, to short-circuit return when it is found. This prevents performance problems with large files. --- CHANGELOG.md | 1 + megalinter/utils.py | 9 +++++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2238225fbc3..1fa30293268 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ Note: Can be used with `oxsecurity/megalinter@beta` in your GitHub Action mega-l - Core - mega-linter-runner: Convert to ES6 and upgrade npm dependencies - Add rust to checkov as it is a required dependency (to do that, allow to define empty string packages as cargo dependencies in descriptors) + - Optimize `@generated` marker scanning ([#2654](https://github.com/oxsecurity/megalinter/pull/2654)) - Media - [Achieve Code Consistency: MegaLinter Integration in Azure DevOps](https://techcommunity.microsoft.com/t5/azure-devops-blog/achieve-code-consistency-megalinter-integration-in-azure-devops/ba-p/3939448), by [Don Koning](https://techcommunity.microsoft.com/t5/user/viewprofilepage/user-id/2039143#profile) on [Microsoft Tech Community](https://techcommunity.microsoft.com/) diff --git a/megalinter/utils.py b/megalinter/utils.py index 639abceece4..fbf6ee40b68 100644 --- a/megalinter/utils.py +++ b/megalinter/utils.py @@ -14,6 +14,8 @@ from megalinter import config from megalinter.constants import DEFAULT_DOCKER_WORKSPACE_DIR +SIZE_MAX_SOURCEFILEHEADER = 1024 + REPO_HOME_DEFAULT = ( DEFAULT_DOCKER_WORKSPACE_DIR if os.path.isdir(DEFAULT_DOCKER_WORKSPACE_DIR) @@ -278,10 +280,9 @@ def 
file_contains(file_name: str, regex_object: Optional[Pattern[str]]) -> bool: def file_is_generated(file_name: str) -> bool: - with open(file_name, "r", encoding="utf-8", errors="ignore") as f: - content = f.read() - is_generated = "@generated" in content and "@not-generated" not in content - return is_generated + with open(file_name, "rb") as f: + content = f.read(SIZE_MAX_SOURCEFILEHEADER) + return b"@generated" in content and b"@not-generated" not in content def decode_utf8(stdout): From c44f2993ef9060988426be1d334f9fae8e9216a0 Mon Sep 17 00:00:00 2001 From: Sander Maijers <3374183+sanmai-NL@users.noreply.github.com> Date: Wed, 4 Oct 2023 14:06:21 +0200 Subject: [PATCH 2/2] Fix CSpell linter fault --- .cspell.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.cspell.json b/.cspell.json index 6837e725c29..68dfda54e3c 100644 --- a/.cspell.json +++ b/.cspell.json @@ -405,6 +405,7 @@ "SHFMT", "SNAKEFMT", "SOQL", + "SOURCEFILEHEADER", "SOURCEPATHS", "SQLFLUFF", "STDLIB", @@ -1465,4 +1466,4 @@ "zaach", "zricethezav" ] -} \ No newline at end of file +}