diff --git a/CHANGELOG.md b/CHANGELOG.md index 904ebd61..a62ff75e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - Upcoming changes... +## [1.36.0] - 2025-10-08 +### Added +- Add `--recursive-threshold` and `--min-accepted-score` arguments to `folder-scan` command +- Add `--depth` argument to `folder-scan` and `folder-hash` commands + ## [1.35.0] - 2025-10-07 ### Modified - Use gRPC instead of REST for API calls @@ -677,3 +682,5 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 [1.32.0]: https://github.com/scanoss/scanoss.py/compare/v1.31.5...v1.32.0 [1.33.0]: https://github.com/scanoss/scanoss.py/compare/v1.32.0...v1.33.0 [1.34.0]: https://github.com/scanoss/scanoss.py/compare/v1.33.0...v1.34.0 +[1.35.0]: https://github.com/scanoss/scanoss.py/compare/v1.34.0...v1.35.0 +[1.36.0]: https://github.com/scanoss/scanoss.py/compare/v1.35.0...v1.36.0 diff --git a/src/scanoss/__init__.py b/src/scanoss/__init__.py index 805b3eea..de19d8b4 100644 --- a/src/scanoss/__init__.py +++ b/src/scanoss/__init__.py @@ -22,4 +22,4 @@ THE SOFTWARE. 
""" -__version__ = '1.35.0' +__version__ = '1.36.0' diff --git a/src/scanoss/api/common/v2/scanoss_common_pb2_grpc.py b/src/scanoss/api/common/v2/scanoss_common_pb2_grpc.py index addf36f2..13118697 100644 --- a/src/scanoss/api/common/v2/scanoss_common_pb2_grpc.py +++ b/src/scanoss/api/common/v2/scanoss_common_pb2_grpc.py @@ -3,6 +3,7 @@ import warnings import grpc +import warnings GRPC_GENERATED_VERSION = '1.73.1' GRPC_VERSION = grpc.__version__ diff --git a/src/scanoss/cli.py b/src/scanoss/cli.py index b5c2a0b8..b4b3a793 100644 --- a/src/scanoss/cli.py +++ b/src/scanoss/cli.py @@ -59,7 +59,10 @@ from .components import Components from .constants import ( DEFAULT_API_TIMEOUT, + DEFAULT_HFH_DEPTH, + DEFAULT_HFH_MIN_ACCEPTED_SCORE, DEFAULT_HFH_RANK_THRESHOLD, + DEFAULT_HFH_RECURSIVE_THRESHOLD, DEFAULT_POST_SIZE, DEFAULT_RETRY, DEFAULT_TIMEOUT, @@ -869,6 +872,27 @@ def setup_args() -> None: # noqa: PLR0912, PLR0915 help='Filter results to only show those with rank value at or below this threshold (e.g., --rank-threshold 3 ' 'returns results with rank 1, 2, or 3). 
Lower rank values indicate higher quality matches.', ) + p_folder_scan.add_argument( + '--depth', + type=int, + default=DEFAULT_HFH_DEPTH, + help=f'Defines how deep to scan the root directory (optional - default {DEFAULT_HFH_DEPTH})', + ) + p_folder_scan.add_argument( + '--recursive-threshold', + type=float, + default=DEFAULT_HFH_RECURSIVE_THRESHOLD, + help=f'Minimum score threshold to consider a match (optional - default: {DEFAULT_HFH_RECURSIVE_THRESHOLD})', + ) + p_folder_scan.add_argument( + '--min-accepted-score', + type=float, + default=DEFAULT_HFH_MIN_ACCEPTED_SCORE, + help=( + 'Only show results with a score at or above this threshold ' + f'(optional - default: {DEFAULT_HFH_MIN_ACCEPTED_SCORE})' + ), + ) p_folder_scan.set_defaults(func=folder_hashing_scan) # Sub-command: folder-hash @@ -887,6 +911,12 @@ def setup_args() -> None: # noqa: PLR0912, PLR0915 default='json', help='Result output format (optional - default: json)', ) + p_folder_hash.add_argument( + '--depth', + type=int, + default=DEFAULT_HFH_DEPTH, + help=f'Defines how deep to hash the root directory (optional - default {DEFAULT_HFH_DEPTH})', + ) p_folder_hash.set_defaults(func=folder_hash) # Output options @@ -2456,6 +2486,9 @@ def folder_hashing_scan(parser, args): client=client, scanoss_settings=scanoss_settings, rank_threshold=args.rank_threshold, + depth=args.depth, + recursive_threshold=args.recursive_threshold, + min_accepted_score=args.min_accepted_score, ) if scanner.scan(): @@ -2489,6 +2522,7 @@ def folder_hash(parser, args): scan_dir=args.scan_dir, config=folder_hasher_config, scanoss_settings=scanoss_settings, + depth=args.depth, ) folder_hasher.hash_directory(args.scan_dir) diff --git a/src/scanoss/constants.py b/src/scanoss/constants.py index 92fc15b7..989f2008 100644 --- a/src/scanoss/constants.py +++ b/src/scanoss/constants.py @@ -13,4 +13,7 @@ DEFAULT_API_TIMEOUT = 600 -DEFAULT_HFH_RANK_THRESHOLD = 5 \ No newline at end of file +DEFAULT_HFH_RANK_THRESHOLD = 5 +DEFAULT_HFH_DEPTH = 1 
+DEFAULT_HFH_RECURSIVE_THRESHOLD = 0.8 +DEFAULT_HFH_MIN_ACCEPTED_SCORE = 0.15 diff --git a/src/scanoss/file_filters.py b/src/scanoss/file_filters.py index cb8298a8..33595374 100644 --- a/src/scanoss/file_filters.py +++ b/src/scanoss/file_filters.py @@ -269,162 +269,6 @@ 'sqlite3', } -# TODO: For hfh add the .gitignore patterns -DEFAULT_SKIPPED_EXT_HFH = { - '.1', - '.2', - '.3', - '.4', - '.5', - '.6', - '.7', - '.8', - '.9', - '.ac', - '.adoc', - '.am', - '.asciidoc', - '.bmp', - '.build', - '.cfg', - '.chm', - '.class', - '.cmake', - '.cnf', - '.conf', - '.config', - '.contributors', - '.copying', - '.crt', - '.csproj', - '.css', - '.csv', - '.dat', - '.data', - '.dtd', - '.dts', - '.iws', - '.c9', - '.c9revisions', - '.dtsi', - '.dump', - '.eot', - '.eps', - '.geojson', - '.gif', - '.glif', - '.gmo', - '.guess', - '.hex', - '.htm', - '.html', - '.ico', - '.iml', - '.in', - '.inc', - '.info', - '.ini', - '.ipynb', - '.jpeg', - '.jpg', - '.json', - '.jsonld', - '.lock', - '.log', - '.m4', - '.map', - '.md5', - '.meta', - '.mk', - '.mxml', - '.o', - '.otf', - '.out', - '.pbtxt', - '.pdf', - '.pem', - '.phtml', - '.plist', - '.png', - '.prefs', - '.properties', - '.pyc', - '.qdoc', - '.result', - '.rgb', - '.rst', - '.scss', - '.sha', - '.sha1', - '.sha2', - '.sha256', - '.sln', - '.spec', - '.sub', - '.svg', - '.svn-base', - '.tab', - '.template', - '.test', - '.tex', - '.tiff', - '.ttf', - '.txt', - '.utf-8', - '.vim', - '.wav', - '.woff', - '.woff2', - '.xht', - '.xhtml', - '.xml', - '.xpm', - '.xsd', - '.xul', - '.yaml', - '.yml', - '.wfp', - '.editorconfig', - '.dotcover', - '.pid', - '.lcov', - '.egg', - '.manifest', - '.cache', - '.coverage', - '.cover', - '.gem', - '.lst', - '.pickle', - '.pdb', - '.gml', - '.pot', - '.plt', - '.whml', - '.pom', - '.smtml', - '.min.js', - '.mf', - '.base64', - '.s', - '.diff', - '.patch', - '.rules', - # File endings - '-doc', - 'config', - 'news', - 'readme', - 'swiftdoc', - 'texidoc', - 'todo', - 'version', - 'ignore', - 
'manifest', - 'sqlite', - 'sqlite3', -} - class FileFilters(ScanossBase): """ @@ -707,9 +551,8 @@ def _should_skip_file(self, file_rel_path: str) -> bool: # noqa: PLR0911 bool: True if file should be skipped, False otherwise """ file_name = os.path.basename(file_rel_path) - + DEFAULT_SKIPPED_EXT_LIST = {} if self.is_folder_hashing_scan else DEFAULT_SKIPPED_EXT DEFAULT_SKIPPED_FILES_LIST = DEFAULT_SKIPPED_FILES_HFH if self.is_folder_hashing_scan else DEFAULT_SKIPPED_FILES - DEFAULT_SKIPPED_EXT_LIST = DEFAULT_SKIPPED_EXT_HFH if self.is_folder_hashing_scan else DEFAULT_SKIPPED_EXT if not self.hidden_files_folders and file_name.startswith('.'): self.print_debug(f'Skipping file: {file_rel_path} (hidden file)') diff --git a/src/scanoss/scanners/folder_hasher.py b/src/scanoss/scanners/folder_hasher.py index ad1bad32..2e516780 100644 --- a/src/scanoss/scanners/folder_hasher.py +++ b/src/scanoss/scanners/folder_hasher.py @@ -6,6 +6,7 @@ from progress.bar import Bar +from scanoss.constants import DEFAULT_HFH_DEPTH from scanoss.file_filters import FileFilters from scanoss.scanoss_settings import ScanossSettings from scanoss.scanossbase import ScanossBase @@ -15,8 +16,6 @@ MINIMUM_FILE_COUNT = 8 MINIMUM_CONCATENATED_NAME_LENGTH = 32 -MAXIMUM_FILE_NAME_LENGTH = 32 - class DirectoryNode: """ @@ -72,6 +71,12 @@ class FolderHasher: It builds a directory tree (DirectoryNode) and computes the associated hash data for the folder. + + Args: + scan_dir (str): The directory to be hashed. + config (FolderHasherConfig): Configuration parameters for the folder hasher. + scanoss_settings (Optional[ScanossSettings]): Optional settings for Scanoss. + depth (int): How many levels to hash from the root directory (default: 1). 
""" def __init__( @@ -79,6 +84,7 @@ def __init__( scan_dir: str, config: FolderHasherConfig, scanoss_settings: Optional[ScanossSettings] = None, + depth: int = DEFAULT_HFH_DEPTH, ): self.base = ScanossBase( debug=config.debug, @@ -101,6 +107,7 @@ def __init__( self.scan_dir = scan_dir self.tree = None + self.depth = depth def hash_directory(self, path: str) -> dict: """ @@ -123,7 +130,10 @@ def hash_directory(self, path: str) -> dict: return tree - def _build_root_node(self, path: str) -> DirectoryNode: + def _build_root_node( + self, + path: str, + ) -> DirectoryNode: """ Build a directory tree from the given path with file information. @@ -140,7 +150,7 @@ def _build_root_node(self, path: str) -> DirectoryNode: root_node = DirectoryNode(str(root)) all_files = [ - f for f in root.rglob('*') if f.is_file() and len(f.name.encode('utf-8')) <= MAXIMUM_FILE_NAME_LENGTH + f for f in root.rglob('*') if f.is_file() ] filtered_files = self.file_filters.get_filtered_files_from_files(all_files, str(root)) @@ -180,7 +190,7 @@ def _build_root_node(self, path: str) -> DirectoryNode: bar.finish() return root_node - def _hash_calc_from_node(self, node: DirectoryNode) -> dict: + def _hash_calc_from_node(self, node: DirectoryNode, current_depth: int = 1) -> dict: """ Recursively compute folder hash data for a directory node. @@ -189,12 +199,13 @@ def _hash_calc_from_node(self, node: DirectoryNode) -> dict: Args: node (DirectoryNode): The directory node to compute the hash for. + current_depth (int): The current depth level (1-based, root is depth 1). Returns: dict: The computed hash data for the node. 
""" hash_data = self._hash_calc(node) - + # Safely calculate relative path try: node_path = Path(node.path).resolve() @@ -204,13 +215,18 @@ def _hash_calc_from_node(self, node: DirectoryNode) -> dict: # If relative_to fails, use the node path as is or a fallback rel_path = Path(node.path).name if node.path else Path('.') + # Only process children if we haven't reached the depth limit + children = [] + if current_depth < self.depth: + children = [self._hash_calc_from_node(child, current_depth + 1) for child in node.children.values()] + return { 'path_id': str(rel_path), 'sim_hash_names': f'{hash_data["name_hash"]:02x}' if hash_data['name_hash'] is not None else None, 'sim_hash_content': f'{hash_data["content_hash"]:02x}' if hash_data['content_hash'] is not None else None, 'sim_hash_dir_names': f'{hash_data["dir_hash"]:02x}' if hash_data['dir_hash'] is not None else None, 'lang_extensions': hash_data['lang_extensions'], - 'children': [self._hash_calc_from_node(child) for child in node.children.values()], + 'children': children, } def _hash_calc(self, node: DirectoryNode) -> dict: @@ -237,8 +253,6 @@ def _hash_calc(self, node: DirectoryNode) -> dict: for file in node.files: key_str = file.key_str - if key_str in processed_hashes: - continue file_name = os.path.basename(file.path) diff --git a/src/scanoss/scanners/scanner_hfh.py b/src/scanoss/scanners/scanner_hfh.py index 9f4df38c..2418d4db 100644 --- a/src/scanoss/scanners/scanner_hfh.py +++ b/src/scanoss/scanners/scanner_hfh.py @@ -29,7 +29,12 @@ from progress.spinner import Spinner -from scanoss.constants import DEFAULT_HFH_RANK_THRESHOLD +from scanoss.constants import ( + DEFAULT_HFH_DEPTH, + DEFAULT_HFH_MIN_ACCEPTED_SCORE, + DEFAULT_HFH_RANK_THRESHOLD, + DEFAULT_HFH_RECURSIVE_THRESHOLD, +) from scanoss.cyclonedx import CycloneDx from scanoss.file_filters import FileFilters from scanoss.scanners.folder_hasher import FolderHasher @@ -48,13 +53,16 @@ class ScannerHFH: and calculates simhash values based on file names 
and content to detect folder-level similarities. """ - def __init__( + def __init__( # noqa: PLR0913 self, scan_dir: str, config: ScannerConfig, client: Optional[ScanossGrpc] = None, scanoss_settings: Optional[ScanossSettings] = None, rank_threshold: int = DEFAULT_HFH_RANK_THRESHOLD, + depth: int = DEFAULT_HFH_DEPTH, + recursive_threshold: float = DEFAULT_HFH_RECURSIVE_THRESHOLD, + min_accepted_score: float = DEFAULT_HFH_MIN_ACCEPTED_SCORE, ): """ Initialize the ScannerHFH. @@ -65,6 +73,9 @@ def __init__( client (ScanossGrpc): gRPC client for communicating with the scanning service. scanoss_settings (Optional[ScanossSettings]): Optional settings for Scanoss. rank_threshold (int): Get results with rank below this threshold (default: 5). + depth (int): How many levels to scan (default: 1). + recursive_threshold (float): Minimum score threshold to consider a match (default: 0.8). + min_accepted_score (float): Only show results with a score at or above this threshold (default: 0.15). """ self.base = ScanossBase( debug=config.debug, @@ -87,12 +98,15 @@ def __init__( scan_dir=scan_dir, config=config, scanoss_settings=scanoss_settings, + depth=depth, ) self.scan_dir = scan_dir self.client = client self.scan_results = None self.rank_threshold = rank_threshold + self.recursive_threshold = recursive_threshold + self.min_accepted_score = min_accepted_score def scan(self) -> Optional[Dict]: """ @@ -102,8 +116,10 @@ def scan(self) -> Optional[Dict]: Optional[Dict]: The folder hash response from the gRPC client, or None if an error occurs. 
""" hfh_request = { - 'root': self.folder_hasher.hash_directory(self.scan_dir), + 'root': self.folder_hasher.hash_directory(path=self.scan_dir), 'rank_threshold': self.rank_threshold, + 'recursive_threshold': self.recursive_threshold, + 'min_accepted_score': self.min_accepted_score, } spinner = Spinner('Scanning folder...') @@ -193,7 +209,7 @@ def _format_cyclonedx_output(self) -> str: # noqa: PLR0911 } ] } - + get_vulnerabilities_json_request = { 'purls': [{'purl': purl, 'requirement': best_match_version['version']}], } @@ -210,10 +226,10 @@ def _format_cyclonedx_output(self) -> str: # noqa: PLR0911 error_msg = 'ERROR: Failed to produce CycloneDX output' self.base.print_stderr(error_msg) return None - + if vulnerabilities: cdx_output = cdx.append_vulnerabilities(cdx_output, vulnerabilities, purl) - + return json.dumps(cdx_output, indent=2) except Exception as e: self.base.print_stderr(f'ERROR: Failed to get license information: {e}')