Skip to content

Commit 5cca6f3

Browse files
committed
[SP-2587] Add directory simhash, modify concatenated names to remove extensions
1 parent 5ad4793 commit 5cca6f3

File tree

2 files changed

+40
-21
lines changed

2 files changed

+40
-21
lines changed

src/scanoss/file_filters.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
import os
2626
import sys
2727
from pathlib import Path
28-
from typing import List
28+
from typing import List, Optional
2929

3030
from pathspec import GitIgnoreSpec
3131

@@ -511,7 +511,7 @@ def get_filtered_files_from_folder(self, root: str) -> List[str]:
511511
# Now filter the files and return the reduced list
512512
return self.get_filtered_files_from_files(all_files, str(root_path))
513513

514-
def get_filtered_files_from_files(self, files: List[str], scan_root: str = None) -> List[str]:
514+
def get_filtered_files_from_files(self, files: List[str], scan_root: Optional[str] = None) -> List[str]:
515515
"""
516516
Retrieve a list of files to scan or fingerprint from a given list of files based on filter settings.
517517
@@ -615,8 +615,13 @@ def _get_operation_patterns(self, operation_type: str) -> List[str]:
615615
# Default patterns for skipping directories
616616
if not self.all_folders:
617617
DEFAULT_SKIPPED_DIR_LIST = DEFAULT_SKIPPED_DIRS_HFH if self.is_folder_hashing_scan else DEFAULT_SKIPPED_DIRS
618+
DEFAULT_SKIPPED_DIR_EXT_LIST = (
619+
DEFAULT_SKIPPED_DIR_EXT_HFH if self.is_folder_hashing_scan else DEFAULT_SKIPPED_DIR_EXT
620+
)
618621
for dir_name in DEFAULT_SKIPPED_DIR_LIST:
619622
patterns.append(f'{dir_name}/')
623+
for dir_extension in DEFAULT_SKIPPED_DIR_EXT_LIST:
624+
patterns.append(f'*{dir_extension}/')
620625

621626
# Custom patterns added in SCANOSS settings file
622627
if self.scanoss_settings:

src/scanoss/scanners/folder_hasher.py

Lines changed: 33 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ class DirectoryFile:
3535
Represents a file in the directory tree for folder hashing.
3636
"""
3737

38-
def __init__(self, path: str, key: bytes, key_str: str):
38+
def __init__(self, path: str, key: List[bytes], key_str: str):
3939
self.path = path
4040
self.key = key
4141
self.key_str = key_str
@@ -77,7 +77,7 @@ class FolderHasher:
7777
def __init__(
7878
self,
7979
scan_dir: str,
80-
config: Optional[FolderHasherConfig] = None,
80+
config: FolderHasherConfig,
8181
scanoss_settings: Optional[ScanossSettings] = None,
8282
):
8383
self.base = ScanossBase(
@@ -199,6 +199,7 @@ def _hash_calc_from_node(self, node: DirectoryNode) -> dict:
199199
'path_id': node.path,
200200
'sim_hash_names': f'{hash_data["name_hash"]:02x}' if hash_data['name_hash'] is not None else None,
201201
'sim_hash_content': f'{hash_data["content_hash"]:02x}' if hash_data['content_hash'] is not None else None,
202+
'sim_hash_dir': f'{hash_data["dir_hash"]:02x}' if hash_data['dir_hash'] is not None else None,
202203
'children': [self._hash_calc_from_node(child) for child in node.children.values()],
203204
}
204205

@@ -218,44 +219,57 @@ def _hash_calc(self, node: DirectoryNode) -> dict:
218219
dict: A dictionary with 'name_hash', 'content_hash' and 'dir_hash' keys (all None when the node has too few files or too short a concatenated name).
219220
"""
220221
processed_hashes = set()
222+
unique_file_names = set()
223+
unique_directories = set()
221224
file_hashes = []
222225
selected_names = []
223226

224227
for file in node.files:
225228
key_str = file.key_str
226229
if key_str in processed_hashes:
227230
continue
228-
processed_hashes.add(key_str)
229231

230-
selected_names.append(os.path.basename(file.path))
232+
file_name = os.path.basename(file.path)
233+
file_name_without_extension, _ = os.path.splitext(file_name)
234+
current_directory = os.path.dirname(file.path)
235+
236+
last_directory = os.path.basename(current_directory)
231237

232-
file_key = bytes(file.key)
233-
file_hashes.append(file_key)
238+
if last_directory == '':
239+
last_directory = os.path.basename(os.getcwd())
240+
241+
processed_hashes.add(key_str)
242+
unique_file_names.add(file_name_without_extension)
243+
unique_directories.add(last_directory)
244+
selected_names.append(file_name)
245+
file_hashes.append(file.key)
234246

235247
if len(selected_names) < MINIMUM_FILE_COUNT:
236-
return {
237-
'name_hash': None,
238-
'content_hash': None,
239-
}
248+
return {'name_hash': None, 'content_hash': None, 'dir_hash': None}
240249

241250
selected_names.sort()
242251
concatenated_names = ''.join(selected_names)
243252

244253
if len(concatenated_names.encode('utf-8')) < MINIMUM_CONCATENATED_NAME_LENGTH:
245-
return {
246-
'name_hash': None,
247-
'content_hash': None,
248-
}
254+
return {'name_hash': None, 'content_hash': None, 'dir_hash': None}
255+
256+
# Sort the unique file names (extensions removed) alphabetically, then join them with single spaces
257+
unique_file_names_list = list(unique_file_names)
258+
unique_file_names_list.sort()
259+
concatenated_names = ' '.join(unique_file_names_list)
260+
261+
# Do the same for the unique directory names: sort them alphabetically, then join them with single spaces
262+
unique_directories_list = list(unique_directories)
263+
unique_directories_list.sort()
264+
concatenated_directories = ' '.join(unique_directories_list)
249265

250266
names_simhash = simhash(WordFeatureSet(concatenated_names.encode('utf-8')))
267+
dir_simhash = simhash(WordFeatureSet(concatenated_directories.encode('utf-8')))
251268
content_simhash = fingerprint(vectorize_bytes(file_hashes))
252269

253-
return {
254-
'name_hash': names_simhash,
255-
'content_hash': content_simhash,
256-
}
270+
return {'name_hash': names_simhash, 'content_hash': content_simhash, 'dir_hash': dir_simhash}
257271

258-
def present(self, output_format: str = None, output_file: str = None):
272+
def present(self, output_format: Optional[str] = None, output_file: Optional[str] = None):
259273
"""Present the hashed tree in the selected format"""
260274
self.presenter.present(output_format=output_format, output_file=output_file)
261275

0 commit comments

Comments
 (0)