class DirectoryFile:
    """
    Represents a file in the directory tree for folder hashing.
    """

    def __init__(self, path: str, key: List[bytes], key_str: str) -> None:
        """
        Store the data that identifies a single file.

        Args:
            path (str): Path of the file.
            key (List[bytes]): Key material of the file.
            key_str (str): String key of the file (presumably a textual form
                of ``key`` - confirm against the code that builds these objects).
        """
        self.path = path
        self.key = key
        self.key_str = key_str
@@ -77,7 +77,7 @@ class FolderHasher:
7777 def __init__ (
7878 self ,
7979 scan_dir : str ,
80- config : Optional [ FolderHasherConfig ] = None ,
80+ config : FolderHasherConfig ,
8181 scanoss_settings : Optional [ScanossSettings ] = None ,
8282 ):
8383 self .base = ScanossBase (
@@ -199,6 +199,7 @@ def _hash_calc_from_node(self, node: DirectoryNode) -> dict:
199199 'path_id' : node .path ,
200200 'sim_hash_names' : f'{ hash_data ["name_hash" ]:02x} ' if hash_data ['name_hash' ] is not None else None ,
201201 'sim_hash_content' : f'{ hash_data ["content_hash" ]:02x} ' if hash_data ['content_hash' ] is not None else None ,
202+ 'sim_hash_dir' : f'{ hash_data ["dir_hash" ]:02x} ' if hash_data ['dir_hash' ] is not None else None ,
202203 'children' : [self ._hash_calc_from_node (child ) for child in node .children .values ()],
203204 }
204205
def _hash_calc(self, node: DirectoryNode) -> dict:
    """
    Compute the name, content and directory hashes for the files in ``node.files``.

    Files whose ``key_str`` has already been seen are skipped, so duplicated
    entries contribute only once to every hash.

    Args:
        node (DirectoryNode): The directory node whose ``files`` are hashed.

    Returns:
        dict: A dictionary with 'name_hash', 'content_hash' and 'dir_hash' keys.
            All three values are None when fewer than MINIMUM_FILE_COUNT distinct
            files were found, or when their concatenated file names take fewer
            than MINIMUM_CONCATENATED_NAME_LENGTH bytes once UTF-8 encoded.
    """
    no_hashes = {'name_hash': None, 'content_hash': None, 'dir_hash': None}

    processed_hashes = set()
    unique_file_names = set()
    unique_directories = set()
    file_hashes = []
    selected_names = []

    for file in node.files:
        key_str = file.key_str
        if key_str in processed_hashes:
            continue
        processed_hashes.add(key_str)

        file_name = os.path.basename(file.path)
        file_name_without_extension, _ = os.path.splitext(file_name)

        last_directory = os.path.basename(os.path.dirname(file.path))
        if last_directory == '':
            # NOTE(review): for a path without a directory component this falls
            # back to the name of the process working directory, so the directory
            # hash then depends on where the tool is run - confirm this is intended.
            last_directory = os.path.basename(os.getcwd())

        unique_file_names.add(file_name_without_extension)
        unique_directories.add(last_directory)
        selected_names.append(file_name)
        file_hashes.append(file.key)

    if len(selected_names) < MINIMUM_FILE_COUNT:
        return no_hashes

    # The length gate uses the full file names (extensions included); the byte
    # length of the concatenation does not depend on ordering, so no sort is needed.
    full_names_length = len(''.join(selected_names).encode('utf-8'))
    if full_names_length < MINIMUM_CONCATENATED_NAME_LENGTH:
        return no_hashes

    # Hash inputs: unique names (extensions stripped) and unique directory names,
    # each sorted alphabetically and joined with single spaces.
    concatenated_names = ' '.join(sorted(unique_file_names))
    concatenated_directories = ' '.join(sorted(unique_directories))

    names_simhash = simhash(WordFeatureSet(concatenated_names.encode('utf-8')))
    dir_simhash = simhash(WordFeatureSet(concatenated_directories.encode('utf-8')))
    content_simhash = fingerprint(vectorize_bytes(file_hashes))

    return {'name_hash': names_simhash, 'content_hash': content_simhash, 'dir_hash': dir_simhash}
257271
def present(self, output_format: Optional[str] = None, output_file: Optional[str] = None) -> None:
    """
    Present the hashed tree in the selected format.

    The call is delegated unchanged to ``self.presenter.present``.

    Args:
        output_format (Optional[str]): Output format forwarded to the presenter.
        output_file (Optional[str]): Output file forwarded to the presenter.
    """
    self.presenter.present(output_format=output_format, output_file=output_file)
261275
0 commit comments