4646 'copying.lib' ,
4747 'makefile' ,
4848}
49+
# Exact file names to skip when the operation is a folder hashing (HFH) scan.
# _should_skip_file() compares these against the lower-cased base name of each
# file, so every entry must be written in lower case.
DEFAULT_SKIPPED_FILES_HFH = {
    'gradlew',
    'gradlew.bat',
    'mvnw',
    'mvnw.cmd',
    'gradle-wrapper.jar',
    'maven-wrapper.jar',
    'thumbs.db',
    'babel.config.js',
}
60+
61+
4962# Folders to skip
5063DEFAULT_SKIPPED_DIRS = {
5164 'nbproject' ,
6679 'test' ,
6780}
6881
# Folder names to skip when the operation is a folder hashing (HFH) scan.
# _get_operation_patterns() turns each entry into a '<name>/' skip pattern
# unless all folders were explicitly requested.
DEFAULT_SKIPPED_DIRS_HFH = {
    'nbproject',
    'nbbuild',
    'nbdist',
    '__pycache__',
    'venv',
    '_yardoc',
    'eggs',
    'wheels',
    'htmlcov',
    '__pypackages__',
    'example',
    'examples',
}
96+
6997
7098# Folder endings to skip
7199DEFAULT_SKIPPED_DIR_EXT = {'.egg-info' }
# Folder endings to skip for folder hashing (HFH) scans.
# NOTE(review): the consumer of this set is not visible here; presumably it is
# matched as a suffix of the folder name like DEFAULT_SKIPPED_DIR_EXT - confirm.
DEFAULT_SKIPPED_DIR_EXT_HFH = {'.egg-info'}
101+
72102# File extensions to skip
73103DEFAULT_SKIPPED_EXT = {
74104 '.1' ,
243273 'sqlite3' ,
244274}
245275
# File-name endings to skip when the operation is a folder hashing (HFH) scan.
# _should_skip_file() tests each entry with str.endswith() against the
# lower-cased file name, so entries must be lower case and need not start
# with a dot.
# TODO: for folder hashing (HFH), also add the .gitignore patterns
DEFAULT_SKIPPED_EXT_HFH = {
    '.1',
    '.2',
    '.3',
    '.4',
    '.5',
    '.6',
    '.7',
    '.8',
    '.9',
    '.ac',
    '.adoc',
    '.am',
    '.asciidoc',
    '.bmp',
    '.build',
    '.cfg',
    '.chm',
    '.class',
    '.cmake',
    '.cnf',
    '.conf',
    '.config',
    '.contributors',
    '.copying',
    '.crt',
    '.csproj',
    '.css',
    '.csv',
    '.dat',
    '.data',
    '.dtd',
    '.dts',
    '.iws',
    '.c9',
    '.c9revisions',
    '.dtsi',
    '.dump',
    '.eot',
    '.eps',
    '.geojson',
    '.gif',
    '.glif',
    '.gmo',
    '.guess',
    '.hex',
    '.htm',
    '.html',
    '.ico',
    '.iml',
    '.in',
    '.inc',
    '.info',
    '.ini',
    '.ipynb',
    '.jpeg',
    '.jpg',
    '.json',
    '.jsonld',
    '.lock',
    '.log',
    '.m4',
    '.map',
    '.md5',
    '.meta',
    '.mk',
    '.mxml',
    '.o',
    '.otf',
    '.out',
    '.pbtxt',
    '.pdf',
    '.pem',
    '.phtml',
    '.plist',
    '.png',
    '.prefs',
    '.properties',
    '.pyc',
    '.qdoc',
    '.result',
    '.rgb',
    '.rst',
    '.scss',
    '.sha',
    '.sha1',
    '.sha2',
    '.sha256',
    '.sln',
    '.spec',
    '.sub',
    '.svg',
    '.svn-base',
    '.tab',
    '.template',
    '.test',
    '.tex',
    '.tiff',
    '.ttf',
    '.txt',
    '.utf-8',
    '.vim',
    '.wav',
    '.woff',
    '.woff2',
    '.xht',
    '.xhtml',
    '.xml',
    '.xpm',
    '.xsd',
    '.xul',
    '.yaml',
    '.yml',
    '.wfp',
    '.editorconfig',
    '.dotcover',
    '.pid',
    '.lcov',
    '.egg',
    '.manifest',
    '.cache',
    '.coverage',
    '.cover',
    '.gem',
    '.lst',
    '.pickle',
    '.pdb',
    '.gml',
    '.pot',
    '.plt',
    '.whml',
    '.pom',
    '.smtml',
    '.min.js',
    '.mf',
    '.base64',
    '.s',
    '.diff',
    '.patch',
    '.rules',
    # File endings
    '-doc',
    'config',
    'news',
    'readme',
    'swiftdoc',
    'texidoc',
    'todo',
    'version',
    'ignore',
    'manifest',
    'sqlite',
    'sqlite3',
}
431+
246432
247433class FileFilters (ScanossBase ):
248434 """
@@ -267,6 +453,7 @@ def __init__(self, debug: bool = False, trace: bool = False, quiet: bool = False
267453 skip_size (int): Size to skip
268454 skip_extensions (list): Extensions to skip
269455 skip_folders (list): Folders to skip
456+ is_folder_hashing_scan (bool): Whether the operation is a folder hashing scan
270457 """
271458 super ().__init__ (debug , trace , quiet )
272459
@@ -277,6 +464,7 @@ def __init__(self, debug: bool = False, trace: bool = False, quiet: bool = False
277464 self .skip_folders = kwargs .get ('skip_folders' , [])
278465 self .skip_size = kwargs .get ('skip_size' , 0 )
279466 self .skip_extensions = kwargs .get ('skip_extensions' , [])
467+ self .is_folder_hashing_scan = kwargs .get ('is_folder_hashing_scan' , False )
280468 self .file_folder_pat_spec = self ._get_file_folder_pattern_spec (kwargs .get ('operation_type' , 'scanning' ))
281469 self .size_pat_rules = self ._get_size_limit_pattern_rules (kwargs .get ('operation_type' , 'scanning' ))
282470
@@ -336,36 +524,36 @@ def get_filtered_files_from_files(self, files: List[str], scan_root: str = None)
336524 """
337525 filtered_files = []
338526 for file_path in files :
339- if not os .path .exists (file_path ) or not os .path .isfile (file_path ) or os .path .islink (file_path ):
340- self .print_debug (
341- f'WARNING: File { file_path } does not exist, is not a file, or is a symbolic link. Ignoring.'
342- )
343- continue
344-
345527 path_obj = Path (file_path )
346- if not self .hidden_files_folders and any (part .startswith ('.' ) for part in path_obj .parts ):
347- self .print_debug (f'Skipping file: { file_path } (in hidden directory or is hidden file)' )
348- continue
349-
350528 try :
351529 if scan_root :
352- rel_path = os . path . relpath ( file_path , scan_root )
530+ rel_path = path_obj . relative_to ( scan_root )
353531 else :
354- rel_path = os . path . relpath ( file_path )
532+ rel_path = str ( path_obj )
355533 except ValueError :
356- # If file_path is broken, symlink ignore it
357534 self .print_debug (f'Ignoring file: { file_path } (broken symlink)' )
358535 continue
536+
537+ if not path_obj .exists () or not path_obj .is_file () or path_obj .is_symlink ():
538+ self .print_debug (
539+ f'WARNING: File { rel_path } does not exist, is not a file, or is a symbolic link. Ignoring.'
540+ )
541+ continue
542+
543+ if not self .hidden_files_folders and any (part .startswith ('.' ) for part in path_obj .parts ):
544+ self .print_debug (f'Skipping file: { rel_path } (in hidden directory or is hidden file)' )
545+ continue
546+
359547 if self ._should_skip_file (rel_path ):
360548 continue
361549 try :
362- file_size = os . path . getsize ( file_path )
550+ file_size = path_obj . stat (). st_size
363551 if file_size == 0 :
364552 self .print_debug (f'Skipping file: { rel_path } (empty file)' )
365553 continue
366554 min_size , max_size = self ._get_operation_size_limits (file_path )
367555 if min_size <= file_size <= max_size :
368- filtered_files .append (rel_path )
556+ filtered_files .append (str ( rel_path ) )
369557 else :
370558 self .print_debug (
371559 f'Skipping file: { rel_path } (size { file_size } outside limits { min_size } -{ max_size } )'
@@ -379,8 +567,11 @@ def _get_file_folder_pattern_spec(self, operation_type: str = 'scanning'):
379567 """
380568 Get file path pattern specification.
381569
382- :param operation_type: which operation is being performed
383- :return: List of file path patterns
570+ Args:
571+ operation_type (str): Type of operation ('scanning' or 'fingerprinting')
572+
573+ Returns:
574+ GitIgnoreSpec: GitIgnoreSpec object containing the file path patterns
384575 """
385576 patterns = self ._get_operation_patterns (operation_type )
386577 if patterns :
@@ -391,8 +582,11 @@ def _get_size_limit_pattern_rules(self, operation_type: str = 'scanning'):
391582 """
392583 Get size limit pattern rules.
393584
394- :param operation_type: which operation is being performed
395- :return: List of size limit pattern rules
585+ Args:
586+ operation_type (str): Type of operation ('scanning' or 'fingerprinting')
587+
588+ Returns:
589+ List of size limit pattern rules
396590 """
397591 if self .scanoss_settings :
398592 size_rules = self .scanoss_settings .get_skip_sizes (operation_type )
@@ -417,6 +611,14 @@ def _get_operation_patterns(self, operation_type: str) -> List[str]:
417611 List[str]: Combined list of patterns to skip
418612 """
419613 patterns = []
614+
615+ # Default patterns for skipping directories
616+ if not self .all_folders :
617+ DEFAULT_SKIPPED_DIR_LIST = DEFAULT_SKIPPED_DIRS_HFH if self .is_folder_hashing_scan else DEFAULT_SKIPPED_DIRS
618+ for dir_name in DEFAULT_SKIPPED_DIR_LIST :
619+ patterns .append (f'{ dir_name } /' )
620+
621+ # Custom patterns added in SCANOSS settings file
420622 if self .scanoss_settings :
421623 patterns .extend (self .scanoss_settings .get_skip_patterns (operation_type ))
422624 return patterns
@@ -505,18 +707,21 @@ def _should_skip_file(self, file_rel_path: str) -> bool: # noqa: PLR0911
505707 """
506708 file_name = os .path .basename (file_rel_path )
507709
710+ DEFAULT_SKIPPED_FILES_LIST = DEFAULT_SKIPPED_FILES_HFH if self .is_folder_hashing_scan else DEFAULT_SKIPPED_FILES
711+ DEFAULT_SKIPPED_EXT_LIST = DEFAULT_SKIPPED_EXT_HFH if self .is_folder_hashing_scan else DEFAULT_SKIPPED_EXT
712+
508713 if not self .hidden_files_folders and file_name .startswith ('.' ):
509714 self .print_debug (f'Skipping file: { file_rel_path } (hidden file)' )
510715 return True
511716 if self .all_extensions :
512717 return False
513718 file_name_lower = file_name .lower ()
514719 # Look for exact files
515- if file_name_lower in DEFAULT_SKIPPED_FILES :
720+ if file_name_lower in DEFAULT_SKIPPED_FILES_LIST :
516721 self .print_debug (f'Skipping file: { file_rel_path } (matches default skip file)' )
517722 return True
518723 # Look for file endings
519- for ending in DEFAULT_SKIPPED_EXT :
724+ for ending in DEFAULT_SKIPPED_EXT_LIST :
520725 if file_name_lower .endswith (ending ):
521726 self .print_debug (f'Skipping file: { file_rel_path } (matches default skip ending: { ending } )' )
522727 return True
@@ -531,39 +736,3 @@ def _should_skip_file(self, file_rel_path: str) -> bool: # noqa: PLR0911
531736 self .print_debug (f'Skipping file: { file_rel_path } (matches custom pattern)' )
532737 return True
533738 return False
534-
535- def _should_skip_file_for_hfh (self , file_path : Path ) -> bool :
536- """
537- Check if a file should be skipped during folder hashing scan.
538-
539- Args:
540- file_path (Path): The path to the file to check.
541-
542- Returns:
543- bool: True if the file should be skipped, False otherwise.
544- """
545- try :
546- if (
547- any (part .startswith ('.' ) for part in file_path .parts ) # Hidden files/folders
548- or file_path .is_symlink () # Symlinks
549- or file_path .stat ().st_size == 0 # Empty files
550- ):
551- self .print_debug (f'Skipping file: { file_path } (hidden/symlink/empty)' )
552- return True
553-
554- # Files ending with null
555- if file_path .suffix .lower () == '.txt' :
556- try :
557- with open (file_path , 'rb' ) as f :
558- if f .read ().endswith (b'\x00 ' ):
559- self .print_debug (f'Skipping file: { file_path } (text file ending with null)' )
560- return True
561- except (OSError , IOError ):
562- self .print_debug (f'Skipping file: { file_path } (cannot read file content)' )
563- return True
564-
565- return False
566-
567- except Exception as e :
568- self .print_debug (f'Error checking file { file_path } : { str (e )} ' )
569- return True
0 commit comments