Skip to content

Commit afc8cb6

Browse files
matiasdaloia and coresoftware dev authored
[SP-2991] feat: add recursive search support for folder hashing (#141)
* [SP-2991] feat: add depth and min-cutoff-threshold arguments to folder hashing commands
* [SP-2991] chore: update grpc definitions
* [SP-2991] chore: update dockerfile, scanoss.json and setup.cfg
* remove file extension filters to match go-minr criteria
* [SP-3040] fix: remove unused code
* [SP-2991]: chore: update changelog and bump version
* fix hfh extension filter bug
* [SP-2874]: rename to recursive_threshold
* [SP-2874]: add min_accepted_score
* [SP-2991] chore: update changelog, bump version

---------

Co-authored-by: coresoftware dev <coredev@scanoss.com>
1 parent 01b5281 commit afc8cb6

File tree

8 files changed

+93
-175
lines changed

8 files changed

+93
-175
lines changed

CHANGELOG.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
99
### Added
1010
- Upcoming changes...
1111

12+
## [1.36.0] - 2025-10-08
13+
### Added
14+
- Add `--recursive-threshold` and `--min-accepted-score` arguments to the `folder-scan` command
15+
- Add `--depth` argument to `folder-scan` and `folder-hash` commands
16+
1217
## [1.35.0] - 2025-10-07
1318
### Modified
1419
- Use gRPC instead of REST for API calls
@@ -677,3 +682,5 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
677682
[1.32.0]: https://github.com/scanoss/scanoss.py/compare/v1.31.5...v1.32.0
678683
[1.33.0]: https://github.com/scanoss/scanoss.py/compare/v1.32.0...v1.33.0
679684
[1.34.0]: https://github.com/scanoss/scanoss.py/compare/v1.33.0...v1.34.0
685+
[1.35.0]: https://github.com/scanoss/scanoss.py/compare/v1.34.0...v1.35.0
686+
[1.36.0]: https://github.com/scanoss/scanoss.py/compare/v1.35.0...v1.36.0

src/scanoss/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,4 +22,4 @@
2222
THE SOFTWARE.
2323
"""
2424

25-
__version__ = '1.35.0'
25+
__version__ = '1.36.0'

src/scanoss/api/common/v2/scanoss_common_pb2_grpc.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import warnings
44

55
import grpc
6+
import warnings
67

78
GRPC_GENERATED_VERSION = '1.73.1'
89
GRPC_VERSION = grpc.__version__

src/scanoss/cli.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,10 @@
5959
from .components import Components
6060
from .constants import (
6161
DEFAULT_API_TIMEOUT,
62+
DEFAULT_HFH_DEPTH,
63+
DEFAULT_HFH_MIN_ACCEPTED_SCORE,
6264
DEFAULT_HFH_RANK_THRESHOLD,
65+
DEFAULT_HFH_RECURSIVE_THRESHOLD,
6366
DEFAULT_POST_SIZE,
6467
DEFAULT_RETRY,
6568
DEFAULT_TIMEOUT,
@@ -869,6 +872,27 @@ def setup_args() -> None: # noqa: PLR0912, PLR0915
869872
help='Filter results to only show those with rank value at or below this threshold (e.g., --rank-threshold 3 '
870873
'returns results with rank 1, 2, or 3). Lower rank values indicate higher quality matches.',
871874
)
875+
p_folder_scan.add_argument(
876+
'--depth',
877+
type=int,
878+
default=DEFAULT_HFH_DEPTH,
879+
help=f'Defines how deep to scan the root directory (optional - default {DEFAULT_HFH_DEPTH})',
880+
)
881+
p_folder_scan.add_argument(
882+
'--recursive-threshold',
883+
type=float,
884+
default=DEFAULT_HFH_RECURSIVE_THRESHOLD,
885+
help=f'Minimum score threshold to consider a match (optional - default: {DEFAULT_HFH_RECURSIVE_THRESHOLD})',
886+
)
887+
p_folder_scan.add_argument(
888+
'--min-accepted-score',
889+
type=float,
890+
default=DEFAULT_HFH_MIN_ACCEPTED_SCORE,
891+
help=(
892+
'Only show results with a score at or above this threshold '
893+
f'(optional - default: {DEFAULT_HFH_MIN_ACCEPTED_SCORE})'
894+
),
895+
)
872896
p_folder_scan.set_defaults(func=folder_hashing_scan)
873897

874898
# Sub-command: folder-hash
@@ -887,6 +911,12 @@ def setup_args() -> None: # noqa: PLR0912, PLR0915
887911
default='json',
888912
help='Result output format (optional - default: json)',
889913
)
914+
p_folder_hash.add_argument(
915+
'--depth',
916+
type=int,
917+
default=DEFAULT_HFH_DEPTH,
918+
help=f'Defines how deep to hash the root directory (optional - default {DEFAULT_HFH_DEPTH})',
919+
)
890920
p_folder_hash.set_defaults(func=folder_hash)
891921

892922
# Output options
@@ -2456,6 +2486,9 @@ def folder_hashing_scan(parser, args):
24562486
client=client,
24572487
scanoss_settings=scanoss_settings,
24582488
rank_threshold=args.rank_threshold,
2489+
depth=args.depth,
2490+
recursive_threshold=args.recursive_threshold,
2491+
min_accepted_score=args.min_accepted_score,
24592492
)
24602493

24612494
if scanner.scan():
@@ -2489,6 +2522,7 @@ def folder_hash(parser, args):
24892522
scan_dir=args.scan_dir,
24902523
config=folder_hasher_config,
24912524
scanoss_settings=scanoss_settings,
2525+
depth=args.depth,
24922526
)
24932527

24942528
folder_hasher.hash_directory(args.scan_dir)

src/scanoss/constants.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,4 +13,7 @@
1313

1414
DEFAULT_API_TIMEOUT = 600
1515

16-
DEFAULT_HFH_RANK_THRESHOLD = 5
16+
DEFAULT_HFH_RANK_THRESHOLD = 5
17+
DEFAULT_HFH_DEPTH = 1
18+
DEFAULT_HFH_RECURSIVE_THRESHOLD = 0.8
19+
DEFAULT_HFH_MIN_ACCEPTED_SCORE = 0.15

src/scanoss/file_filters.py

Lines changed: 1 addition & 158 deletions
Original file line numberDiff line numberDiff line change
@@ -269,162 +269,6 @@
269269
'sqlite3',
270270
}
271271

272-
# TODO: For hfh add the .gitignore patterns
273-
DEFAULT_SKIPPED_EXT_HFH = {
274-
'.1',
275-
'.2',
276-
'.3',
277-
'.4',
278-
'.5',
279-
'.6',
280-
'.7',
281-
'.8',
282-
'.9',
283-
'.ac',
284-
'.adoc',
285-
'.am',
286-
'.asciidoc',
287-
'.bmp',
288-
'.build',
289-
'.cfg',
290-
'.chm',
291-
'.class',
292-
'.cmake',
293-
'.cnf',
294-
'.conf',
295-
'.config',
296-
'.contributors',
297-
'.copying',
298-
'.crt',
299-
'.csproj',
300-
'.css',
301-
'.csv',
302-
'.dat',
303-
'.data',
304-
'.dtd',
305-
'.dts',
306-
'.iws',
307-
'.c9',
308-
'.c9revisions',
309-
'.dtsi',
310-
'.dump',
311-
'.eot',
312-
'.eps',
313-
'.geojson',
314-
'.gif',
315-
'.glif',
316-
'.gmo',
317-
'.guess',
318-
'.hex',
319-
'.htm',
320-
'.html',
321-
'.ico',
322-
'.iml',
323-
'.in',
324-
'.inc',
325-
'.info',
326-
'.ini',
327-
'.ipynb',
328-
'.jpeg',
329-
'.jpg',
330-
'.json',
331-
'.jsonld',
332-
'.lock',
333-
'.log',
334-
'.m4',
335-
'.map',
336-
'.md5',
337-
'.meta',
338-
'.mk',
339-
'.mxml',
340-
'.o',
341-
'.otf',
342-
'.out',
343-
'.pbtxt',
344-
'.pdf',
345-
'.pem',
346-
'.phtml',
347-
'.plist',
348-
'.png',
349-
'.prefs',
350-
'.properties',
351-
'.pyc',
352-
'.qdoc',
353-
'.result',
354-
'.rgb',
355-
'.rst',
356-
'.scss',
357-
'.sha',
358-
'.sha1',
359-
'.sha2',
360-
'.sha256',
361-
'.sln',
362-
'.spec',
363-
'.sub',
364-
'.svg',
365-
'.svn-base',
366-
'.tab',
367-
'.template',
368-
'.test',
369-
'.tex',
370-
'.tiff',
371-
'.ttf',
372-
'.txt',
373-
'.utf-8',
374-
'.vim',
375-
'.wav',
376-
'.woff',
377-
'.woff2',
378-
'.xht',
379-
'.xhtml',
380-
'.xml',
381-
'.xpm',
382-
'.xsd',
383-
'.xul',
384-
'.yaml',
385-
'.yml',
386-
'.wfp',
387-
'.editorconfig',
388-
'.dotcover',
389-
'.pid',
390-
'.lcov',
391-
'.egg',
392-
'.manifest',
393-
'.cache',
394-
'.coverage',
395-
'.cover',
396-
'.gem',
397-
'.lst',
398-
'.pickle',
399-
'.pdb',
400-
'.gml',
401-
'.pot',
402-
'.plt',
403-
'.whml',
404-
'.pom',
405-
'.smtml',
406-
'.min.js',
407-
'.mf',
408-
'.base64',
409-
'.s',
410-
'.diff',
411-
'.patch',
412-
'.rules',
413-
# File endings
414-
'-doc',
415-
'config',
416-
'news',
417-
'readme',
418-
'swiftdoc',
419-
'texidoc',
420-
'todo',
421-
'version',
422-
'ignore',
423-
'manifest',
424-
'sqlite',
425-
'sqlite3',
426-
}
427-
428272

429273
class FileFilters(ScanossBase):
430274
"""
@@ -707,9 +551,8 @@ def _should_skip_file(self, file_rel_path: str) -> bool: # noqa: PLR0911
707551
bool: True if file should be skipped, False otherwise
708552
"""
709553
file_name = os.path.basename(file_rel_path)
710-
554+
DEFAULT_SKIPPED_EXT_LIST = {} if self.is_folder_hashing_scan else DEFAULT_SKIPPED_EXT
711555
DEFAULT_SKIPPED_FILES_LIST = DEFAULT_SKIPPED_FILES_HFH if self.is_folder_hashing_scan else DEFAULT_SKIPPED_FILES
712-
DEFAULT_SKIPPED_EXT_LIST = DEFAULT_SKIPPED_EXT_HFH if self.is_folder_hashing_scan else DEFAULT_SKIPPED_EXT
713556

714557
if not self.hidden_files_folders and file_name.startswith('.'):
715558
self.print_debug(f'Skipping file: {file_rel_path} (hidden file)')

0 commit comments

Comments
 (0)