@@ -185,35 +185,40 @@ def _extract_rule_ids(self, path: str, target_files: set[str]) -> set[str]:
185185            If any Semgrep rule file could not be safely loaded, or if their format was not in the expected Semgrep 
186186            format, or if there were any files in 'target_files' not found when searching in 'path'. 
187187        """ 
188-         path_tree  =  glob .glob (os .path .join (path , "**" , "*" ), recursive = True )
189-         all_file_names  =  {os .path .basename (file ) for  file  in  path_tree  if  os .path .isfile (file )}
190-         if  not  target_files .issubset (all_file_names ):
191-             error_msg  =  f"The following semgrep files were not found in { path } { target_files  -  all_file_names }  
188+         # We keep a record of any file paths we couldn't find to provide a more useful error message, rather than raising 
189+         # an error on the first missing file we see. 
190+         missing_files : list [str ] =  []
191+         target_file_paths : list [str ] =  []
192+         rule_ids : set [str ] =  set ()
193+ 
194+         for  target_file  in  target_files :
195+             file_paths  =  glob .glob (os .path .join (path , "**" , target_file ), recursive = True )
196+             if  not  file_paths :
197+                 missing_files .append (target_file )
198+             target_file_paths .extend (file_paths )
199+ 
200+         if  missing_files :
201+             error_msg  =  f"The following semgrep files were not found in { path } { missing_files }  
192202            logger .debug (error_msg )
193203            raise  ConfigurationError (error_msg )
194204
195-         rule_ids  =  set ()
196-         for  root , _ , files  in  os .walk (path ):
197-             files_found  =  set .intersection (target_files , set (files ))
198-             for  filename  in  files_found :
199-                 semgrep_ruleset_file  =  os .path .join (root , filename )
200- 
201-                 try :
202-                     with  open (semgrep_ruleset_file , encoding = "utf-8" ) as  file :
203-                         semgrep_ruleset : dict [str , list ] =  yaml .safe_load (file .read ())
204-                 except  yaml .YAMLError  as  yaml_error :
205-                     error_msg  =  f"Unable to open semgrep rule file { semgrep_ruleset_file } { yaml_error }  
206-                     logger .debug (error_msg )
207-                     raise  ConfigurationError (error_msg ) from  yaml_error 
208- 
209-                 # should be a top-level key "rules", and then a list of rules (dictionaries) with "id" entries 
210-                 try :
211-                     for  semgrep_rule  in  semgrep_ruleset ["rules" ]:
212-                         rule_ids .add (semgrep_rule ["id" ])
213-                 except  (KeyError , TypeError ) as  format_error :
214-                     error_msg  =  f"Invalid semgrep rule format for { semgrep_ruleset_file } { format_error }  
215-                     logger .debug (error_msg )
216-                     raise  ConfigurationError (error_msg ) from  format_error 
205+         for  file_path  in  target_file_paths :
206+             try :
207+                 with  open (file_path , encoding = "utf-8" ) as  file :
208+                     semgrep_ruleset : dict [str , list ] =  yaml .safe_load (file .read ())
209+             except  yaml .YAMLError  as  yaml_error :
210+                 error_msg  =  f"Unable to open semgrep rule file { file_path } { yaml_error }  
211+                 logger .debug (error_msg )
212+                 raise  ConfigurationError (error_msg ) from  yaml_error 
213+ 
214+             # should be a top-level key "rules", and then a list of rules (dictionaries) with "id" entries 
215+             try :
216+                 for  semgrep_rule  in  semgrep_ruleset ["rules" ]:
217+                     rule_ids .add (semgrep_rule ["id" ])
218+             except  (KeyError , TypeError ) as  format_error :
219+                 error_msg  =  f"Invalid semgrep rule format for { file_path } { format_error }  
220+                 logger .debug (error_msg )
221+                 raise  ConfigurationError (error_msg ) from  format_error 
217222
218223        return  rule_ids 
219224
@@ -306,7 +311,7 @@ def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicRes
306311            # e.g. rule_id = src.macaron.resources.pypi_malware_rules.obfuscation_decode-and-execute, which comes from 
307312            # the rule ID 'obfuscation_decode-and-execute' inside 'obfuscation.yaml'. 
308313            if  rule_id .split ("." )[- 1 ] in  self .disabled_rule_ids :
309-                 if  rule_id  not  in self . disabled_rule_ids :
314+                 if  rule_id  not  in disabled_results :
310315                    disabled_results [rule_id ] =  {"message" : message , "detections" : []}
311316                disabled_results [rule_id ]["detections" ].append ({"file" : file , "start" : start , "end" : end })
312317
0 commit comments