diff --git a/lib/bap_byteweight/bap_byteweight.ml b/lib/bap_byteweight/bap_byteweight.ml
index 0ef151e3e..d4fa971f9 100644
--- a/lib/bap_byteweight/bap_byteweight.ml
+++ b/lib/bap_byteweight/bap_byteweight.ml
@@ -101,7 +101,7 @@ module Make2
 end

 module Bytes = struct
-  include Make2(struct
+  module Self = Make2(struct
       type t = mem
       type key = mem

@@ -114,6 +114,16 @@ module Bytes = struct
         | _ -> None
     end)(Memory.Trie.Stable.V1.R8)

+  let t = Bap_byteweight_signatures.Data.declare "bytes"
+      ~load:(fun bytes ->
+          Binable.of_string (module Self)
+            (Caml.Bytes.unsafe_to_string bytes))
+      ~save:(fun data ->
+          Caml.Bytes.unsafe_of_string @@
+          Binable.to_string (module Self) data)
+
+  include Self
+

   let find bw ~length ~threshold mem =
     let start = Memory.min_addr mem in
diff --git a/lib/bap_byteweight/bap_byteweight.mli b/lib/bap_byteweight/bap_byteweight.mli
index a704072ba..2a007245b 100644
--- a/lib/bap_byteweight/bap_byteweight.mli
+++ b/lib/bap_byteweight/bap_byteweight.mli
@@ -155,6 +155,8 @@ module Bytes : sig
      and type corpus = mem
      and type token := word

+  val t : t Bap_byteweight_signatures.data
+
   (** [find mem ~length ~threshold corpus] extract addresses of all
       memory chunks of the specified [length], that were
       classified
diff --git a/lib/bap_byteweight/bap_byteweight_signatures.ml b/lib/bap_byteweight/bap_byteweight_signatures.ml
index c2df82365..7f92f294f 100644
--- a/lib/bap_byteweight/bap_byteweight_signatures.ml
+++ b/lib/bap_byteweight/bap_byteweight_signatures.ml
@@ -1,7 +1,6 @@
 open Core_kernel
-open Regular.Std
+open Bap_core_theory
 open Bap.Std
-include Self()

 module Config = Bap_main.Extension.Configuration

@@ -14,14 +13,62 @@ type error = [
   | `Sys_error of string
 ]

+
+type 'a data = {
+  name : string;
+  load : (bytes -> 'a);
+  save : ('a -> bytes);
+}
+
 exception Failed of error
 let fail error = raise (Failed error)

-let zip_error entry err =
-  fail (`Corrupted (sprintf "%s: %s" entry err))
-
-let entry ?(comp="default") ~mode arch =
-  Arch.to_string arch / comp / mode
+let corrupted entry err = `Corrupted (sprintf "%s: %s" entry err)
+let zip_error entry err = fail (corrupted entry err)
+
+let compiler_name =
+  Option.value_map ~default:"default" ~f: Theory.Compiler.name
+
+let matches_modulo_bits t name =
+  match Theory.Target.matching t name with
+  | None -> false
+  | Some t' -> Theory.Target.bits t = Theory.Target.bits t'
+
+let matching_entry ?compiler target data {Zip.filename} =
+  match String.split filename ~on:'/' with
+  | [p1; p2; p3] ->
+    matches_modulo_bits target p1 &&
+    String.equal (compiler_name compiler) p2 &&
+    String.equal data.name p3
+  | _ -> fail (`Corrupted ("invalid entry name: " ^ filename))
+
+let with_input file k =
+  let zip = Zip.open_in file in
+  protect ~finally:(fun () -> Zip.close_in zip) ~f:(fun () -> k zip)
+
+let with_output file k =
+  let zip = Zip.open_out file in
+  protect ~finally:(fun () -> Zip.close_out zip) ~f:(fun () -> k zip)
+
+let read_entry ?compiler target data file =
+  with_input file @@ fun zip ->
+  Zip.entries zip |>
+  List.find ~f:(matching_entry ?compiler target data) |> function
+  | None -> None
+  | Some entry ->
+    Some (data.load (Bytes.of_string (Zip.read_entry zip entry)))
+
+let read_entries file =
+  if Fn.non Sys.file_exists file then []
+  else with_input file @@ fun zip ->
+    Zip.entries zip |>
+    List.map ~f:(fun entry ->
+        entry,Zip.read_entry zip entry)
+
+let target_name = Fn.compose KB.Name.unqualified Theory.Target.name
+
+let make_entry ?compiler target data =
+  target_name target / compiler_name compiler / data.name

 let make_path root = root / "signatures" / "byteweight.zip"

@@ -31,14 +78,104 @@ let default_path = match Sys.getenv_opt "BAP_SIGFILE" with
   | Some path -> path
   | None -> make_path Config.datadir

-let paths = [default_path; system_path]
-
-let resolve_path user = match user with
+let default_paths = [default_path; system_path]
+
+let try_lookup ?(paths=[]) ?compiler target data =
+  paths @ default_paths |> List.find_map ~f:(fun path ->
+      if Sys.file_exists path
+      then read_entry ?compiler target data path
+      else None)
+
+let of_exn = function
+  | Sys_error msg -> Error (`Sys_error msg)
+  | Zip.Error (_,ent,err) -> Error (corrupted ent err)
+  | Failed er -> Error er
+  | other -> raise other
+
+let lookup ?paths ?compiler target data =
+  match try_lookup ?paths ?compiler target data with
+  | exception exn -> of_exn exn
+  | None -> Error (`No_entry (target_name target))
+  | Some data -> Ok data
+
+
+let update_or_fail ?compiler target data payload path =
+  let entries =
+    read_entries path |>
+    List.filter ~f:(fun (entry,_) ->
+        not (matching_entry ?compiler target data entry)) in
+  with_output path @@ fun zip ->
+  let path = make_entry ?compiler target data in
+  let data = Bytes.unsafe_to_string (data.save payload) in
+  Zip.add_entry data zip path;
+  List.iter entries ~f:(fun ({Zip.filename; extra; comment; mtime},data) ->
+      Zip.add_entry data zip filename
+        ~extra ~comment ~mtime)
+
+(* [copy input output] copies the remaining contents of [input] to
+   [output]. A read may return fewer than [len] bytes before the end
+   of file, so the loop stops only when nothing was read. *)
+let copy input output =
+  let len = 0x1000 in
+  let buf = Bytes.create len in
+  let rec loop () =
+    let read = In_channel.input input ~buf ~pos:0 ~len in
+    Out_channel.output output ~buf ~pos:0 ~len:read;
+    if read > 0 then loop () in
+  loop ()
+
+(* [temporary_copy file] copies [file] into a fresh temporary file
+   created next to it, so that the copy can later be renamed over
+   [file] without crossing a file system boundary. The output channel
+   is always closed and the copy is removed if copying fails. *)
+let temporary_copy file =
+  let tmp,output = Caml.Filename.open_temp_file
+      ~temp_dir:(Caml.Filename.dirname file) "byteweight" "copy" in
+  try
+    protect ~finally:(fun () -> Out_channel.close output) ~f:(fun () ->
+        In_channel.with_file file ~f:(fun input -> copy input output));
+    tmp
+  with exn -> Sys.remove tmp; raise exn
+
+(* [update ?compiler target data payload path] rewrites the archive at
+   [path] with the entry for [payload] replaced. The new archive is
+   built in a temporary copy that is renamed over [path] only on
+   success, so a failure leaves the original archive untouched. *)
+let update ?compiler target data payload path =
+  match temporary_copy path with
+  | exception exn -> of_exn exn
+  | tmp ->
+    try
+      update_or_fail ?compiler target data payload tmp;
+      Sys.rename tmp path;
+      Ok ()
+    with exn ->
+      Sys.remove tmp;
+      of_exn exn
+
+module Data = struct
+  let registry = Hash_set.create (module String)
+
+  let declare ~load ~save name =
+    if Hash_set.mem registry name
+    then failwithf "The byteweight data type named %S is \
+                    already registered, please pick another name"
+        name ();
+    Hash_set.add registry name;
+    {load; save; name}
+end
+
+(* the old deprecated implementation *)
+
+let resolve_path user =
+  let user = Option.value_map user ~f:List.return ~default:[] in
+  let paths = user @ default_paths in
+  match List.find paths ~f:Sys.file_exists with
+  | None -> fail `No_signatures
   | Some path -> path
-  | None ->
-    match List.find paths ~f:Sys.file_exists with
-    | Some path -> path
-    | None -> fail `No_signatures
+
+let entry ?(comp="default") ~mode arch =
+  Arch.to_string arch / comp / mode

 let load_exn ?comp ?path ~mode arch =
   let path = resolve_path path in
@@ -48,7 +185,7 @@ let load_exn ?comp ?path ~mode arch =
   let entry_path = entry ?comp ~mode arch in
   let r = try
       let entry = Zip.find_entry zip entry_path in
-      Ok (Zip.read_entry zip entry |> Bytes.of_string)
+      Ok (Zip.read_entry zip entry |> Caml.Bytes.unsafe_of_string)
     with Caml.Not_found -> fail (`No_entry entry_path)
        | Zip.Error (_,ent,err) -> zip_error ent err in
   Zip.close_in zip;
diff --git a/lib/bap_byteweight/bap_byteweight_signatures.mli b/lib/bap_byteweight/bap_byteweight_signatures.mli
index de48633b9..bf75ca500 100644
--- a/lib/bap_byteweight/bap_byteweight_signatures.mli
+++ b/lib/bap_byteweight/bap_byteweight_signatures.mli
@@ -1,6 +1,17 @@
-(** Provides signatures storage *)
+(** Interface to the unified storage of signatures.
+
+    The signatures are key-value pairs (entries) located in one or more
+    archives. Keys are target/compiler descriptions and values are
+    arbitrary data.
+
+    The data types of the signature are described with the [Data]
+    module. This library doesn't specify any data types of signature
+    values and they are commonly provided by the libraries that define
+    those data types, e.g., [Bap_byteweight.Bytes].
+*)
+
 open Core_kernel
-open Regular.Std
+open Bap_core_theory
 open Bap.Std

 (** Error conditions *)
@@ -11,16 +22,77 @@ type error = [
   | `Sys_error of string  (** System error has occurred  *)
 ]

+(** the descriptor of the data type stored in the signature entry.
+
+    @since 2.5.0
+*)
+type 'a data
+
+
+(** [lookup target data] looks up the matching entry in the signature database.
+
+    The search is performed over the [paths] list that is a list of
+    filenames. The first matching entry is selected. If a file in the
+    [paths] list doesn't exist then it is skipped. If it exists but
+    is unreadable an error is returned.
+
+    The paths list is always appended by [[default_path; system_path]],
+    in that specific order.
+
+    If [compiler] is specified, then only entries that list matching
+    compiler will be selected.
+
+    The target matches are performed with the [Theory.Target.matching]
+    function, and the matched target must have the same bit width.
+
+    @since 2.5.0
+*)
+val lookup :
+  ?paths:string list ->
+  ?compiler:Theory.compiler ->
+  Theory.Target.t -> 'a data -> ('a, error) Result.t
+
+
+(** [update target data x path] updates or creates an entry in the signature database.
+
+    Removes all entries that match with the specified compiler,
+    target, and data type and adds a new entry with the provided
+    data. All unmatching entries are preserved.
+
+    @since 2.5.0
+*)
+val update :
+  ?compiler:Theory.compiler ->
+  Theory.Target.t -> 'a data -> 'a -> string -> (unit,error) Result.t
+
+
+(** Interface for declaring signature database data types. *)
+module Data : sig
+
+  (** [declare ~load ~save name] declares a new mode.
+
+      The [load] and [save] functions are used to store the mode
+      information in the signatures database.
+
+      Raises an exception if the mode name is not unique.
+  *)
+  val declare :
+    load:(bytes -> 'a) ->
+    save:('a -> bytes) ->
+    string -> 'a data
+end

 (** [save ?comp ~mode ~path arch data] store signatures data in the
-    database of signatures specified by the [path] parameter. The
-    triple [arch-comp-mode] defines a key for the created entry. If an
+    database of signatures specified by the [path] parameter.
+
+
+    The triple [arch-comp-mode] defines a key for the created entry. If an
     entry with the same name existed, then it would be overwritten
     with the new data. If the database, doesn't exist, then it will
     be created and the specified destination.*)
 val save : ?comp:string -> mode:string -> path:string -> arch -> bytes ->
   (unit,error) Result.t
-
+[@@deprecated "since 2022-02 use [update]"]

 (** [load ?comp ?path ~mode arch] finds a signature for the specified
     [arch-comp-path] triple.
@@ -33,6 +105,7 @@ val save : ?comp:string -> mode:string -> path:string -> arch -> bytes ->
 *)
 val load : ?comp:string -> ?path:string -> mode:string -> arch ->
   (bytes,error) Result.t
+[@@deprecated "since 2022-02 use [lookup]"]


 (** default path for the user's signatures database.
diff --git a/oasis/byteweight b/oasis/byteweight
index 24aba402b..f7c1b255d 100644
--- a/oasis/byteweight
+++ b/oasis/byteweight
@@ -8,14 +8,17 @@ Library bap_byteweight
   Build$:           flag(everything) || flag(byteweight)
   CompiledObject:   best
   Modules:          Bap_byteweight, Bap_byteweight_signatures
-  BuildDepends:     bap, bap-main, core_kernel, uri, regular, camlzip, ppx_bap
+  BuildDepends:     bap, bap-main, bap-core-theory, bap-knowledge,
+                    core_kernel, uri, camlzip, ppx_bap

 Library byteweight_plugin
   Path:             plugins/byteweight
   FindlibName:      bap-plugin-byteweight
   Build$:           flag(everything) || flag(byteweight)
   CompiledObject:   best
-  BuildDepends:     bap, bap-byteweight, core_kernel, regular, ppx_bap, bap-future
+  BuildDepends:     bap, bap-byteweight, core_kernel, ppx_bap,
+                    bitvec, bitvec-order,
+                    bap-knowledge, bap-core-theory, bap-main
   InternalModules:  Byteweight_main
   XMETADescription: find function starts using Byteweight algorithm
   XMETAExtraLines:  tags="pass, rooter"
diff --git a/plugins/byteweight/.merlin b/plugins/byteweight/.merlin
index c7087f785..16b3eec9a 100644
--- a/plugins/byteweight/.merlin
+++ b/plugins/byteweight/.merlin
@@ -1,3 +1,3 @@
-PKG cmdliner
 REC
-B ../../_build/lib/bap_byteweight
\ No newline at end of file
+B ../../_build/lib/bap_byteweight
+B ../../lib/bap_byteweight
\ No newline at end of file
diff --git a/plugins/byteweight/byteweight_main.ml b/plugins/byteweight/byteweight_main.ml
index 05e30699a..1dae80945 100644
--- a/plugins/byteweight/byteweight_main.ml
+++ b/plugins/byteweight/byteweight_main.ml
@@ -1,143 +1,152 @@
+let doc = {|
+# DESCRIPTION
+
+Identifies function starts using a predefined set of function start
+signatures. Each signature is a sequence of bytes equipped with a
+sample probability of it occurring as a function start. The input
+memory is scanned, and for each byte that is not yet classified as a
+function start the longest sequence of bytes is searched in the
+signatures. If one is found, then the $(b,threshold) parameter defines
+the decision procedure. If it is a value below $(b,1.0) then the
+sequence of bytes will be classified as a function start if the
+associated probability is higher than the specified threshold. If the
+threshold is greater or equal than 1.0, then the sequence of bytes
+will be classified as a function start if the Bayes factor of the two
+competing hypothesis is greater than the specified threshold. The
+Bayes factor is the ratio between the posterior probabilities of the
+competing hypothesis. Therefore, it includes the prior odds of finding
+a function start, which makes the hypothesis testing more robust. The
+Bayes factor value is having the following interpretations:
+
+```
+    Bayes Factor          Strength
+
+    1 to 3.2              Weak
+    3.2 to 10             Substantial
+    10 to 100             Strong
+    100 and greater       Decisive;
+```
+
+This plugin is a partial implementation of the
+BYTEWEIGHT algorithm as described in [1]. Only the byte level matching
+is implemented. The $(b,SEE ALSO) section contains links to other
+plugins, that provide function identification services.
+
+[1]: Bao, Tiffany, et al. "Byteweight: Learning to recognize
+functions in binary code." 23rd USENIX Security Symposium (USENIX
+Security 14). 2014.
+
+# SEE ALSO
+
+$(b,bap-byteweight)(1), $(b,bap-plugin-ida)(1), $(b,bap-plugin-read-symbols)(1)
+|}
+
 open Core_kernel
+open Bap_main
+open Bap_core_theory
 open Bap.Std
-open Format
-open Bap_future.Std
-include Self()
+open KB.Syntax
+include Loggers()

 module BW = Bap_byteweight.Bytes
 module Sigs = Bap_byteweight_signatures
-module Stats = Bap_byteweight.Stats

 let p1 m n = float m /. float (m + n)
 and p0 m n = float n /. float (m + n)

+let roots = KB.Class.property Theory.Unit.cls "byteweight-roots"
+    ~package:"bap" @@ KB.Domain.powerset (module Bitvec_order) "roots"
+
+let no_roots = Set.empty (module Bitvec_order)
+let of_addrs =
+  List.fold ~init:no_roots ~f:(fun roots addr ->
+      Set.add roots (Addr.to_bitvec addr))
+
+let make_compiler compiler unit = match compiler with
+  | Some name -> KB.return @@ Some (Theory.Compiler.create name)
+  | None -> unit-->Theory.Unit.compiler

-let create_finder path ~min_length ~max_length threshold arch comp =
-  match Sigs.load ?comp ~path ~mode:"bytes" arch with
+let compute_root_table path min_length max_length threshold compiler =
+  KB.Rule.(begin
+      declare ~package:"bap" "precompute-byteweight-rules" |>
+      dynamic ["byteweight signatures"] |>
+      require Theory.Unit.target |>
+      require Project.memory_slot |>
+      require Theory.Unit.compiler |>
+      provide roots |>
+      comment "precomputes byteweight roots"
+    end);
+  let paths = Option.value_map path ~f:List.return ~default:[] in
+  KB.promise roots @@ fun unit ->
+  let* target = unit-->Theory.Unit.target in
+  let* memory = unit-->Project.memory_slot in
+  let* compiler = make_compiler compiler unit in
+  KB.guard (not (Memmap.is_empty memory)) >>| fun () ->
+  match Sigs.lookup ~paths ?compiler target BW.t with
   | Error `No_signatures ->
-    info "function starts signatures are not available";
-    info "advice - use `bap-byteweight` to install signatures";
-    info "advice - alternatively, use `opam install bap-signatures'";
-    Or_error.errorf "signatures are unavailable"
-  | Error (`Corrupted err) ->
-    error "function starts signature file is corrupted: %s" err;
-    info "advice - delete signatures at `%s'" path;
-    info "advice - use `bap-byteweight` to install signatures";
-    info "advice - alternatively, use `opam install bap-signatures'";
-    Or_error.errorf "signatures are corrupted"
-  | Error (`No_entry _) ->
-    warning "no signatures for the specified compiler and/or architecture";
-    info "advice - try to use the default compiler entry";
-    info "advice - create new entries using the `bap-byteweight' tool";
-    Or_error.errorf "compiler is not supported by signatures"
-  | Error (`Sys_error err) ->
-    error "failed to load the signatures because of a system error: %s" err;
-    Or_error.errorf "system error"
-  | Ok data ->
-    let sigs = Binable.of_string (module BW) (Bytes.to_string data) in
-    Result.return @@
-    if Float.(threshold >= 1.0)
-    then BW.find_using_bayes_factor sigs ~min_length ~max_length threshold
-    else BW.find_using_threshold sigs ~min_length ~max_length threshold
+    warning "The signatures database is empty.";
+    info "install the signatures with `opam install bap-signatures'";
+    info "alternatively use `bap-byteweight update'";
+    no_roots
+  | Error `No_entry s ->
+    info "no signatures for %s" s;
+    info "use `bap-byteweight train' to create signatures";
+    no_roots
+  | Error (`Sys_error _ | `Corrupted _ as problem) ->
+    error "the signatures database is broken: %s"
+      (Sigs.string_of_error problem);
+    no_roots
+  | Ok sigs ->
+    let find = if Float.(threshold >= 1.0)
+      then BW.find_using_bayes_factor
+          sigs ~min_length ~max_length threshold
+      else BW.find_using_threshold
+          sigs ~min_length ~max_length threshold in
+    Memmap.to_sequence memory |>
+    Seq.fold ~init:no_roots ~f:(fun roots (mem,_) ->
+        Set.union roots @@ of_addrs (find mem))

-let main path min_length max_length threshold comp =
-  let finder arch = create_finder path threshold arch comp
-      ~min_length ~max_length in
-  let find finder mem =
-    Memmap.to_sequence mem |>
-    Seq.fold ~init:Addr.Set.empty ~f:(fun roots (mem,_) ->
-        Set.union roots @@ Addr.Set.of_list (finder mem)) in
-  let find_roots arch mem = match finder arch with
-    | Error _ as err ->
-      warning "will not provide roots";
-      err
-    | Ok finder -> match find finder mem with
-      | roots when Set.is_empty roots ->
-        info "no roots were found";
-        info "advice - check your signatures";
-        Ok (Rooter.create Seq.empty)
-      | roots -> Ok (roots |> Set.to_sequence |> Rooter.create) in
-  if Sys.file_exists path then
-    let args = Stream.Variadic.(begin
-        args Project.Info.arch $Project.Info.code $Project.Info.file
-      end) in
-    Stream.Variadic.apply args ~f:(fun arch mem path ->
-        match find_roots arch mem with
-        | Ok roots -> Ok (Rooter.set_path roots path)
-        | Error err -> Error err) |> fun rooters ->
-    Stream.observe rooters @@ function
-    | Ok rooter -> Rooter.provide rooter
-    | Error _ -> ()
-
-  else begin
-    warning "the signature database is not available";
-    info "advice - use `bap-byteweight` to install signatures";
-    info "advice - alternatively, use `opam install bap-signatures'";
-  end
+let provide_roots () =
+  KB.Rule.(begin
+      declare ~package:"bap" "byteweight" |>
+      require roots |>
+      provide Theory.Label.is_subroutine |>
+      comment "uses byteweight to find function starts"
+    end);
+  KB.promise Theory.Label.is_subroutine @@ fun program ->
+  let*? unit = program-->Theory.Label.unit in
+  let*? addr = program-->Theory.Label.addr in
+  let+ roots = unit-->roots in
+  Option.some_if (Set.mem roots addr) true

+let main path min_length max_length threshold comp =
+  compute_root_table path min_length max_length threshold comp;
+  provide_roots ()


 let () =
-  Config.manpage [
-    `S "DESCRIPTION";
-    `P {|
-    Identifies function starts using a predefined sets of function
-    start signatures. Each signature is a sequence of bytes equipped with
-    a sample probability of occuring it as a function start. The input
-    memory is scanned, and for each byte that is not yet classified as a
-    function start the longest sequence of bytes is searched in the
-    signatures. If one is found, then the $(b,threshold) parameter defines
-    the decision procedure. If it is a value below $(b,1.0) then the
-    sequence of bytes will be classified as a function start if the
-    the associated probability is higher than the specified threshold.
-    If the threshold is greater or equal than 1.0, then the sequence of
-    bytes will be classified as a function start if the Bayes factor of
-    the two competing hypothesis is greater than the specified
-    threshold. The Bayes factor is the ratio between the posterior
-    probabilities of the competing hypothesis. Therefore, it includes
-    the prior odds of finding a function start, which makes the
-    hypothesis testing more robust. The Bayes factor value is having the
-    following interpretations:
-|};
-    `Pre "
-      Bayes Factor          Strength
-
-      1 to 3.2              Weak
-      3.2 to 10             Substantial
-      10 to 100             Strong
-      100 and greater       Decisive;
-";
-
-
-    `P "This plugin is a partial implementation of the starts, partially
-    BYTEWEIGHT algorithm as described in [1]. Only the byte level
-    matching is implemented. The $(b,SEE ALSO) section contains
-    links to other plugins, that provide function identification services.";
-
-    `P "[1]: Bao, Tiffany, et al. \"Byteweight: Learning to recognize
-    functions in binary code.\" 23rd USENIX Security Symposium (USENIX
-    Security 14). 2014.";
-    `S "SEE ALSO";
-    `P "$(b,bap-byteweight)(1), $(b,bap-plugin-ida)(1), $(b,bap-plugin-read-symbols)(1)"
-  ];
-  let open Config in
-  let min_length = param int ~default:8 "min-length"
+  let open Extension.Configuration in
+  let open Extension.Type in
+  let min_length = parameter (int =? 8) "min-length"
       ~doc:"The minimum length of a word, that could identify a \
             function start. Any signatures that are below that \
             length, will not be considered, affect prior \
             probabilities, etc." in
-  let max_length = param int ~default:16 "max-length"
-      ~synonyms:["length"]
+  let max_length = parameter (int =? 16) "max-length"
+      ~aliases:["length"]
       ~doc:"The maximum length of a word, that could identify a \
             function start. Any signatures that are greater than that \
             length, will not be considered, affect prior \
             probabilities, etc." in
-  let threshold = param float ~default:10. "threshold"
+  let threshold = parameter (float =? 10.) "threshold"
       ~doc:"If greater than 1.0 then it is the Bayes factor, \
             otherwise it is a probability." in
-  let sigsfile = param non_dir_file ~default:Sigs.default_path "sigs"
+  let sigsfile = parameter (some non_dir_file) "sigs"
+      ~aliases:["signatures"]
       ~doc:"Path to the signature file" in
-  let compiler = param (some string) "comp"
+  let compiler = parameter (some string) "compiler"
       ~doc:"Assume the input file is compiled by $(docv)" in
-  Config.when_ready (fun {Config.get=(!)} ->
-      main !sigsfile !min_length !max_length !threshold !compiler)
+  Extension.declare ~doc ~provides:["roots"] @@ fun ctxt ->
+  let (!) p = get ctxt p in
+  main !sigsfile !min_length !max_length !threshold !compiler;
+  Ok ()
diff --git a/src/.merlin b/src/.merlin
index 3f3b9f6fe..abf001705 100644
--- a/src/.merlin
+++ b/src/.merlin
@@ -3,5 +3,8 @@ PKG cmdliner
 PKG curl
 PKG parsexp
 PKG uuidm
+PKG re
 B ../_build/src
 B ../_build/lib/bap_byteweight
+B ../src
+B ../lib/bap_byteweight
diff --git a/src/bap_byteweight_main.ml b/src/bap_byteweight_main.ml
index 455eef968..80f7f87aa 100644
--- a/src/bap_byteweight_main.ml
+++ b/src/bap_byteweight_main.ml
@@ -74,6 +74,10 @@ module Sigs = Bap_byteweight_signatures
 module Digest = Caml.Digest
 module Config = Extension.Configuration

+(* we still have to update this tool to support modern bap *)
+let sigs_load = Sigs.load[@warning "-D"]
+let sigs_save = Sigs.save[@warning "-D"]
+
 type failure =
   | Image of Error.t
   | Sigs of Sigs.error
@@ -114,7 +118,7 @@ let load_or_create_signatures ?comp ?path operation arch =
   match operation with
   | `rewrite -> Ok (BW.create ())
   | operation ->
-    match Sigs.load ?comp ?path ~mode:"bytes" arch, operation with
+    match sigs_load ?comp ?path ~mode:"bytes" arch, operation with
     | Ok s,_ -> Ok (Binable.of_string (module BW) (Bytes.to_string s))
     | Error (`No_entry _|`No_signatures), `update ->
       Ok (BW.create ())
@@ -138,7 +142,7 @@ let train_on_file loader comp operation max_length db path =
   load_or_create_signatures ?comp ~path:db operation arch >>= fun bw ->
   Seq.iter (code_of_image img) ~f:(BW.train bw ~max_length oracle);
   with_sigs_error @@
-  Sigs.save ?comp ~mode:"bytes" ~path:db arch @@
+  sigs_save ?comp ~mode:"bytes" ~path:db arch @@
   Bytes.of_string @@
   Binable.to_string (module BW) bw