diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c405f757..a4ad318e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -43,4 +43,8 @@ jobs: - run: nix flake check --all-systems --print-build-logs - - run: nix build --print-build-logs .#probe-bundled .#probe-py + - run: nix build --print-build-logs .#probe-bundled + + # The devshell uses a slightly different build process than the Nix pkg + # Might as well test that too + - run: nix develop --command just compile fix check test-native diff --git a/.gitignore b/.gitignore index e83ced90..6e713a5c 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,7 @@ **/work **/.pytest_cache **/.idea +**/__pycache__/ # build directories **/target @@ -22,4 +23,4 @@ **/desktop.ini probe_log -dataflow_graph.pkl +.dmypy.json diff --git a/Justfile b/Justfile index ffee3e47..0c8385e0 100644 --- a/Justfile +++ b/Justfile @@ -1,34 +1,51 @@ -fix-format-nix: +fix-nix: alejandra . -fix-ruff: - #ruff format probe_src # TODO: uncomment - ruff check --fix probe_src +fix-py: compile-cli + # fix-py depends on compile-cli for the autogen python code + ruff format probe_py/ tests/ libprobe/generator/ + ruff check --fix probe_py/ tests/ libprobe/generator/ -fix-format-rust: - env --chdir probe_src/frontend cargo fmt +fix-cli: + # cargo clippy --fix refuses to run if there are unstaged changes (fixes may be destructive), + # so we git add -A + env --chdir cli-wrapper git add -A + env --chdir cli-wrapper cargo clippy --fix --allow-staged -- --deny warnings + env --chdir cli-wrapper cargo fmt -fix-clippy: - git add -A - env --chdir probe_src/frontend cargo clippy --fix --allow-staged +fix: fix-nix fix-py fix-cli -check-mypy: - mypy --strict probe_src/libprobe - mypy --strict --package probe_py.generated - mypy --strict --package probe_py.manual +check-py: compile-cli + # dmypy == daemon mypy; much faster on subsequent iterations. + dmypy run -- --strict --no-namespace-packages --pretty probe_py/ tests/ libprobe/generator/ + +check-cli: + env --chdir cli-wrapper cargo doc --workspace + +check: check-py check-cli compile-lib: - make --directory=probe_src/libprobe all + make --directory=libprobe all compile-cli: - env --chdir=probe_src/frontend cargo build --release + env --chdir=cli-wrapper cargo build --release + env --chdir=cli-wrapper cargo build compile-tests: - make --directory=probe_src/tests/c all + make --directory=tests/examples all compile: compile-lib compile-cli compile-tests -test-dev: compile - pytest probe_src --failed-first --maxfail=1 +test-nix: + nix build .#probe-bundled + nix flake check --all-systems + +test-native: compile + python -m pytest tests/ -ra --failed-first --maxfail=1 -v + +test: test-native +# Unless the user explicitly asks (`just test-nix`), +# we don't really need to run test-nix. +# It runs the same checks as `just test` and `just check`, but in Nix. -pre-commit: fix-format-nix fix-ruff fix-format-rust fix-clippy compile check-mypy test-dev +pre-commit: fix check compile test diff --git a/README.md b/README.md index 6be12ef6..6354a3d9 100644 --- a/README.md +++ b/README.md @@ -109,30 +109,28 @@ probe export --help 7. **Before submitting a PR**, run `just pre-commit` which will run pre-commit checks. -## Research reading list - -- [_Provenance for Computational Tasks: A Survey_ by Freire, et al. in CiSE 2008](https://sci.utah.edu/~csilva/papers/cise2008a.pdf) for an overview of provenance in general.
- -- [_Transparent Result Caching_ by Vahdat and Anderson in USENIX ATC 1998](https://www.usenix.org/legacy/publications/library/proceedings/usenix98/full_papers/vahdat/vahdat.pdf) for an early system-level provenance tracer in Solaris using the `/proc` fs. Linux's `/proc` fs doesn't have the same functionality. However, this paper discusses two interesting application of provenance: unmake (query lineage information) and transparent Make (incremental computation without explicit dependency declaration). - -- [_CDE: Using System Call Interposition to Automatically Create Portable Software Packages_ by Guo and Engler in USENIX ATC 2011](https://www.usenix.org/legacy/events/atc11/tech/final_files/GuoEngler.pdf) for an early system-level provenance tracer. Their only application is software execution replay, but replay is quite an important application. - -- [_Techniques for Preserving Scientific Software Executions: Preserve the Mess or Encourage Cleanliness?_ by Thain, Meng, and Ivie in 2015 ](https://curate.nd.edu/articles/journal_contribution/Techniques_for_Preserving_Scientific_Software_Executions_Preserve_the_Mess_or_Encourage_Cleanliness_/24824439?file=43664937) discusses whether enabling automatic-replay is actually a good idea. A cursory glance makes PROBE seem more like "preserving the mess", but I think, with some care in the design choices, it actually can be more like "encouraging cleanliness", for example, by having heuristics that help cull/simplify provenance and generating human readable/editable package-manager recipes. - -- [_SoK: History is a Vast Early Warning System: Auditing the Provenance of System Intrusions_ by Inam et al. in IEEE Symposium on Security and Privacy 2023](https://adambates.org/documents/Inam_Oakland23.pdf) see specifically Inam's survey of different possibilities for the "Capture layer", "Reduction layer", and "Infrastructure layer". Although provenance-for-security has different constraints than provenacne for other purposes, the taxonomy that Inam lays out is still useful. PROBE operates by intercepting libc calls, which is essentially a "middleware" in Table I (platform modification, no program modification, no config change, incomplete mediation, not tamperproof, inter-process tracing, etc.). - -- [_System-Level Provenance Tracers_ by me et al. in ACM REP 2023](./docs/acm-rep-pres.pdf) for a motivation of this work. It surveys prior work, identifies potential gaps, and explains why I think library interposition is a promising path for future research. - -- [_Computational Experiment Comprehension using Provenance Summarization_ by Bufford et al. in ACM REP 2023](https://dl.acm.org/doi/pdf/10.1145/3641525.3663617) discusses how to implement an interface for querying provenance information. They compare classical graph-based visualization with an interactive LLM in a user-study. - -## Prior art - -- [RR-debugger](https://github.com/rr-debugger/rr) which is much slower, but features more complete capturing, lets you replay but doesn't let you do any other analysis. - -- [Sciunits](https://github.com/depaul-dice/sciunit) which is much slower, more likely to crash, has less complete capturing, lets you replay but doesn't let you do other analysis. - -- [Reprozip](https://www.reprozip.org/) which is much slower and has less complete capturing. - -- [CARE](https://proot-me.github.io/care/) which is much slower, has less complete capturing, and lets you do containerized replay but not unpriveleged native replay and not other analysis. 
- -- [FSAtrace](https://github.com/jacereda/fsatrace) which is more likely to crash, has less complete capturing, and doesn't have replay or other analyses. +## Directory structure + +- `libprobe`: Library that implements interposition (C, Make, Python; a mix of hand-written and generated code). + - `libprobe/include`: Headers that will be used by the Rust wrapper to read PROBE data. + - `libprobe/src`: Main C sources of `libprobe`. + - `libprobe/generator`: Python and C-template code-generator. + - `libprobe/generated`: (Generated, not committed to Git) output of code-generation. + - `libprobe/Makefile`: Makefile that builds all of `libprobe`; invoke it with `just compile-lib`. +- `cli-wrapper`: (Cargo workspace) code that wraps libprobe. + - `cli-wrapper/cli`: (Cargo crate) main CLI. + - `cli-wrapper/lib`: (Cargo crate) supporting library functions. + - `cli-wrapper/macros`: (Cargo crate) supporting macros; they use structs from `libprobe/include` to create Rust structs and Python dataclasses. + - `cli-wrapper/frontend.nix`: Nix code that builds the Cargo workspace; gets included in `flake.nix`. +- `probe_py`: Python code that implements analysis of PROBE data (a mix of hand-written and generated code); should be added to `$PYTHONPATH` by `nix develop`. + - `probe_py/probe_py`: Main package to be imported or run. + - `probe_py/pyproject.toml`: Definition of main package and dependencies. + - `probe_py/tests`: Python unittests, i.e., `from probe_py import foobar; test_foobar()`; run with `just test-py`. + - `probe_py/mypy_stubs`: "Stub" files that tell Mypy how to check untyped library code. Should be added to `$MYPYPATH` by `nix develop`. +- `tests`: End-to-end opaque-box tests. They will be run with Pytest, but they will not test Python directly; they should always `subprocess.run(["probe", ...])`. Additionally, some tests have to be manually invoked. +- `docs`: Documentation and papers. +- `benchmark`: Programs and infrastructure for benchmarking. + - `benchmark/REPRODUCING.md`: Read this first! +- `flake.nix`: Nix code that defines packages and the devshell. +- `setup_devshell.sh`: Helps instantiate the Nix devshell. +- `Justfile`: "Shortcuts" for defining and running common commands (list them with `just --list`).
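Putting the Justfile and this layout together, the intended edit-compile-test loop looks roughly like the following. This is a sketch based on the targets defined in the new Justfile above (it assumes a working Nix installation with flakes enabled); the exact targets may change:

```bash
# Enter the devshell; this sources setup_devshell.sh, which puts probe_py on
# $PYTHONPATH and probe_py/mypy_stubs on $MYPYPATH.
nix develop

# See all available shortcuts.
just --list

# Build libprobe, the CLI wrapper, and the example test programs.
just compile

# Auto-fix Nix/Python/Rust style, type-check, and run the native test suite.
just fix check test-native

# Or run everything at once before opening a PR.
just pre-commit
```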
diff --git a/probe_src/benchmark_results.csv b/benchmark/PROBE_small_bench.csv similarity index 100% rename from probe_src/benchmark_results.csv rename to benchmark/PROBE_small_bench.csv diff --git a/probe_src/frontend/Cargo.lock b/cli-wrapper/Cargo.lock similarity index 99% rename from probe_src/frontend/Cargo.lock rename to cli-wrapper/Cargo.lock index 04d1a6d1..b6fc69b4 100644 --- a/probe_src/frontend/Cargo.lock +++ b/cli-wrapper/Cargo.lock @@ -773,7 +773,7 @@ dependencies = [ "flate2", "libc", "log", - "probe_frontend", + "probe_lib", "rand", "serde", "serde_json", @@ -781,7 +781,7 @@ dependencies = [ ] [[package]] -name = "probe_frontend" +name = "probe_lib" version = "0.2.0" dependencies = [ "bindgen", diff --git a/probe_src/frontend/Cargo.toml b/cli-wrapper/Cargo.toml similarity index 94% rename from probe_src/frontend/Cargo.toml rename to cli-wrapper/Cargo.toml index 5b25b713..2c9072fc 100644 --- a/probe_src/frontend/Cargo.toml +++ b/cli-wrapper/Cargo.toml @@ -1,8 +1,8 @@ [workspace] resolver = "2" -members = [ +members = [ "cli", - "lib", + "lib", "macros", ] diff --git a/probe_src/frontend/LICENSE b/cli-wrapper/LICENSE similarity index 100% rename from probe_src/frontend/LICENSE rename to cli-wrapper/LICENSE diff --git a/probe_src/frontend/README.md b/cli-wrapper/README.md similarity index 100% rename from probe_src/frontend/README.md rename to cli-wrapper/README.md diff --git a/probe_src/frontend/cli/Cargo.toml b/cli-wrapper/cli/Cargo.toml similarity index 92% rename from probe_src/frontend/cli/Cargo.toml rename to cli-wrapper/cli/Cargo.toml index 4c1ebdc8..968d2dfd 100644 --- a/probe_src/frontend/cli/Cargo.toml +++ b/cli-wrapper/cli/Cargo.toml @@ -19,7 +19,7 @@ exec = "0.3.1" flate2 = "1.0.30" libc = "0.2.155" log = "0.4.21" -probe_frontend = { path = "../lib" } +probe_lib = { path = "../lib" } rand = "0.8.5" serde = "1.0.203" serde_json = "1.0.118" diff --git a/probe_src/frontend/cli/src/dump.rs b/cli-wrapper/cli/src/dump.rs similarity index 99% rename from probe_src/frontend/cli/src/dump.rs rename to cli-wrapper/cli/src/dump.rs index 92ae7861..d175df30 100644 --- a/probe_src/frontend/cli/src/dump.rs +++ b/cli-wrapper/cli/src/dump.rs @@ -7,7 +7,7 @@ use std::{ use chrono::{DateTime, SecondsFormat}; use color_eyre::eyre::{eyre, Result, WrapErr}; -use probe_frontend::ops; +use probe_lib::ops; use serde::{Deserialize, Serialize}; /// Print the ops from a probe log out for humans. diff --git a/probe_src/frontend/cli/src/main.rs b/cli-wrapper/cli/src/main.rs similarity index 98% rename from probe_src/frontend/cli/src/main.rs rename to cli-wrapper/cli/src/main.rs index 6be6be38..4931f626 100644 --- a/probe_src/frontend/cli/src/main.rs +++ b/cli-wrapper/cli/src/main.rs @@ -10,7 +10,7 @@ mod dump; /// Run commands under provenance and generate probe record directory. mod record; -/// Wrapper over [`probe_frontend::transcribe`]. +/// Wrapper over [`probe_lib::transcribe`]. mod transcribe; /// Utility code for creating temporary directories. 
@@ -163,7 +163,7 @@ fn main() -> Result<()> { let exit = std::process::Command::new("python3") .arg("-m") - .arg("probe_py.manual.cli") + .arg("probe_py.cli") .arg(subcommand) .args(&args) .spawn() diff --git a/probe_src/frontend/cli/src/record.rs b/cli-wrapper/cli/src/record.rs similarity index 100% rename from probe_src/frontend/cli/src/record.rs rename to cli-wrapper/cli/src/record.rs diff --git a/probe_src/frontend/cli/src/transcribe.rs b/cli-wrapper/cli/src/transcribe.rs similarity index 88% rename from probe_src/frontend/cli/src/transcribe.rs rename to cli-wrapper/cli/src/transcribe.rs index 799df9b1..d3364060 100644 --- a/probe_src/frontend/cli/src/transcribe.rs +++ b/cli-wrapper/cli/src/transcribe.rs @@ -10,7 +10,7 @@ pub fn transcribe, T: Write>( ) -> Result<()> { let log_dir = Dir::temp(true).wrap_err("Failed to create temp directory for transcription")?; - probe_frontend::transcribe::parse_top_level(record_dir, &log_dir) + probe_lib::transcribe::parse_top_level(record_dir, &log_dir) .wrap_err("Failed to transcribe record directory")?; tar.append_dir_all(".", &log_dir) diff --git a/probe_src/frontend/cli/src/util.rs b/cli-wrapper/cli/src/util.rs similarity index 100% rename from probe_src/frontend/cli/src/util.rs rename to cli-wrapper/cli/src/util.rs diff --git a/probe_src/frontend/deny.toml b/cli-wrapper/deny.toml similarity index 100% rename from probe_src/frontend/deny.toml rename to cli-wrapper/deny.toml diff --git a/cli-wrapper/frontend.nix b/cli-wrapper/frontend.nix new file mode 100644 index 00000000..8335eaa1 --- /dev/null +++ b/cli-wrapper/frontend.nix @@ -0,0 +1,136 @@ +{ + pkgs, + craneLib, + rust-target, + advisory-db, + system, + python, + lib, +}: rec { + # See https://crane.dev/examples/quick-start-workspace.html + + src = craneLib.cleanCargoSource ./.; + + # Common arguments can be set here to avoid repeating them later + commonArgs = { + inherit src; + strictDeps = true; + + # all the crates in this workspace either use rust-bindgen or depend + # on a local crate that does. + nativeBuildInputs = [ + pkgs.rustPlatform.bindgenHook + ]; + + CARGO_BUILD_TARGET = rust-target; + CARGO_BUILD_RUSTFLAGS = "-C target-feature=+crt-static"; + CPATH = ../libprobe/include; + + # pygen needs to know where to write the python file + preConfigurePhases = [ + "pygenConfigPhase" + ]; + pygenConfigPhase = '' + export PYGEN_OUTFILE="$out/resources/ops.py" + mkdir --parents "$(dirname "$PYGEN_OUTFILE")" + echo "Sending python code to $PYGEN_OUTFILE" + ''; + }; + + # Build *just* the cargo dependencies (of the entire workspace), + # so we can reuse all of that work (e.g. via cachix) when running in CI + # It is *highly* recommended to use something like cargo-hakari to avoid + # cache misses when building individual top-level-crates + cargoArtifacts = craneLib.buildDepsOnly commonArgs; + + individualCrateArgs = + commonArgs + // { + inherit cargoArtifacts; + inherit (craneLib.crateNameFromCargoToml {inherit src;}) version; + # disable tests since we'll run them all via cargo-nextest + doCheck = false; + }; + + fileSetForCrate = crates: + lib.fileset.toSource { + root = ./.; + fileset = lib.fileset.unions ([ + ./Cargo.toml + ./Cargo.lock + ] + ++ (builtins.map craneLib.fileset.commonCargoSources crates)); + }; + + packages = rec { + inherit cargoArtifacts; + + # Prior to this version, the old code had one derivation per crate (probe-cli, probe-lib, and probe-macros). + # What could go wrong? + # Since the old version used `src = ./.`, it would rebuild all three if any one changed.
+ + # craneLib's workspace example [1] says to use `src = fileSetForCrate ./path/to/crate`. + # However, when I tried doing that, it would say "failed to load manifest for workspace member lib" because "failed to read macros/Cargo.toml". + # Because `lib/Cargo.toml` has a dependency on `{path = "../macros"}`, + # I think the source code of both crates has to be present at build-time of lib. + # Which means no source filtering is possible. + # Indeed the exposed packages in craneLib's example (my-cli and my-server) [1] do not depend on each other. + # They depend on my-common, which is *not* filtered out (*is* included) in the `src` for those crates. + # If it's possible to simultaneously: + # - expose two Cargo crates A and B + # - where A depends on B + # - when A changes only A needs to be rebuilt + # then I don't know how to do it. + # Therefore, I will only offer one crate as a Nix package. + # + # [1]: https://crane.dev/examples/quick-start-workspace.html + + probe-cli = craneLib.buildPackage (individualCrateArgs + // { + pname = "probe-cli"; + cargoExtraArgs = "-p probe_cli"; + src = fileSetForCrate [ + ./cli + ./lib + ./macros + ]; + }); + }; + checks = { + probe-workspace-clippy = craneLib.cargoClippy (commonArgs + // { + inherit (packages) cargoArtifacts; + cargoClippyExtraArgs = "--all-targets -- --deny warnings"; + }); + + probe-workspace-doc = craneLib.cargoDoc (commonArgs + // { + inherit (packages) cargoArtifacts; + }); + + # Check formatting + probe-workspace-fmt = craneLib.cargoFmt { + inherit src; + }; + + # Audit dependencies + probe-workspace-audit = craneLib.cargoAudit { + inherit src advisory-db; + }; + + # Audit licenses + probe-workspace-deny = craneLib.cargoDeny { + inherit src; + }; + + # Run tests with cargo-nextest + # this is why `doCheck = false` on the crate derivations, so as to not + # run the tests twice.
probe-workspace-nextest = craneLib.cargoNextest (commonArgs + // { + inherit (packages) cargoArtifacts; + partitions = 1; + partitionType = "count"; + }); + }; +} diff --git a/probe_src/frontend/lib/Cargo.toml b/cli-wrapper/lib/Cargo.toml similarity index 92% rename from probe_src/frontend/lib/Cargo.toml rename to cli-wrapper/lib/Cargo.toml index 90b871e2..692cf5d9 100644 --- a/probe_src/frontend/lib/Cargo.toml +++ b/cli-wrapper/lib/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "probe_frontend" +name = "probe_lib" version.workspace = true license.workspace = true authors.workspace = true @@ -10,7 +10,7 @@ edition.workspace = true # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [lib] -name = "probe_frontend" +name = "probe_lib" path = "src/lib.rs" [dependencies] diff --git a/probe_src/frontend/lib/build.rs b/cli-wrapper/lib/build.rs similarity index 100% rename from probe_src/frontend/lib/build.rs rename to cli-wrapper/lib/build.rs diff --git a/probe_src/frontend/lib/src/error.rs b/cli-wrapper/lib/src/error.rs similarity index 100% rename from probe_src/frontend/lib/src/error.rs rename to cli-wrapper/lib/src/error.rs diff --git a/probe_src/frontend/lib/src/lib.rs b/cli-wrapper/lib/src/lib.rs similarity index 100% rename from probe_src/frontend/lib/src/lib.rs rename to cli-wrapper/lib/src/lib.rs diff --git a/probe_src/frontend/lib/src/metadata.rs b/cli-wrapper/lib/src/metadata.rs similarity index 100% rename from probe_src/frontend/lib/src/metadata.rs rename to cli-wrapper/lib/src/metadata.rs diff --git a/probe_src/frontend/lib/src/ops.rs b/cli-wrapper/lib/src/ops.rs similarity index 100% rename from probe_src/frontend/lib/src/ops.rs rename to cli-wrapper/lib/src/ops.rs diff --git a/probe_src/frontend/lib/src/transcribe.rs b/cli-wrapper/lib/src/transcribe.rs similarity index 100% rename from probe_src/frontend/lib/src/transcribe.rs rename to cli-wrapper/lib/src/transcribe.rs diff --git a/probe_src/frontend/macros/Cargo.toml b/cli-wrapper/macros/Cargo.toml similarity index 100% rename from probe_src/frontend/macros/Cargo.toml rename to cli-wrapper/macros/Cargo.toml diff --git a/probe_src/frontend/macros/src/lib.rs b/cli-wrapper/macros/src/lib.rs similarity index 100% rename from probe_src/frontend/macros/src/lib.rs rename to cli-wrapper/macros/src/lib.rs diff --git a/probe_src/frontend/macros/src/pygen.rs b/cli-wrapper/macros/src/pygen.rs similarity index 100% rename from probe_src/frontend/macros/src/pygen.rs rename to cli-wrapper/macros/src/pygen.rs diff --git a/probe_src/frontend/python/pyproject.toml b/cli-wrapper/pyproject.toml similarity index 100% rename from probe_src/frontend/python/pyproject.toml rename to cli-wrapper/pyproject.toml diff --git a/probe_src/README.md b/docs/developers_readme.md similarity index 97% rename from probe_src/README.md rename to docs/developers_readme.md index fd0c9432..44913ee5 100644 --- a/probe_src/README.md +++ b/docs/developers_readme.md @@ -77,3 +77,9 @@ I don't like replaying libcalls by intercepting and copy/pasting contents of the The entire close of `temp_file` in PID 101 precedes waitpid 101 in fork-join order, waitpid 101 precedes fork 102 in program order, and fork 102 precedes the open of `temp_file` in PID 102, so we can conclude that information may flow from `foo` to `bar`. From this, we can deduce the provenance graph `input_file` -> `foo` -> `temp_file` -> `bar` -> `output_file` with only the open/close intervals and happens-before order.
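That deduction is mechanical enough to sketch in code. Below is a minimal illustration using networkx (which probe_py already depends on); the event tuples and edge labels are hypothetical stand-ins for this example, not the real op types from the transcribed log:

```python
import networkx as nx

# Hypothetical happens-before edges from the scenario above,
# labeled by the ordering rule that justifies each one.
hb = nx.DiGraph()
hb.add_edge(("close", 101, "temp_file"), ("waitpid", 100, 101), rule="fork-join order")
hb.add_edge(("waitpid", 100, 101), ("fork", 100, 102), rule="program order")
hb.add_edge(("fork", 100, 102), ("open", 102, "temp_file"), rule="fork-join order")

# The close (write) of temp_file happens-before its open (read),
# so information may flow from foo to bar.
assert nx.has_path(hb, ("close", 101, "temp_file"), ("open", 102, "temp_file"))

# Which yields the provenance chain described above.
dataflow = nx.DiGraph()
dataflow.add_edges_from([
    ("input_file", "foo"), ("foo", "temp_file"),
    ("temp_file", "bar"), ("bar", "output_file"),
])
```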
+ +# Python package + +probe_py is a package that implements PROBE's experimental, non-core CLI functions and its Python library functionality. + +Required reading: diff --git a/docs/new_paper_todos.md b/docs/new_paper_todos.md index 7f947de3..889158b2 100644 --- a/docs/new_paper_todos.md +++ b/docs/new_paper_todos.md @@ -1,8 +1,15 @@ **Tasks before resubmitting "performance of provenance" paper**: 1. Refactor technical debt in PROBE. Already have a branch with refactoring complete, but need to sync it up, merge it, and rebase current branches onto it -2. Implement more benchmarks. -3. Create statistics described here: https://github.com/users/charmoniumQ/projects/1/views/1?pane=issue&itemId=94217621 +2. Fix broken benchmarks and provenance tracers +3. Look into benchmark non-determinism: faketime, aslr, taskset, machine id, file cache rerun, clear cache. +4. Implement more benchmarks. +5. Create statistics described here: https://github.com/users/charmoniumQ/projects/1/views/1?pane=issue&itemId=94217621 +6. Note "actively maintained" +7. Run SSH remotely +8. Review: + - https://gernot-heiser.org/benchmarking-crimes.html + - https://www2.sigsoft.org/EmpiricalStandards/docs/standards?standard=Benchmarking **Tasks before writing "Measuring the level of determinism in common record/replay tools" paper**: @@ -11,7 +18,7 @@ 1. In the previous benchmark suite, how many workloads are bit-wise reproducible. 2. For each non-deterministic input, manipulate the input and see how many previously bit-wise reproducible workloads are still bit-wise reproducible. This determines its "probability of this non-det input impacting output." 3. Create a "determinism completeness table" showing whether each tool (columns) records-and-replays, stabilizes, or detects each particular non-deterministic input (rows). Validate this with an application (could be synthetic) whose output is deterministic for the "yesses" and non-deterministic for the "noes". At the end of each column, what proportion of benchmarks only depended on the "yesses" for that column? - - Recording-and-replaying an input := record value is unchanged from native execution, replay value uses the recorded value. + - Recording-and-replaying an input := record value is unchanged from native execution, replay value uses the recorded value. - Stabilizing an input := changing the record and replay to have the same value. - Detecting an input := detecting when the program accesses a particular input (neither stabilizing nor recording-and-replaying it). 4. Create a "portability table" whose rows are pairs and columns are whether record/replay X allows it, allows it reproducibly, or disallows it. @@ -24,9 +31,10 @@ 1. Debug OCI image recording of Python 2. Debug multiple extraneous versions appearing in the dataflow graph -3. Compute and store DFG at transcription time. Currently this is a separate command because transcription is (and must be for compatibility) implemented in Rust while DFG is implemented in Python. -4. DFG should be based on persistent provenance -5. Develop heuristics for converting provenance graph to a run script. +3. Include symlinks properly in dataflow graph. +4. Compute and store DFG at transcription time. Currently this is a separate command because transcription is (and must be for compatibility) implemented in Rust while DFG is implemented in Python. +5. DFG should be based on persistent provenance +6. Develop heuristics for converting provenance graph to a run script.
**Tasks for user-study**: diff --git a/probe_src/notes.md b/docs/notes.md similarity index 100% rename from probe_src/notes.md rename to docs/notes.md diff --git a/tasks.org b/docs/old_tasks.org similarity index 100% rename from tasks.org rename to docs/old_tasks.org diff --git a/docs/.gitignore b/docs/publications/.gitignore similarity index 100% rename from docs/.gitignore rename to docs/publications/.gitignore diff --git a/docs/acm-rep-pres.html b/docs/publications/acm-rep-pres.html similarity index 100% rename from docs/acm-rep-pres.html rename to docs/publications/acm-rep-pres.html diff --git a/docs/acm-rep-pres.pdf b/docs/publications/acm-rep-pres.pdf similarity index 100% rename from docs/acm-rep-pres.pdf rename to docs/publications/acm-rep-pres.pdf diff --git a/docs/acm-template.tex b/docs/publications/acm-template.tex similarity index 100% rename from docs/acm-template.tex rename to docs/publications/acm-template.tex diff --git a/docs/benchmark_suite/README.md b/docs/publications/benchmark_suite/README.md similarity index 100% rename from docs/benchmark_suite/README.md rename to docs/publications/benchmark_suite/README.md diff --git a/docs/benchmark_suite/README.pdf b/docs/publications/benchmark_suite/README.pdf similarity index 100% rename from docs/benchmark_suite/README.pdf rename to docs/publications/benchmark_suite/README.pdf diff --git a/docs/benchmark_suite/app-lvl-prov.dot b/docs/publications/benchmark_suite/app-lvl-prov.dot similarity index 100% rename from docs/benchmark_suite/app-lvl-prov.dot rename to docs/publications/benchmark_suite/app-lvl-prov.dot diff --git a/docs/benchmark_suite/generated/clustering2.svg b/docs/publications/benchmark_suite/generated/clustering2.svg similarity index 100% rename from docs/benchmark_suite/generated/clustering2.svg rename to docs/publications/benchmark_suite/generated/clustering2.svg diff --git a/docs/benchmark_suite/generated/dendrogram.pdf b/docs/publications/benchmark_suite/generated/dendrogram.pdf similarity index 100% rename from docs/benchmark_suite/generated/dendrogram.pdf rename to docs/publications/benchmark_suite/generated/dendrogram.pdf diff --git a/docs/benchmark_suite/generated/dendrogram.svg b/docs/publications/benchmark_suite/generated/dendrogram.svg similarity index 100% rename from docs/benchmark_suite/generated/dendrogram.svg rename to docs/publications/benchmark_suite/generated/dendrogram.svg diff --git a/docs/benchmark_suite/generated/dendrogram_full.pdf b/docs/publications/benchmark_suite/generated/dendrogram_full.pdf similarity index 100% rename from docs/benchmark_suite/generated/dendrogram_full.pdf rename to docs/publications/benchmark_suite/generated/dendrogram_full.pdf diff --git a/docs/benchmark_suite/generated/log_overhead_hist.svg b/docs/publications/benchmark_suite/generated/log_overhead_hist.svg similarity index 100% rename from docs/benchmark_suite/generated/log_overhead_hist.svg rename to docs/publications/benchmark_suite/generated/log_overhead_hist.svg diff --git a/docs/benchmark_suite/generated/pca0.pdf b/docs/publications/benchmark_suite/generated/pca0.pdf similarity index 100% rename from docs/benchmark_suite/generated/pca0.pdf rename to docs/publications/benchmark_suite/generated/pca0.pdf diff --git a/docs/benchmark_suite/generated/pca1.pdf b/docs/publications/benchmark_suite/generated/pca1.pdf similarity index 100% rename from docs/benchmark_suite/generated/pca1.pdf rename to docs/publications/benchmark_suite/generated/pca1.pdf diff --git 
a/docs/benchmark_suite/generated/predictive-performance.pdf b/docs/publications/benchmark_suite/generated/predictive-performance.pdf similarity index 100% rename from docs/benchmark_suite/generated/predictive-performance.pdf rename to docs/publications/benchmark_suite/generated/predictive-performance.pdf diff --git a/docs/benchmark_suite/generated/subsetting-accuracy.pdf b/docs/publications/benchmark_suite/generated/subsetting-accuracy.pdf similarity index 100% rename from docs/benchmark_suite/generated/subsetting-accuracy.pdf rename to docs/publications/benchmark_suite/generated/subsetting-accuracy.pdf diff --git a/docs/benchmark_suite/generated/subsetting-dist.pdf b/docs/publications/benchmark_suite/generated/subsetting-dist.pdf similarity index 100% rename from docs/benchmark_suite/generated/subsetting-dist.pdf rename to docs/publications/benchmark_suite/generated/subsetting-dist.pdf diff --git a/docs/benchmark_suite/generated/subsetting.pdf b/docs/publications/benchmark_suite/generated/subsetting.pdf similarity index 100% rename from docs/benchmark_suite/generated/subsetting.pdf rename to docs/publications/benchmark_suite/generated/subsetting.pdf diff --git a/docs/benchmark_suite/prov-example.dot b/docs/publications/benchmark_suite/prov-example.dot similarity index 100% rename from docs/benchmark_suite/prov-example.dot rename to docs/publications/benchmark_suite/prov-example.dot diff --git a/docs/benchmark_suite/prov-example.png b/docs/publications/benchmark_suite/prov-example.png similarity index 100% rename from docs/benchmark_suite/prov-example.png rename to docs/publications/benchmark_suite/prov-example.png diff --git a/docs/benchmark_suite/simple-prov-example.dot b/docs/publications/benchmark_suite/simple-prov-example.dot similarity index 100% rename from docs/benchmark_suite/simple-prov-example.dot rename to docs/publications/benchmark_suite/simple-prov-example.dot diff --git a/docs/benchmark_suite/simple-prov-example.png b/docs/publications/benchmark_suite/simple-prov-example.png similarity index 100% rename from docs/benchmark_suite/simple-prov-example.png rename to docs/publications/benchmark_suite/simple-prov-example.png diff --git a/docs/benchmark_suite/submitted.pdf b/docs/publications/benchmark_suite/submitted.pdf similarity index 100% rename from docs/benchmark_suite/submitted.pdf rename to docs/publications/benchmark_suite/submitted.pdf diff --git a/docs/benchmark_suite/sys-lvl-log.svg b/docs/publications/benchmark_suite/sys-lvl-log.svg similarity index 100% rename from docs/benchmark_suite/sys-lvl-log.svg rename to docs/publications/benchmark_suite/sys-lvl-log.svg diff --git a/docs/benchmark_suite/sys-lvl-prov.dot b/docs/publications/benchmark_suite/sys-lvl-prov.dot similarity index 100% rename from docs/benchmark_suite/sys-lvl-prov.dot rename to docs/publications/benchmark_suite/sys-lvl-prov.dot diff --git a/docs/benchmark_suite/tech_report.md b/docs/publications/benchmark_suite/tech_report.md similarity index 100% rename from docs/benchmark_suite/tech_report.md rename to docs/publications/benchmark_suite/tech_report.md diff --git a/docs/benchmark_suite/wf-lvl-prov.dot b/docs/publications/benchmark_suite/wf-lvl-prov.dot similarity index 100% rename from docs/benchmark_suite/wf-lvl-prov.dot rename to docs/publications/benchmark_suite/wf-lvl-prov.dot diff --git a/docs/citations-to-latex.lua b/docs/publications/citations-to-latex.lua similarity index 100% rename from docs/citations-to-latex.lua rename to docs/publications/citations-to-latex.lua diff --git 
a/docs/dataflow-graph.png b/docs/publications/dataflow-graph.png similarity index 100% rename from docs/dataflow-graph.png rename to docs/publications/dataflow-graph.png diff --git a/docs/dataflow-graph.svg b/docs/publications/dataflow-graph.svg similarity index 100% rename from docs/dataflow-graph.svg rename to docs/publications/dataflow-graph.svg diff --git a/docs/flake.lock b/docs/publications/flake.lock similarity index 100% rename from docs/flake.lock rename to docs/publications/flake.lock diff --git a/docs/flake.nix b/docs/publications/flake.nix similarity index 100% rename from docs/flake.nix rename to docs/publications/flake.nix diff --git a/docs/illinois.png b/docs/publications/illinois.png similarity index 100% rename from docs/illinois.png rename to docs/publications/illinois.png diff --git a/docs/lit_review/application-level.dot b/docs/publications/lit_review/application-level.dot similarity index 100% rename from docs/lit_review/application-level.dot rename to docs/publications/lit_review/application-level.dot diff --git a/docs/lit_review/datasets.org b/docs/publications/lit_review/datasets.org similarity index 100% rename from docs/lit_review/datasets.org rename to docs/publications/lit_review/datasets.org diff --git a/docs/lit_review/flake.lock b/docs/publications/lit_review/flake.lock similarity index 100% rename from docs/lit_review/flake.lock rename to docs/publications/lit_review/flake.lock diff --git a/docs/lit_review/flake.nix b/docs/publications/lit_review/flake.nix similarity index 100% rename from docs/lit_review/flake.nix rename to docs/publications/lit_review/flake.nix diff --git a/docs/lit_review/index.pdf b/docs/publications/lit_review/index.pdf similarity index 100% rename from docs/lit_review/index.pdf rename to docs/publications/lit_review/index.pdf diff --git a/docs/lit_review/index.tex b/docs/publications/lit_review/index.tex similarity index 100% rename from docs/lit_review/index.tex rename to docs/publications/lit_review/index.tex diff --git a/docs/lit_review/lit_review.md b/docs/publications/lit_review/lit_review.md similarity index 100% rename from docs/lit_review/lit_review.md rename to docs/publications/lit_review/lit_review.md diff --git a/docs/lit_review/lit_review.turtle b/docs/publications/lit_review/lit_review.turtle similarity index 100% rename from docs/lit_review/lit_review.turtle rename to docs/publications/lit_review/lit_review.turtle diff --git a/docs/lit_review/meat.tex b/docs/publications/lit_review/meat.tex similarity index 100% rename from docs/lit_review/meat.tex rename to docs/publications/lit_review/meat.tex diff --git a/docs/lit_review/search.csv b/docs/publications/lit_review/search.csv similarity index 100% rename from docs/lit_review/search.csv rename to docs/publications/lit_review/search.csv diff --git a/docs/lit_review/system-level.dot b/docs/publications/lit_review/system-level.dot similarity index 100% rename from docs/lit_review/system-level.dot rename to docs/publications/lit_review/system-level.dot diff --git a/docs/lit_review/workflow-level.dot b/docs/publications/lit_review/workflow-level.dot similarity index 100% rename from docs/lit_review/workflow-level.dot rename to docs/publications/lit_review/workflow-level.dot diff --git a/docs/low_provenance_overhead/main.md b/docs/publications/low_provenance_overhead/main.md similarity index 100% rename from docs/low_provenance_overhead/main.md rename to docs/publications/low_provenance_overhead/main.md diff --git a/docs/poster/main.svg b/docs/publications/poster/main.svg 
similarity index 100% rename from docs/poster/main.svg rename to docs/publications/poster/main.svg diff --git a/docs/poster/poster.html b/docs/publications/poster/poster.html similarity index 100% rename from docs/poster/poster.html rename to docs/publications/poster/poster.html diff --git a/docs/poster/poster.pdf b/docs/publications/poster/poster.pdf similarity index 100% rename from docs/poster/poster.pdf rename to docs/publications/poster/poster.pdf diff --git a/docs/poster/poster.scrbl b/docs/publications/poster/poster.scrbl similarity index 100% rename from docs/poster/poster.scrbl rename to docs/publications/poster/poster.scrbl diff --git a/docs/probe-qr.svg b/docs/publications/probe-qr.svg similarity index 100% rename from docs/probe-qr.svg rename to docs/publications/probe-qr.svg diff --git a/docs/prov-example.svg b/docs/publications/prov-example.svg similarity index 100% rename from docs/prov-example.svg rename to docs/publications/prov-example.svg diff --git a/docs/prov_pres/main.md b/docs/publications/prov_pres/main.md similarity index 100% rename from docs/prov_pres/main.md rename to docs/publications/prov_pres/main.md diff --git a/docs/prov_pres/prov_example.dot b/docs/publications/prov_pres/prov_example.dot similarity index 100% rename from docs/prov_pres/prov_example.dot rename to docs/publications/prov_pres/prov_example.dot diff --git a/docs/prov_pres/prov_example.svg b/docs/publications/prov_pres/prov_example.svg similarity index 100% rename from docs/prov_pres/prov_example.svg rename to docs/publications/prov_pres/prov_example.svg diff --git a/docs/record_replay/.gitignore b/docs/publications/record_replay/.gitignore similarity index 100% rename from docs/record_replay/.gitignore rename to docs/publications/record_replay/.gitignore diff --git a/docs/record_replay/main.md b/docs/publications/record_replay/main.md similarity index 100% rename from docs/record_replay/main.md rename to docs/publications/record_replay/main.md diff --git a/docs/record_replay/main.pdf b/docs/publications/record_replay/main.pdf similarity index 100% rename from docs/record_replay/main.pdf rename to docs/publications/record_replay/main.pdf diff --git a/docs/record_replay/old-main.md b/docs/publications/record_replay/old-main.md similarity index 100% rename from docs/record_replay/old-main.md rename to docs/publications/record_replay/old-main.md diff --git a/docs/record_replay/repro_comparison.ods b/docs/publications/record_replay/repro_comparison.ods similarity index 100% rename from docs/record_replay/repro_comparison.ods rename to docs/publications/record_replay/repro_comparison.ods diff --git a/docs/record_replay/zotero.bib b/docs/publications/record_replay/zotero.bib similarity index 100% rename from docs/record_replay/zotero.bib rename to docs/publications/record_replay/zotero.bib diff --git a/docs/reed.bib b/docs/publications/reed.bib similarity index 100% rename from docs/reed.bib rename to docs/publications/reed.bib diff --git a/docs/sandia.svg b/docs/publications/sandia.svg similarity index 100% rename from docs/sandia.svg rename to docs/publications/sandia.svg diff --git a/docs/script.sh b/docs/publications/script.sh similarity index 100% rename from docs/script.sh rename to docs/publications/script.sh diff --git a/docs/supplemental.bib b/docs/publications/supplemental.bib similarity index 100% rename from docs/supplemental.bib rename to docs/publications/supplemental.bib diff --git a/docs/us-rse.html b/docs/publications/us-rse.html similarity index 100% rename from docs/us-rse.html 
rename to docs/publications/us-rse.html diff --git a/docs/us-rse.pdf b/docs/publications/us-rse.pdf similarity index 100% rename from docs/us-rse.pdf rename to docs/publications/us-rse.pdf diff --git a/docs/why_prov/main.md b/docs/publications/why_prov/main.md similarity index 100% rename from docs/why_prov/main.md rename to docs/publications/why_prov/main.md diff --git a/docs/zotero.bib b/docs/publications/zotero.bib similarity index 100% rename from docs/zotero.bib rename to docs/publications/zotero.bib diff --git a/docs/publications_index.md b/docs/publications_index.md new file mode 100644 index 00000000..6e89e118 --- /dev/null +++ b/docs/publications_index.md @@ -0,0 +1,15 @@ +- Evaluating system-level provenance tools for practical use by Grayson, Milewicz + - 🔓 [fulltext](./publications/lit_review/index.pdf) + +- Trick or Research + - 🔓 [presentation](./publications/prov_pres/main.md) + +- How to collect computational provenance by Grayson, Milewicz, Katz, Marinov + - 🔓 [poster](./publications/poster/poster.pdf) + +- A benchmark suite and performance analysis of user-space provenance collectors by Grayson, Aguilar, Milewicz, Katz, Darko @ ACM REP '24 [10.1145/3641525.3663627](https://doi.org/10.1145/3641525.3663627) + - 🔓 [fulltext](./publications/benchmark_suite/README.pdf) + - 🔓 [slides](./publications/acm-rep-pres.pdf) + +- PROBE4RSE: Provenance Replay/Observation Engine for Research Software Engineers by Grayson, Milewicz, Katz, Marinov @ US-RSE '24 [10.1145/3641525.3663627](https://doi.org/10.1145/3641525.3663627) + - 🔓 [slides](./publications/us-rse.pdf) diff --git a/docs/research_reading_list.md b/docs/research_reading_list.md new file mode 100644 index 00000000..76fd74c1 --- /dev/null +++ b/docs/research_reading_list.md @@ -0,0 +1,27 @@ +## Research reading list + +- [_Provenance for Computational Tasks: A Survey_ by Freire et al. in CiSE '08](https://sci.utah.edu/~csilva/papers/cise2008a.pdf) for an overview of provenance in general. + +- [_Transparent Result Caching_ by Vahdat and Anderson @ USENIX ATC '98](https://www.usenix.org/legacy/publications/library/proceedings/usenix98/full_papers/vahdat/vahdat.pdf) for an early system-level provenance tracer in Solaris using the `/proc` fs. Linux's `/proc` fs doesn't have the same functionality. However, this paper discusses two interesting applications of provenance: unmake (query lineage information) and transparent Make (more generally, incremental computation). + +- [_CDE: Using System Call Interposition to Automatically Create Portable Software Packages_ by Guo and Engler @ USENIX ATC '11](https://www.usenix.org/legacy/events/atc11/tech/final_files/GuoEngler.pdf) for an early system-level provenance tracer. Their only application is software execution replay, but replay is quite an important application. + +- [_Techniques for Preserving Scientific Software Executions: Preserve the Mess or Encourage Cleanliness?_ by Thain, Meng, and Ivie @ iPRES 2015](https://curate.nd.edu/articles/journal_contribution/Techniques_for_Preserving_Scientific_Software_Executions_Preserve_the_Mess_or_Encourage_Cleanliness_/24824439?file=43664937) discusses whether enabling automatic-replay is actually a good idea. A cursory glance makes PROBE seem more like "preserving the mess", but I think, with some care in the design choices, it actually can be more like "encouraging cleanliness", for example, by having heuristics that help cull/simplify provenance and generating human readable/editable package-manager recipes.
+ +- [_SoK: History is a Vast Early Warning System: Auditing the Provenance of System Intrusions_ by Inam et al. @ IEEE S&P '23](https://adambates.org/documents/Inam_Oakland23.pdf); see specifically Inam's survey of different possibilities for the "Capture layer", "Reduction layer", and "Infrastructure layer". Although provenance-for-security has different constraints than provenance for other purposes, the taxonomy that Inam lays out is still useful. PROBE operates by intercepting libc calls, which is essentially a "middleware" in Table I (platform modification, no program modification, no config change, incomplete mediation, not tamperproof, inter-process tracing, etc.). + +- [_System-Level Provenance Tracers_ by me et al. @ ACM REP 2023](./publications/acm-rep-pres.pdf) for a motivation of this work. It surveys prior work, identifies potential gaps, and explains why I think library interposition is a promising path for future research. + +- [_Computational Experiment Comprehension using Provenance Summarization_ by Bufford et al. @ ACM REP 2023](https://dl.acm.org/doi/pdf/10.1145/3641525.3663617) discusses how to implement an interface for querying provenance information. They compare classical graph-based visualization with an interactive LLM in a user-study. + +## Prior art + +- [RR-debugger](https://github.com/rr-debugger/rr) which is much slower, but features more complete capturing, lets you replay but doesn't let you do any other analysis. + +- [Sciunits](https://github.com/depaul-dice/sciunit) which is much slower, more likely to crash, has less complete capturing, lets you replay but doesn't let you do other analysis. + +- [Reprozip](https://www.reprozip.org/) which is much slower and has less complete capturing. + +- [CARE](https://proot-me.github.io/care/) which is much slower, has less complete capturing, and lets you do containerized replay but not unprivileged native replay and not other analysis. + +- [FSAtrace](https://github.com/jacereda/fsatrace) which is more likely to crash, has less complete capturing, and doesn't have replay or other analyses. diff --git a/probe_src/tasks.md b/docs/tasks.md similarity index 84% rename from probe_src/tasks.md rename to docs/tasks.md index 4ed00ec3..31b4a383 100644 --- a/probe_src/tasks.md +++ b/docs/tasks.md @@ -26,6 +26,20 @@ Core functionality: - Provenance graph should get stored in user-wide directory. - It should be SQLite. +- [ ] We should record rusage of each process. + - Include: + - Time of start + - Time of stop + - Compute time + - IO + - MaxRSS + - [ ] Render that information somewhere? Maybe the generated Makefile or workflow should print a wall-time estimate, based on the planned computational steps. + +- [ ] Discuss Windows and MacOS implementation?? + - https://en.wikipedia.org/wiki/DLL_injection#Approaches_on_Microsoft_Windows + - MacOS: `DYLD_INSERT_LIBRARIES="./test.dylib" DYLD_FORCE_FLAT_NAMESPACE=1 prog` + - [Detours: Binary interception of Win32 functions](https://www.usenix.org/legacy/publications/library/proceedings/usenix-nt99/full_papers/hunt/hunt.pdf) + Core tests: - [x] Write end-to-end-tests. End-to-end test should verify properties of the NetworkX graph returned by `provlog_to_digraph`. - [x] Check generic properties. Shofiya and Sam finished this. @@ -59,9 +73,9 @@ Core tests: - [x] Write a CI script that uses Nix to install dependencies and run the Justfiles. - [x] Check (not format) code in Alejandra and Black. - [x] Figure out why tests don't work. - - [ ] Run tests in an Ubuntu Docker container.
- - [ ] Run tests in a really old Ubuntu Docker container. - - [ ] Figure out how to intelligently combine Nix checks, Just checks, and GitHub CI checks, so we aren't duplicating checks. + - [x] Run tests in an Ubuntu Docker container. + - [x] Run tests in a really old Ubuntu Docker container. + - [x] Figure out how to intelligently combine Nix checks, Just checks, and GitHub CI checks, so we aren't duplicating checks. - [ ] Clang-analyzer - [x] Write microbenchmarking - [x] Run performance test-cases in two steps: one with just libprobe record and one with just transcription. (3 new CLI entrypoints, described in comments in CLI.py) @@ -69,13 +83,11 @@ Core tests: Downstream applications: - [ ] Should export the PROBE log to the following formats: - - [ ] [OCI image](https://opencontainers.org/) (runnable with Docker) + - [x] [OCI image](https://opencontainers.org/) (runnable with Docker) - [ ] Test that executing this image produces the same stdout, stderr, and files for the tests we already have. - - [ ] Tar-ball intended for chroot - - [ ] Directory - [ ] VM image. - [ ] Test execution again. - - [ ] Research ways to speed up the recording phase. + - [ ] Commented script. Comments would include files in, out, and time taken. - [ ] SSH wrapper - [ ] There should be a shell script named `ssh` that calls `./PROBE ssh `. @@ -119,7 +131,6 @@ Design issues: - [ ] Think about in situ transcription and analysis - Think about assumptions in analysis - - Think about front-end and UI/UX Performance issues: - [ ] Have better benchmarks @@ -132,18 +143,23 @@ Performance issues: - [ ] Use lock-free implementation of InodeTable -- [ ] Try to break it. Jenna has some input on this. + +- [ ] Try to break it. - [ ] In one terminal execute: `probe record --overwrite --no-transcribe --output .workdir/log/0/probe/ python -m http.server 54123 --directory .workdir/work/0/simple`. In another, execute `hey -n 50000 http://localhost:54123/test`. Notice `__arena_reinstantiate: openat: Too many open files`. -Documentation: -- [ ] Make the CLI better. You shouldn't need to give `-f` to make repeated applications work. You shouldn't need to give `--input`. +- [ ] Put rdtsc performance counters in libprobe to instrument startup and re-exec cost. Write them to disk somehow. Look at the results. + +Better UI/UX: + +- [ ] Probe -o should support format strings, including %pid, %iso_datetime, %exe, %host; syntax subject to change. + - PROBE should default to `recording_%exe_%iso_datetime.tar.gz`; that way, you can run probe twice in a row with no error without needing `-f`. It's currently an unexpected pain-point, I think. CARE does something like `something.%pid`, I think. + +- [ ] Make the CLI better. You shouldn't need to give `--input`. - [ ] Document CLI tool. - [ ] Do we need to have a file-extension for probe_log? -- [ ] Combine Python and Rust CLIs. - - [ ] Improve the README. - [ ] Style output with Rich. @@ -151,13 +167,10 @@ Documentation: - [ ] Style output of Rust tool. - [ ] Package for the following platforms: - - [ ] It should be obvious how to build libprobe and probe cli (Rust) with Nix from the README. - - [ ] The repository should be an installable Python package, using the PEP 518 (pyproject.toml). Consider having one Python package with bundled binaries and one without. + - [x] Nix - [ ] PyPI - - [ ] Nix - [ ] Spack - [ ] Guix - - [ ] Docker image (consider whether to publish DockerHub, Quay, GHCR, or somewhere else).
- [ ] Statically linked, downloadable binary - Built in CI on each "release" and downloadable from GitHub. @@ -166,10 +179,6 @@ Documentation: - [ ] Explain design decisions Nice to have: -- [ ] Make it easier to get to the debug build of probe cli. - - Build both versions, called `probe` and `probe_dbg`. - - `probe_dbg` should use `libprobe_dbg` - - Get rid of `--debug` - [ ] Don't check in generated code to VCS @@ -178,28 +187,22 @@ Nice to have: - [ ] Add more syscalls - [ ] Add Dup ops and debug `bash -c 'head foo > bar'` (branch add-new-ops). Sam is working on this -- [ ] Add more Ops (see branch add-new-ops) - - [ ] Libprobe should identify which was the "root" process. - [ ] Sort readdir order in record and replay phases. - [ ] Re-enable some of the tests I disabled. -- [ ] Write a FUSE that maps inodes (underlying fs) to inodes (of our choosing). Write an option for replay to use this FUSE. - -- [ ] Link with libbacktrace on `--debug` runs. - - [ ] Refactor some identifiers in codebase. - [ ] `prov_log_process_tree` -> `process_tree` - [ ] `prov_log` -> `probe_log` - [ ] `(pid, ex_id, tid, op_id)` -> `dataclass` - [ ] `digraph`, `process_graph` -> `hb_graph` - - [ ] Format Python with Ruff - - [ ] Use Clang's non-null attribute. - - [ ] Having fewer Python imports (e.g., generated.parser, generated.ops. Maybe we should re-export stuff in `__init__.py` of generated). - [ ] Reformat repository layout + +- [ ] Have `probe` and `probe-dbg`; `probe` loads `libprobe.so`; `probe-dbg` loads `libprobe-dbg.so` and possibly `libbacktrace.so`. + +- [ ] Reformat repository layout - [ ] Probably have 1 top-level folder for each language, but make sure all the pieces compose nicely. - [ ] `reproducibility_tests` -> `tests`? - [ ] Move tests to root level? @@ -208,6 +211,10 @@ Nice to have: - [ ] Run pre-commit in GitHub Actions, committing fixes to PR branch +- [ ] We currently assume that coreutils will exist on the local and remote hosts. They might not. In that case, we should excise all invocations of coreutils. We could replace them with subcommands of PROBE, which _are_ guaranteed to exist on hosts that have PROBE. However, that doesn't feel necessary, since most people _do_ have coreutils. So we will record this issue and do nothing until/unless someone complains. + +- [ ] Write a FUSE that maps inodes (underlying fs) to inodes (of our choosing). Write an option for replay to use this FUSE.
+ Research tasks: - [ ] Develop user study diff --git a/probe_src/threading.md b/docs/threading.md similarity index 100% rename from probe_src/threading.md rename to docs/threading.md diff --git a/flake.lock b/flake.lock index 4728dbc9..93e064e0 100644 --- a/flake.lock +++ b/flake.lock @@ -17,17 +17,12 @@ } }, "crane": { - "inputs": { - "nixpkgs": [ - "nixpkgs" - ] - }, "locked": { - "lastModified": 1721842668, - "narHash": "sha256-k3oiD2z2AAwBFLa4+xfU+7G5fisRXfkvrMTCJrjZzXo=", + "lastModified": 1731098351, + "narHash": "sha256-HQkYvKvaLQqNa10KEFGgWHfMAbWBfFp+4cAgkut+NNE=", "owner": "ipetkov", "repo": "crane", - "rev": "529c1a0b1f29f0d78fa3086b8f6a134c71ef3aaf", + "rev": "ef80ead953c1b28316cc3f8613904edc2eb90c28", "type": "github" }, "original": { diff --git a/flake.nix b/flake.nix index caf8ab2b..0379068e 100644 --- a/flake.nix +++ b/flake.nix @@ -9,7 +9,6 @@ crane = { url = "github:ipetkov/crane"; - inputs.nixpkgs.follows = "nixpkgs"; }; advisory-db = { @@ -58,7 +57,7 @@ p.rust-bin.stable.latest.default.override { targets = [rust-target]; }); - frontend = (import ./probe_src/frontend/frontend.nix) { + frontend = (import ./cli-wrapper/frontend.nix) { inherit system pkgs @@ -71,11 +70,11 @@ }; in rec { packages = rec { - inherit (frontend.packages) cargoArtifacts; + inherit (frontend.packages) cargoArtifacts probe-cli; libprobe = pkgs.stdenv.mkDerivation rec { pname = "libprobe"; version = "0.1.0"; - src = ./probe_src/libprobe; + src = ./libprobe; makeFlags = ["INSTALL_PREFIX=$(out)" "SOURCE_VERSION=${version}"]; buildInputs = [ (pkgs.python312.withPackages (pypkgs: [ @@ -99,45 +98,49 @@ --prefix PATH : ${pkgs.buildah}/bin ''; }; - probe-py-generated = frontend.packages.probe-py-generated; - probe-py = let - probe-py-manual = python.pkgs.buildPythonPackage rec { - pname = "probe_py.manual"; + probe-py = python.pkgs.buildPythonPackage rec { + pname = "probe_py"; + version = "0.1.0"; + pyproject = true; + build-system = [ + python.pkgs.flit-core + ]; + src = pkgs.stdenv.mkDerivation { + src = ./probe_py; + pname = "probe-py-with-pygen-code"; version = "0.1.0"; - pyproject = true; - build-system = [ - python.pkgs.flit-core - ]; - src = ./probe_src/python; - propagatedBuildInputs = [ - # Packages the client will need - frontend.packages.probe-py-generated - python.pkgs.networkx - python.pkgs.pygraphviz - python.pkgs.pydot - python.pkgs.rich - python.pkgs.typer - python.pkgs.sqlalchemy - python.pkgs.xdg-base-dirs - python.pkgs.pyyaml - python.pkgs.types-pyyaml - ]; - nativeCheckInputs = [ - frontend.packages.probe-py-generated - python.pkgs.mypy - pkgs.ruff - ]; - checkPhase = '' - runHook preCheck - #ruff format --check probe_src # TODO: uncomment - ruff check . - python -c 'import probe_py.manual' - mypy --strict --package probe_py.manual - runHook postCheck + buildPhase = "true"; + installPhase = '' + mkdir $out/ + cp --recursive $src/* $out/ + chmod 755 $out/probe_py + cp ${probe-cli}/resources/ops.py $out/probe_py/ ''; }; - in - python.withPackages (pypkgs: [probe-py-manual]); + propagatedBuildInputs = [ + python.pkgs.networkx + python.pkgs.pygraphviz + python.pkgs.pydot + python.pkgs.rich + python.pkgs.typer + python.pkgs.xdg-base-dirs + python.pkgs.sqlalchemy + python.pkgs.pyyaml + ]; + nativeCheckInputs = [ + python.pkgs.mypy + python.pkgs.types-pyyaml + pkgs.ruff + ]; + checkPhase = '' + runHook preCheck + #ruff format --check probe_src # TODO: uncomment + ruff check . 
+ python -c 'import probe_py' + MYPYPATH=$src/mypy_stubs:$MYPYPATH mypy --strict --package probe_py + runHook postCheck + ''; + }; default = probe-bundled; }; checks = { @@ -163,12 +166,12 @@ }; probe-integration-tests = pkgs.stdenv.mkDerivation { name = "probe-integration-tests"; - src = ./probe_src/tests; + src = ./tests; nativeBuildInputs = [ packages.probe-bundled - packages.probe-py pkgs.podman pkgs.docker + pkgs.coreutils # so we can `probe record head ...`, etc. ]; buildPhase = "touch $out"; checkPhase = '' @@ -179,14 +182,12 @@ devShells = { default = craneLib.devShell { shellHook = '' - pushd $(git rev-parse --show-toplevel) + pushd $(git rev-parse --show-toplevel) > /dev/null source ./setup_devshell.sh - popd + popd > /dev/null ''; inputsFrom = [ - frontend.packages.probe-frontend frontend.packages.probe-cli - frontend.packages.probe-macros ]; packages = [ @@ -212,6 +213,7 @@ pypkgs.pytest pypkgs.mypy pypkgs.ipython + pypkgs.xdg-base-dirs # libprobe build time requirement pypkgs.pycparser diff --git a/libprobe/.gitignore b/libprobe/.gitignore new file mode 100644 index 00000000..3227224f --- /dev/null +++ b/libprobe/.gitignore @@ -0,0 +1,3 @@ +# generated files +build/ +generated/ diff --git a/probe_src/libprobe/Makefile b/libprobe/Makefile similarity index 96% rename from probe_src/libprobe/Makefile rename to libprobe/Makefile index 68b4daeb..58366fb3 100644 --- a/probe_src/libprobe/Makefile +++ b/libprobe/Makefile @@ -19,7 +19,7 @@ build/lib%-dbg.so: $(SOURCE_FILES) $(GENERATED_FILES) gcc $(CFLAGS) $(DBGCFLAGS) -o $@ src/lib.c $(GENERATED_FILES): $(wildcard generator/*) - ./generator/gen_libc_hooks.py + python3 ./generator/gen_libc_hooks.py install: install -D --target-directory $(INSTALL_PREFIX)/lib/ build/lib*.so diff --git a/probe_src/libprobe/README.md b/libprobe/README.md similarity index 100% rename from probe_src/libprobe/README.md rename to libprobe/README.md diff --git a/probe_src/libprobe/arena/.gitignore b/libprobe/arena/.gitignore similarity index 100% rename from probe_src/libprobe/arena/.gitignore rename to libprobe/arena/.gitignore diff --git a/probe_src/libprobe/arena/Makefile b/libprobe/arena/Makefile similarity index 100% rename from probe_src/libprobe/arena/Makefile rename to libprobe/arena/Makefile diff --git a/probe_src/libprobe/arena/README.md b/libprobe/arena/README.md similarity index 100% rename from probe_src/libprobe/arena/README.md rename to libprobe/arena/README.md diff --git a/probe_src/libprobe/arena/include/arena.h b/libprobe/arena/include/arena.h similarity index 100% rename from probe_src/libprobe/arena/include/arena.h rename to libprobe/arena/include/arena.h diff --git a/probe_src/libprobe/arena/parse_arena.py b/libprobe/arena/parse_arena.py similarity index 100% rename from probe_src/libprobe/arena/parse_arena.py rename to libprobe/arena/parse_arena.py diff --git a/probe_src/libprobe/arena/test_arena.c b/libprobe/arena/test_arena.c similarity index 100% rename from probe_src/libprobe/arena/test_arena.c rename to libprobe/arena/test_arena.c diff --git a/probe_src/libprobe/generator/dump_ast.py b/libprobe/generator/dump_ast.py similarity index 100% rename from probe_src/libprobe/generator/dump_ast.py rename to libprobe/generator/dump_ast.py diff --git a/probe_src/libprobe/generator/gen_libc_hooks.py b/libprobe/generator/gen_libc_hooks.py similarity index 75% rename from probe_src/libprobe/generator/gen_libc_hooks.py rename to libprobe/generator/gen_libc_hooks.py index 05f4f3af..2ec6d7f3 100755 --- 
a/probe_src/libprobe/generator/gen_libc_hooks.py +++ b/libprobe/generator/gen_libc_hooks.py @@ -9,6 +9,8 @@ _T = typing.TypeVar("_T") + + def expect_type(typ: type[_T], data: typing.Any) -> _T: if not isinstance(data, typ): raise TypeError(f"Expected type {typ} for {data}") @@ -16,29 +18,52 @@ def expect_type(typ: type[_T], data: typing.Any) -> _T: if typing.TYPE_CHECKING: + class CGenerator: - def _parenthesize_if(self, n: Node, condition: typing.Callable[[Node], bool]) -> str: ... + def _parenthesize_if( + self, n: Node, condition: typing.Callable[[Node], bool] + ) -> str: ... def _generate_decl(self, n: pycparser.c_ast.Node) -> str: ... def visit(self, n: pycparser.c_ast.Node | str | list[str]) -> str: ... def _visit_expr(self, n: pycparser.c_ast.Node) -> str: ... def _make_indent(self) -> str: ... + indent_level: int + class Node: pass + class IdentifierType(Node): names: list[str] + def __init__(self, names: list[str]) -> None: ... + class Assignment(Node): def __init__(self, op: str, lvalue: Node, rvalue: Node): ... + op: str lvalue: Node rvalue: Node + class Compound(Node): block_items: list[Node] + class ID(Node): name: str + class Decl(Node): - def __init__(self, name: str, quals: list[str], align: list[str], storage: list[str], funcspec: list[str], type: TypeDecl, init: Node | None, bitsize : Node | None) -> None: ... + def __init__( + self, + name: str, + quals: list[str], + align: list[str], + storage: list[str], + funcspec: list[str], + type: TypeDecl, + init: Node | None, + bitsize: Node | None, + ) -> None: ... + name: str quals: list[Node] align: list[Node] @@ -47,15 +72,21 @@ def __init__(self, name: str, quals: list[str], align: list[str], storage: list[ type: TypeDecl init: Node | None bitsize: Node | None + class TypeDecl(Node): - def __init__(self, declname: str, quals: list[Node], align: Node | None, type: Node) -> None: ... + def __init__( + self, declname: str, quals: list[Node], align: Node | None, type: Node + ) -> None: ... 
+ declname: str quals: list[Node] align: Node | None type: Node + class FuncDecl(Node): args: ParamList type: TypeDecl + class ParamList(Node): params: list[Decl] else: @@ -79,17 +110,21 @@ def visit_Assignment(self, n: Assignment) -> str: n.rvalue, lambda n: isinstance(n, (Assignment, Compound)), ) - return '%s %s %s' % (self.visit(n.lvalue), n.op, rval_str) + return "%s %s %s" % (self.visit(n.lvalue), n.op, rval_str) def visit_Decl(self, n: Decl, no_type: bool = False) -> str: s = n.name if no_type else self._generate_decl(n) if n.bitsize: - s += ' : ' + self.visit(n.bitsize) + s += " : " + self.visit(n.bitsize) if n.init: - s += ' = ' + self._parenthesize_if(n.init, lambda n: isinstance(n, (Assignment, pycparser.c_ast.Compound))) + s += " = " + self._parenthesize_if( + n.init, lambda n: isinstance(n, (Assignment, pycparser.c_ast.Compound)) + ) return s - def _parenthesize_if(self, n: Node, condition: typing.Callable[[Node], bool]) -> str: + def _parenthesize_if( + self, n: Node, condition: typing.Callable[[Node], bool] + ) -> str: self.indent_level += 2 s = self._visit_expr(n) self.indent_level -= 2 @@ -97,7 +132,7 @@ def _parenthesize_if(self, n: Node, condition: typing.Callable[[Node], bool]) -> if isinstance(n, pycparser.c_ast.Compound): return "(\n" + s + self._make_indent() + ")" else: - return '(' + s + ')' + return "(" + s + ")" else: return s @@ -124,9 +159,9 @@ def define_var(var_type: Node, var_name: str, value: Node) -> Decl: ) -void = IdentifierType(names=['void']) +void = IdentifierType(names=["void"]) -c_ast_int = IdentifierType(names=['int']) +c_ast_int = IdentifierType(names=["int"]) def ptr_type(type: Node) -> pycparser.c_ast.PtrDecl: @@ -179,14 +214,19 @@ def from_decl(decl: Decl) -> ParsedFunc: if isinstance(param_decl, Decl) ), return_type=expect_type(FuncDecl, decl.type).type, - variadic=isinstance(expect_type(FuncDecl, decl.type).args.params[-1], pycparser.c_ast.EllipsisParam), + variadic=isinstance( + expect_type(FuncDecl, decl.type).args.params[-1], + pycparser.c_ast.EllipsisParam, + ), ) @staticmethod def from_defn(func_def: pycparser.c_ast.FuncDef) -> ParsedFunc: return dataclasses.replace( ParsedFunc.from_decl(func_def.decl), - stmts=tuple(func_def.body.block_items) if func_def.body.block_items is not None else (), + stmts=tuple(func_def.body.block_items) + if func_def.body.block_items is not None + else (), ) def declaration(self) -> pycparser.c_ast.FuncDecl: @@ -204,7 +244,8 @@ def declaration(self) -> pycparser.c_ast.FuncDecl: bitsize=None, ) for param_name, param_type in self.params - ] + ([pycparser.c_ast.EllipsisParam()] if self.variadic else []), + ] + + ([pycparser.c_ast.EllipsisParam()] if self.variadic else []), ), type=pycparser.c_ast.TypeDecl( declname=self.name, @@ -224,7 +265,7 @@ def definition(self) -> pycparser.c_ast.FuncDef: funcspec=[], type=self.declaration(), init=None, - bitsize=None + bitsize=None, ), param_decls=None, body=pycparser.c_ast.Compound( @@ -243,9 +284,13 @@ def definition(self) -> pycparser.c_ast.FuncDef: funcs = { **orig_funcs, **{ - node.name: dataclasses.replace(orig_funcs[typing.cast(ID, node.init).name], name=node.name) + node.name: dataclasses.replace( + orig_funcs[typing.cast(ID, node.init).name], name=node.name + ) for node in ast.ext - if isinstance(node, Decl) and isinstance(node.type, pycparser.c_ast.TypeDecl) and node.type.type.names == ["fn"] + if isinstance(node, Decl) + and isinstance(node.type, pycparser.c_ast.TypeDecl) + and node.type.type.names == ["fn"] }, } # funcs = { @@ -276,14 +321,16 @@ def 
definition(self) -> pycparser.c_ast.FuncDef: variadic=False, stmts=[ Assignment( - op='=', + op="=", lvalue=pycparser.c_ast.ID(name=func_prefix + func_name), rvalue=pycparser.c_ast.FuncCall( name=pycparser.c_ast.ID(name="dlsym"), args=pycparser.c_ast.ExprList( exprs=[ pycparser.c_ast.ID(name="RTLD_NEXT"), - pycparser.c_ast.Constant(type="string", value='"' + func_name + '"'), + pycparser.c_ast.Constant( + type="string", value='"' + func_name + '"' + ), ], ), ), @@ -294,6 +341,8 @@ def definition(self) -> pycparser.c_ast.FuncDef: T = typing.TypeVar("T") + + def raise_(exception: Exception) -> typing.NoReturn: raise exception @@ -303,19 +352,21 @@ def raise_thunk(exception: Exception) -> typing.Callable[..., typing.NoReturn]: def find_decl( - block: typing.Sequence[Node], - name: str, - comment: typing.Any, + block: typing.Sequence[Node], + name: str, + comment: typing.Any, ) -> Decl | None: relevant_stmts = [ - stmt - for stmt in block - if isinstance(stmt, Decl) and stmt.name == name + stmt for stmt in block if isinstance(stmt, Decl) and stmt.name == name ] if not relevant_stmts: return None elif len(relevant_stmts) > 1: - raise ValueError(f"Multiple definitions of {name}" + " ({})".format(comment) if comment else "") + raise ValueError( + f"Multiple definitions of {name}" + + (" ({})".format(comment) if comment else "") + ) else: return relevant_stmts[0] @@ -326,6 +377,16 @@ def wrapper_func_body(func: ParsedFunc) -> typing.Sequence[Node]: name=pycparser.c_ast.ID(name="maybe_init_thread"), args=pycparser.c_ast.ExprList(exprs=[]), ), + pycparser.c_ast.FuncCall( + name=pycparser.c_ast.ID(name="DEBUG"), + args=pycparser.c_ast.ExprList( + exprs=[ + pycparser.c_ast.Constant( + type="string", value='"' + func.name + '(...)"' + ), + ] + ), + ), ] post_call_stmts = [] @@ -355,9 +416,12 @@ def wrapper_func_body(func: ParsedFunc) -> typing.Sequence[Node]: exprs=[ pycparser.c_ast.Cast( to_type=void_fn_ptr, - expr=pycparser.c_ast.ID(name=func_prefix + func.name) + expr=pycparser.c_ast.ID(name=func_prefix + func.name), + ), + pycparser.c_ast.FuncCall( + name=pycparser.c_ast.ID(name="__builtin_apply_args"), + args=None, ), - pycparser.c_ast.FuncCall(name=pycparser.c_ast.ID(name="__builtin_apply_args"), args=None), pycparser.c_ast.ID(name="varargs_size"), ], ), @@ -365,17 +429,19 @@ def wrapper_func_body(func: ParsedFunc) -> typing.Sequence[Node]: if is_void(func.return_type): call_stmts = [uncasted_func_call] else: - call_stmts = [define_var( - func.return_type, - "ret", - pycparser.c_ast.UnaryOp( - op="*", - expr=pycparser.c_ast.Cast( - to_type=ptr_type(func.return_type), - expr=uncasted_func_call, + call_stmts = [ + define_var( + func.return_type, + "ret", + pycparser.c_ast.UnaryOp( + op="*", + expr=pycparser.c_ast.Cast( + to_type=ptr_type(func.return_type), + expr=uncasted_func_call, + ), ), - ), - )] + ) + ] else: call_expr = pycparser.c_ast.FuncCall( name=pycparser.c_ast.ID( @@ -397,7 +463,7 @@ def wrapper_func_body(func: ParsedFunc) -> typing.Sequence[Node]: save_errno = define_var(c_ast_int, "saved_errno", pycparser.c_ast.ID(name="errno")) restore_errno = Assignment( - op='=', + op="=", lvalue=pycparser.c_ast.ID(name="errno"), rvalue=pycparser.c_ast.ID(name="saved_errno"), ) @@ -421,18 +487,24 @@ def wrapper_func_body(func: ParsedFunc) -> typing.Sequence[Node]: ).definition() for _, func in funcs.items() ] -pathlib.Path("generated/libc_hooks.h").write_text( +generated = pathlib.Path("generated") +generated.mkdir(exist_ok=True) +(generated / "libc_hooks.h").write_text( GccCGenerator().visit( - 
pycparser.c_ast.FileAST(ext=[ - *func_pointer_declarations, - ]) + pycparser.c_ast.FileAST( + ext=[ + *func_pointer_declarations, + ] + ) ) ) -pathlib.Path("generated/libc_hooks.c").write_text( +(generated / "libc_hooks.c").write_text( GccCGenerator().visit( - pycparser.c_ast.FileAST(ext=[ - init_function_pointers, - *static_args_wrapper_func_declarations, - ]) + pycparser.c_ast.FileAST( + ext=[ + init_function_pointers, + *static_args_wrapper_func_declarations, + ] + ) ) ) diff --git a/probe_src/libprobe/generator/libc_hooks_source.c b/libprobe/generator/libc_hooks_source.c similarity index 100% rename from probe_src/libprobe/generator/libc_hooks_source.c rename to libprobe/generator/libc_hooks_source.c diff --git a/probe_src/libprobe/include/libprobe/prov_ops.h b/libprobe/include/libprobe/prov_ops.h similarity index 100% rename from probe_src/libprobe/include/libprobe/prov_ops.h rename to libprobe/include/libprobe/prov_ops.h diff --git a/probe_src/libprobe/src/global_state.c b/libprobe/src/global_state.c similarity index 100% rename from probe_src/libprobe/src/global_state.c rename to libprobe/src/global_state.c diff --git a/probe_src/libprobe/src/inode_table.c b/libprobe/src/inode_table.c similarity index 100% rename from probe_src/libprobe/src/inode_table.c rename to libprobe/src/inode_table.c diff --git a/probe_src/libprobe/src/lib.c b/libprobe/src/lib.c similarity index 100% rename from probe_src/libprobe/src/lib.c rename to libprobe/src/lib.c diff --git a/probe_src/libprobe/src/lookup_on_path.c b/libprobe/src/lookup_on_path.c similarity index 88% rename from probe_src/libprobe/src/lookup_on_path.c rename to libprobe/src/lookup_on_path.c index 9d337607..5ee7b684 100644 --- a/probe_src/libprobe/src/lookup_on_path.c +++ b/libprobe/src/lookup_on_path.c @@ -11,9 +11,7 @@ static bool lookup_on_path(BORROWED const char* bin_name, BORROWED char* bin_pat * * -- https://man7.org/linux/man-pages/man3/exec.3.html */ - char* path = env_path ? env_path : get_default_path(); - /* Note that strtok_r is destructive, so we will have to copy this. */ - path = strndup(path, sysconf(_SC_ARG_MAX)); + char* path = strndup(env_path ? env_path : get_default_path(), sysconf(_SC_ARG_MAX)); DEBUG("looking up \"%s\" on $PATH=\"%.50s...\"", bin_name, path); diff --git a/probe_src/libprobe/src/prov_buffer.c b/libprobe/src/prov_buffer.c similarity index 100% rename from probe_src/libprobe/src/prov_buffer.c rename to libprobe/src/prov_buffer.c diff --git a/probe_src/libprobe/src/prov_enable.c b/libprobe/src/prov_enable.c similarity index 100% rename from probe_src/libprobe/src/prov_enable.c rename to libprobe/src/prov_enable.c diff --git a/probe_src/libprobe/src/prov_ops.c b/libprobe/src/prov_ops.c similarity index 100% rename from probe_src/libprobe/src/prov_ops.c rename to libprobe/src/prov_ops.c diff --git a/probe_src/libprobe/src/util.c b/libprobe/src/util.c similarity index 100% rename from probe_src/libprobe/src/util.c rename to libprobe/src/util.c diff --git a/lightweight_env.sh b/lightweight_env.sh deleted file mode 100755 index bb4fd861..00000000 --- a/lightweight_env.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/usr/bin/env bash - -# nix develop brings in a ton of stuff to the env -# which complicates testing probe -# To simplify, use this script. 
- -env - __PROBE_LIB=$__PROBE_LIB PATH=$PATH PYTHONPATH=$PYTHONPATH $@ diff --git a/output.png b/output.png deleted file mode 100644 index ab90c424..00000000 Binary files a/output.png and /dev/null differ diff --git a/probe_py/.gitignore b/probe_py/.gitignore new file mode 100644 index 00000000..69e0e2ac --- /dev/null +++ b/probe_py/.gitignore @@ -0,0 +1,2 @@ +# Generated files +ops.py diff --git a/probe_src/python/probe_py/manual/__init__.py b/probe_py/probe_py/__init__.py similarity index 100% rename from probe_src/python/probe_py/manual/__init__.py rename to probe_py/probe_py/__init__.py diff --git a/probe_src/python/probe_py/manual/analysis.py b/probe_py/probe_py/analysis.py similarity index 67% rename from probe_src/python/probe_py/manual/analysis.py rename to probe_py/probe_py/analysis.py index d63c9ac6..5c90a864 100644 --- a/probe_src/python/probe_py/manual/analysis.py +++ b/probe_py/probe_py/analysis.py @@ -1,8 +1,19 @@ import typing -from typing import Dict, Tuple import networkx as nx # type: ignore -from probe_py.generated.ops import Op, CloneOp, ExecOp, WaitOp, OpenOp, CloseOp, InitProcessOp, InitExecEpochOp, InitThreadOp, StatOp -from probe_py.generated import parser +from .ptypes import TaskType, ProvLog +from .ops import ( + Op, + CloneOp, + ExecOp, + WaitOp, + OpenOp, + CloseOp, + InitProcessOp, + InitExecEpochOp, + InitThreadOp, + StatOp, +) +from .graph_utils import list_edges_from_start_node from enum import IntEnum import rich import sys @@ -11,24 +22,19 @@ import os import collections -# TODO: implement this in probe_py.generated.ops -class TaskType(IntEnum): - TASK_PID = 0 - TASK_TID = 1 - TASK_ISO_C_THREAD = 2 - TASK_PTHREAD = 3 - class EdgeLabels(IntEnum): PROGRAM_ORDER = 1 FORK_JOIN = 2 EXEC = 3 - + + @dataclass(frozen=True) class ProcessNode: pid: int - cmd: tuple[str,...] - + cmd: tuple[str, ...] 
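# Illustrative sketch, not part of the patch: ProcessNode and the node classes
# that follow are frozen dataclasses, which makes them hashable, so analysis.py
# can use them directly as networkx node keys; equal field values collapse into
# a single node. A self-contained demonstration (the class name _DemoNode is
# hypothetical):
import dataclasses
import networkx as nx

@dataclasses.dataclass(frozen=True)
class _DemoNode:
    pid: int
    cmd: tuple[str, ...]

g = nx.DiGraph()
g.add_node(_DemoNode(1, ("sh", "-c", "true")))
g.add_node(_DemoNode(1, ("sh", "-c", "true")))  # equal value: de-duplicated
assert g.number_of_nodes() == 1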
+ + @dataclass(frozen=True) class InodeOnDevice: device_major: int @@ -38,30 +44,36 @@ class InodeOnDevice: def __eq__(self, other: object) -> bool: if not isinstance(other, InodeOnDevice): return NotImplemented - return (self.device_major == other.device_major and - self.device_minor == other.device_minor and - self.inode == other.inode) + return ( + self.device_major == other.device_major + and self.device_minor == other.device_minor + and self.inode == other.inode + ) + def __hash__(self) -> int: return hash((self.device_major, self.device_minor, self.inode)) + @dataclass(frozen=True) class FileNode: inodeOnDevice: InodeOnDevice - version: Tuple[int, int] + version: tuple[int, int] file: str @property def label(self) -> str: return f"{self.file} version(inode {self.version[0]} mtime {self.version[1]})" + # type alias for a node -Node = Tuple[int, int, int, int] +Node: typing.TypeAlias = tuple[int, int, int, int] # type for the edges -EdgeType = Tuple[Node, Node] +EdgeType: typing.TypeAlias = tuple[Node, Node] + def validate_provlog( - provlog: parser.ProvLog, + provlog: ProvLog, ) -> list[str]: ret = list[str]() waited_processes = set[tuple[TaskType, int]]() @@ -97,13 +109,19 @@ def validate_provlog( first_thread_op_idx = first_ee_op_idx + (1 if tid == pid else 0) first_thread_op = thread.ops[first_thread_op_idx] if not isinstance(first_thread_op.data, InitThreadOp): - ret.append(f"{first_thread_op_idx} in exec_epoch should be InitThreadOp") + ret.append( + f"{first_thread_op_idx} in exec_epoch should be InitThreadOp" + ) for op in thread.ops: if isinstance(op.data, WaitOp) and op.data.ferrno == 0: # TODO: Replace TaskType(x) with x in this file, once Rust can emit enums - waited_processes.add((TaskType(op.data.task_type), op.data.task_id)) + waited_processes.add( + (TaskType(op.data.task_type), op.data.task_id) + ) elif isinstance(op.data, CloneOp) and op.data.ferrno == 0: - cloned_processes.add((TaskType(op.data.task_type), op.data.task_id)) + cloned_processes.add( + (TaskType(op.data.task_type), op.data.task_id) + ) if op.data.task_type == TaskType.TASK_PID: # New process implicitly also creates a new thread cloned_processes.add((TaskType.TASK_TID, op.data.task_id)) @@ -122,27 +140,53 @@ def validate_provlog( elif isinstance(op.data, CloneOp) and op.data.ferrno == 0: if False: pass - elif op.data.task_type == TaskType.TASK_PID and op.data.task_id not in provlog.processes.keys(): - ret.append(f"CloneOp returned a PID {op.data.task_id} that we didn't track") - elif op.data.task_type == TaskType.TASK_TID and op.data.task_id not in exec_epoch.threads.keys(): - ret.append(f"CloneOp returned a TID {op.data.task_id} that we didn't track") - elif op.data.task_type == TaskType.TASK_PTHREAD and op.data.task_id not in pthread_ids: - ret.append(f"CloneOp returned a pthread ID {op.data.task_id} that we didn't track") - elif op.data.task_type == TaskType.TASK_ISO_C_THREAD and op.data.task_id not in iso_c_thread_ids: - ret.append(f"CloneOp returned a ISO C Thread ID {op.data.task_id} that we didn't track") + elif ( + op.data.task_type == TaskType.TASK_PID + and op.data.task_id not in provlog.processes.keys() + ): + ret.append( + f"CloneOp returned a PID {op.data.task_id} that we didn't track" + ) + elif ( + op.data.task_type == TaskType.TASK_TID + and op.data.task_id not in exec_epoch.threads.keys() + ): + ret.append( + f"CloneOp returned a TID {op.data.task_id} that we didn't track" + ) + elif ( + op.data.task_type == TaskType.TASK_PTHREAD + and op.data.task_id not in pthread_ids + ): + 
ret.append( + f"CloneOp returned a pthread ID {op.data.task_id} that we didn't track" + ) + elif ( + op.data.task_type == TaskType.TASK_ISO_C_THREAD + and op.data.task_id not in iso_c_thread_ids + ): + ret.append( + f"CloneOp returned an ISO C Thread ID {op.data.task_id} that we didn't track" + ) elif isinstance(op.data, InitProcessOp): if exec_epoch_no != 0: - ret.append(f"InitProcessOp happened, but exec_epoch was not zero, was {exec_epoch_no}") + ret.append( + f"InitProcessOp happened, but exec_epoch was not zero, was {exec_epoch_no}" + ) expected_epochs = set(range(0, max(epochs) + 1)) if expected_epochs - epochs: - ret.append(f"Missing epochs for pid={pid}: {sorted(epochs - expected_epochs)}") + ret.append( + f"Missing epochs for pid={pid}: {sorted(epochs - expected_epochs)}" + ) reserved_fds = {0, 1, 2} if closed_fds - opened_fds - reserved_fds: # TODO: Problem due to some programs opening /dev/pts/0 in a way that libprobe doesn't notice, but they close it in a way we do notice. pass - #ret.append(f"Closed more fds than we opened: {closed_fds=} {reserved_fds=} {opened_fds=}") + # ret.append(f"Closed more fds than we opened: {closed_fds=} {reserved_fds=} {opened_fds=}") elif waited_processes - cloned_processes: - ret.append(f"Waited on more processes than we cloned: {waited_processes=} {cloned_processes=}") + ret.append( + f"Waited on more processes than we cloned: {waited_processes=} {cloned_processes=}" + ) if n_roots != 1: ret.append(f"Got {n_roots} prov roots") return ret @@ -151,7 +195,7 @@ def validate_provlog( # TODO: Rename "digraph" to "hb_graph" in the entire project. # Digraph (aka "directed graph") is too vague a term; the proper name is "happens-before graph". # Later on, we will have a function that transforms an hb graph to file graph (both of which are digraphs) -def provlog_to_digraph(process_tree_prov_log: parser.ProvLog) -> nx.DiGraph: +def provlog_to_digraph(process_tree_prov_log: ProvLog) -> nx.DiGraph: # [pid, exec_epoch_no, tid, op_index] program_order_edges = list[tuple[Node, Node]]() fork_join_edges = list[tuple[Node, Node]]() @@ -173,7 +217,7 @@ def provlog_to_digraph(process_tree_prov_log: parser.ProvLog) -> nx.DiGraph: ops.append((*context, op_index)) # Add just those ops to the graph nodes.extend(ops) - program_order_edges.extend(zip(ops[:-1], ops[1:])) + program_order_edges.extend(zip(ops[:-1], ops[1:])) # Store these so we can hook up forks/joins between threads proc_to_ops[context] = ops @@ -209,7 +253,13 @@ def get_last_pthread(pid: int, exid: int, target_pthread_id: int) -> list[Node]: # Hook up forks/joins for node in list(nodes): pid, exid, tid, op_index = node - op_data = process_tree_prov_log.processes[pid].exec_epochs[exid].threads[tid].ops[op_index].data + op_data = ( + process_tree_prov_log.processes[pid] + .exec_epochs[exid] + .threads[tid] + .ops[op_index] + .data + ) target: tuple[int, int, int] if False: pass @@ -228,11 +278,17 @@ def get_last_pthread(pid: int, exid: int, target_pthread_id: int) -> list[Node]: fork_join_edges.append((node, dest)) else: raise RuntimeError(f"Task type {op_data.task_type} not supported") - elif isinstance(op_data, WaitOp) and op_data.ferrno == 0 and op_data.task_id > 0: + elif ( + isinstance(op_data, WaitOp) and op_data.ferrno == 0 and op_data.task_id > 0 + ): if False: pass elif op_data.task_type == TaskType.TASK_PID: - target = (op_data.task_id, last_exec_epoch.get(op_data.task_id, 0), op_data.task_id) + target = ( + op_data.task_id, + last_exec_epoch.get(op_data.task_id, 0), + op_data.task_id, + ) 
fork_join_edges.append((last(*target), node)) elif op_data.task_type == TaskType.TASK_TID: target = (pid, exid, op_data.task_id) @@ -249,41 +305,60 @@ def get_last_pthread(pid: int, exid: int, target_pthread_id: int) -> list[Node]: for node in nodes: process_graph.add_node(node) - def add_edges(edges:list[tuple[Node, Node]], label:EdgeLabels) -> None: + def add_edges(edges: list[tuple[Node, Node]], label: EdgeLabels) -> None: for node0, node1 in edges: process_graph.add_edge(node0, node1, label=label) - + add_edges(program_order_edges, EdgeLabels.PROGRAM_ORDER) add_edges(exec_edges, EdgeLabels.EXEC) add_edges(fork_join_edges, EdgeLabels.FORK_JOIN) return process_graph -def traverse_hb_for_dfgraph(process_tree_prov_log: parser.ProvLog, starting_node: Node, traversed: set[int] , dataflow_graph:nx.DiGraph, cmd_map: Dict[int, list[str]]) -> None: + +def traverse_hb_for_dfgraph( + process_tree_prov_log: ProvLog, + starting_node: Node, + traversed: set[int], + dataflow_graph: nx.DiGraph, + cmd_map: dict[int, list[str]], +) -> None: starting_pid = starting_node[0] - - starting_op = prov_log_get_node(process_tree_prov_log, starting_node[0], starting_node[1], starting_node[2], starting_node[3]) + + starting_op = prov_log_get_node( + process_tree_prov_log, + starting_node[0], + starting_node[1], + starting_node[2], + starting_node[3], + ) process_graph = provlog_to_digraph(process_tree_prov_log) - + edges = list_edges_from_start_node(process_graph, starting_node) name_map = collections.defaultdict[InodeOnDevice, list[pathlib.Path]](list) target_nodes = collections.defaultdict[int, list[Node]](list) console = rich.console.Console(file=sys.stderr) - - for edge in edges: + + for edge in edges: pid, exec_epoch_no, tid, op_index = edge[0] - + # check if the process is already visited when waitOp occurred if pid in traversed or tid in traversed: continue - - op = prov_log_get_node(process_tree_prov_log, pid, exec_epoch_no, tid, op_index).data - next_op = prov_log_get_node(process_tree_prov_log, edge[1][0], edge[1][1], edge[1][2], edge[1][3]).data + + op = prov_log_get_node( + process_tree_prov_log, pid, exec_epoch_no, tid, op_index + ).data + next_op = prov_log_get_node( + process_tree_prov_log, edge[1][0], edge[1][1], edge[1][2], edge[1][3] + ).data if isinstance(op, OpenOp): access_mode = op.flags & os.O_ACCMODE processNode = ProcessNode(pid=pid, cmd=tuple(cmd_map[pid])) dataflow_graph.add_node(processNode, label=processNode.cmd) - file = InodeOnDevice(op.path.device_major, op.path.device_minor, op.path.inode) + file = InodeOnDevice( + op.path.device_major, op.path.device_minor, op.path.inode + ) path_str = op.path.path.decode("utf-8") curr_version = (op.path.inode, op.path.mtime.sec) fileNode = FileNode(file, curr_version, path_str) @@ -296,7 +371,9 @@ def traverse_hb_for_dfgraph(process_tree_prov_log: parser.ProvLog, starting_node elif access_mode == os.O_WRONLY: dataflow_graph.add_edge(processNode, fileNode) elif access_mode == 2: - console.print(f"Found file {path_str} with access mode O_RDWR", style="red") + console.print( + f"Found file {path_str} with access mode O_RDWR", style="red" + ) else: raise Exception("unknown access mode") elif isinstance(op, CloneOp): @@ -308,49 +385,68 @@ def traverse_hb_for_dfgraph(process_tree_prov_log: parser.ProvLog, starting_node if edge[0][2] != edge[1][2]: target_nodes[op.task_id].append(edge[1]) continue - if op.task_type != TaskType.TASK_PTHREAD and op.task_type != TaskType.TASK_ISO_C_THREAD: - - processNode1 = ProcessNode(pid = pid, cmd=tuple(cmd_map[pid])) - 
processNode2 = ProcessNode(pid = op.task_id, cmd=tuple(cmd_map[op.task_id])) - dataflow_graph.add_node(processNode1, label = " ".join(arg for arg in processNode1.cmd)) - dataflow_graph.add_node(processNode2, label = " ".join(arg for arg in processNode2.cmd)) + if ( + op.task_type != TaskType.TASK_PTHREAD + and op.task_type != TaskType.TASK_ISO_C_THREAD + ): + processNode1 = ProcessNode(pid=pid, cmd=tuple(cmd_map[pid])) + processNode2 = ProcessNode( + pid=op.task_id, cmd=tuple(cmd_map[op.task_id]) + ) + dataflow_graph.add_node( + processNode1, label=" ".join(arg for arg in processNode1.cmd) + ) + dataflow_graph.add_node( + processNode2, label=" ".join(arg for arg in processNode2.cmd) + ) dataflow_graph.add_edge(processNode1, processNode2) target_nodes[op.task_id] = list() elif isinstance(op, WaitOp) and op.options == 0: for node in target_nodes[op.task_id]: - traverse_hb_for_dfgraph(process_tree_prov_log, node, traversed, dataflow_graph, cmd_map) + traverse_hb_for_dfgraph( + process_tree_prov_log, node, traversed, dataflow_graph, cmd_map + ) traversed.add(node[2]) # return back to the WaitOp of the parent process if isinstance(next_op, WaitOp): - if next_op.task_id == starting_pid or next_op.task_id == starting_op.pthread_id: + if ( + next_op.task_id == starting_pid + or next_op.task_id == starting_op.pthread_id + ): return -def list_edges_from_start_node(graph: nx.DiGraph, start_node: Node) -> list[EdgeType]: - all_edges = list(graph.edges()) - start_index = next(i for i, edge in enumerate(all_edges) if edge[0] == start_node) - ordered_edges = all_edges[start_index:] + all_edges[:start_index] - return ordered_edges -def provlog_to_dataflow_graph(process_tree_prov_log: parser.ProvLog) -> nx.DiGraph: +def provlog_to_dataflow_graph(process_tree_prov_log: ProvLog) -> nx.DiGraph: dataflow_graph = nx.DiGraph() process_graph = provlog_to_digraph(process_tree_prov_log) - root_node = [n for n in process_graph.nodes() if process_graph.out_degree(n) > 0 and process_graph.in_degree(n) == 0][0] + root_node = [ + n + for n in process_graph.nodes() + if process_graph.out_degree(n) > 0 and process_graph.in_degree(n) == 0 + ][0] traversed: set[int] = set() cmd_map = collections.defaultdict[int, list[str]](list) for edge in list(nx.edges(process_graph))[::-1]: pid, exec_epoch_no, tid, op_index = edge[0] - op = prov_log_get_node(process_tree_prov_log, pid, exec_epoch_no, tid, op_index).data + op = prov_log_get_node( + process_tree_prov_log, pid, exec_epoch_no, tid, op_index + ).data if isinstance(op, ExecOp): if pid == tid and exec_epoch_no == 0: cmd_map[tid] = [arg.decode(errors="surrogate") for arg in op.argv] - traverse_hb_for_dfgraph(process_tree_prov_log, root_node, traversed, dataflow_graph, cmd_map) + traverse_hb_for_dfgraph( + process_tree_prov_log, root_node, traversed, dataflow_graph, cmd_map + ) return dataflow_graph -def prov_log_get_node(prov_log: parser.ProvLog, pid: int, exec_epoch: int, tid: int, op_no: int) -> Op: + +def prov_log_get_node( + prov_log: ProvLog, pid: int, exec_epoch: int, tid: int, op_no: int +) -> Op: return prov_log.processes[pid].exec_epochs[exec_epoch].threads[tid].ops[op_no] -def validate_hb_closes(provlog: parser.ProvLog, process_graph: nx.DiGraph) -> list[str]: +def validate_hb_closes(provlog: ProvLog, process_graph: nx.DiGraph) -> list[str]: # Note that this test doesn't work if a process "intentionally" leaves a fd open for its child. 
# E.g., bash-in-pipe provlog_reverse = process_graph.reverse() @@ -363,14 +459,20 @@ def validate_hb_closes(provlog: parser.ProvLog, process_graph: nx.DiGraph) -> li if closed_fd not in reserved_fds: for pred_node in nx.dfs_preorder_nodes(provlog_reverse, node): pred_op = prov_log_get_node(provlog, *pred_node) - if isinstance(pred_op.data, OpenOp) and pred_op.data.fd == closed_fd and op.data.ferrno == 0: + if ( + isinstance(pred_op.data, OpenOp) + and pred_op.data.fd == closed_fd + and op.data.ferrno == 0 + ): break else: - ret.append(f"Close of {closed_fd} in {node} is not preceeded by corresponding open") + ret.append( + f"Close of {closed_fd} in {node} is not preceded by corresponding open" + ) return ret -def validate_hb_waits(provlog: parser.ProvLog, process_graph: nx.DiGraph) -> list[str]: +def validate_hb_waits(provlog: ProvLog, process_graph: nx.DiGraph) -> list[str]: provlog_reverse = process_graph.reverse() ret = list[str]() for node in process_graph.nodes: @@ -379,13 +481,21 @@ def validate_hb_waits(provlog: parser.ProvLog, process_graph: nx.DiGraph) -> lis for pred_node in nx.dfs_preorder_nodes(provlog_reverse, node): pred_op = prov_log_get_node(provlog, *pred_node) pid1, eid1, tid1, opid1 = pred_node - if isinstance(pred_op.data, CloneOp) and pred_op.data.task_type == op.data.task_type and pred_op.data.task_id == op.data.task_id and op.data.ferrno == 0: + if ( + isinstance(pred_op.data, CloneOp) + and pred_op.data.task_type == op.data.task_type + and pred_op.data.task_id == op.data.task_id + and op.data.ferrno == 0 + ): break else: - ret.append(f"Wait of {op.data.task_id} in {node} is not preceeded by corresponding clone") + ret.append( + f"Wait of {op.data.task_id} in {node} is not preceded by corresponding clone" + ) return ret -def validate_hb_clones(provlog: parser.ProvLog, process_graph: nx.DiGraph) -> list[str]: + +def validate_hb_clones(provlog: ProvLog, process_graph: nx.DiGraph) -> list[str]: ret = list[str]() for node in process_graph.nodes: op = prov_log_get_node(provlog, *node) @@ -398,23 +508,35 @@ def validate_hb_clones(provlog: parser.ProvLog, process_graph: nx.DiGraph) -> li elif op.data.task_type == TaskType.TASK_PID: if isinstance(op1.data, InitProcessOp): if op.data.task_id != pid1: - ret.append(f"CloneOp {node} returns {op.data.task_id} but the next op has pid {pid1}") + ret.append( + f"CloneOp {node} returns {op.data.task_id} but the next op has pid {pid1}" + ) break elif op.data.task_type == TaskType.TASK_TID: if isinstance(op1.data, InitThreadOp): if op.data.task_id != tid1: - ret.append(f"CloneOp {node} returns {op.data.task_id} but the next op has tid {tid1}") + ret.append( + f"CloneOp {node} returns {op.data.task_id} but the next op has tid {tid1}" + ) break - elif op.data.task_type == TaskType.TASK_PTHREAD and op.data.task_id == op1.pthread_id: + elif ( + op.data.task_type == TaskType.TASK_PTHREAD + and op.data.task_id == op1.pthread_id + ): break - elif op.data.task_type == TaskType.TASK_ISO_C_THREAD and op.data.task_id == op1.iso_c_thread_id: + elif ( + op.data.task_type == TaskType.TASK_ISO_C_THREAD + and op.data.task_id == op1.iso_c_thread_id + ): break else: - ret.append(f"Could not find a successor for CloneOp {node} {TaskType(op.data.task_type).name} in the target thread/process/whatever") + ret.append( + f"Could not find a successor for CloneOp {node} {TaskType(op.data.task_type).name} in the target thread/process/whatever" + ) return ret -def validate_hb_degree(provlog: parser.ProvLog, process_graph: nx.DiGraph) -> list[str]: +def 
validate_hb_degree(provlog: ProvLog, process_graph: nx.DiGraph) -> list[str]: ret = list[str]() found_entry = False found_exit = False @@ -436,7 +558,7 @@ def validate_hb_degree(provlog: parser.ProvLog, process_graph: nx.DiGraph) -> li return ret -def validate_hb_acyclic(provlog: parser.ProvLog, process_graph: nx.DiGraph) -> list[str]: +def validate_hb_acyclic(provlog: ProvLog, process_graph: nx.DiGraph) -> list[str]: try: cycle = nx.find_cycle(process_graph) except nx.NetworkXNoCycle: @@ -445,7 +567,7 @@ def validate_hb_acyclic(provlog: parser.ProvLog, process_graph: nx.DiGraph) -> l return [f"Cycle detected: {cycle}"] -def validate_hb_execs(provlog: parser.ProvLog, process_graph: nx.DiGraph) -> list[str]: +def validate_hb_execs(provlog: ProvLog, process_graph: nx.DiGraph) -> list[str]: ret = list[str]() for node0 in process_graph.nodes(): pid0, eid0, tid0, op0 = node0 @@ -456,14 +578,18 @@ def validate_hb_execs(provlog: parser.ProvLog, process_graph: nx.DiGraph) -> lis op1 = prov_log_get_node(provlog, *node1) if isinstance(op1.data, InitExecEpochOp): if eid0 + 1 != eid1: - ret.append(f"ExecOp {node0} is followed by {node1}, whose exec epoch id should be {eid0 + 1}") + ret.append( + f"ExecOp {node0} is followed by {node1}, whose exec epoch id should be {eid0 + 1}" + ) break else: - ret.append(f"ExecOp {node0} is not followed by an InitExecEpochOp, but by {op1}.") + ret.append( + f"ExecOp {node0} is not followed by an InitExecEpochOp, but by {op1}." + ) return ret -def validate_hb_graph(processes: parser.ProvLog, hb_graph: nx.DiGraph) -> list[str]: +def validate_hb_graph(processes: ProvLog, hb_graph: nx.DiGraph) -> list[str]: ret = list[str]() # ret.extend(validate_hb_closes(processes, hb_graph)) ret.extend(validate_hb_waits(processes, hb_graph)) @@ -474,7 +600,9 @@ def validate_hb_graph(processes: parser.ProvLog, hb_graph: nx.DiGraph) -> list[s return ret -def relax_node(graph: nx.DiGraph, node: typing.Any) -> list[tuple[typing.Any, typing.Any]]: +def relax_node( + graph: nx.DiGraph, node: typing.Any +) -> list[tuple[typing.Any, typing.Any]]: """Remove node from graph and attach its predecessors to its successors""" ret = list[tuple[typing.Any, typing.Any]]() for predecessor in graph.predecessors: @@ -484,17 +612,18 @@ def relax_node(graph: nx.DiGraph, node: typing.Any) -> list[tuple[typing.Any, ty graph.remove_node(node) return ret -def color_hb_graph(prov_log: parser.ProvLog, process_graph: nx.DiGraph) -> None: + +def color_hb_graph(prov_log: ProvLog, process_graph: nx.DiGraph) -> None: label_color_map = { - EdgeLabels.EXEC: 'yellow', - EdgeLabels.FORK_JOIN: 'red', - EdgeLabels.PROGRAM_ORDER: 'green', + EdgeLabels.EXEC: "yellow", + EdgeLabels.FORK_JOIN: "red", + EdgeLabels.PROGRAM_ORDER: "green", } for node0, node1, attrs in process_graph.edges(data=True): - label: EdgeLabels = attrs['label'] - process_graph[node0][node1]['color'] = label_color_map[label] - del attrs['label'] + label: EdgeLabels = attrs["label"] + process_graph[node0][node1]["color"] = label_color_map[label] + del attrs["label"] for node, data in process_graph.nodes(data=True): pid, exid, tid, op_no = node @@ -515,9 +644,10 @@ def color_hb_graph(prov_log: parser.ProvLog, process_graph: nx.DiGraph) -> None: elif isinstance(op.data, StatOp): data["label"] += f"\n{op.data.path.path.decode()}" -def provlog_to_process_tree(prov_log: parser.ProvLog) -> nx.DiGraph: + +def provlog_to_process_tree(prov_log: ProvLog) -> nx.DiGraph: process_tree = collections.defaultdict(list) - + for pid, process in prov_log.processes.items(): 
for exec_epoch_no, exec_epoch in process.exec_epochs.items(): for tid, thread in exec_epoch.threads.items(): diff --git a/probe_src/python/probe_py/manual/cli.py b/probe_py/probe_py/cli.py similarity index 67% rename from probe_src/python/probe_py/manual/cli.py rename to probe_py/probe_py/cli.py index 1abdd4be..1398193f 100644 --- a/probe_src/python/probe_py/manual/cli.py +++ b/probe_py/probe_py/cli.py @@ -3,23 +3,28 @@ import json from typing_extensions import Annotated import pathlib -import typer +import subprocess import shutil import rich -from probe_py.manual.scp import scp_with_provenance +from probe_py.scp import scp_with_provenance +import os +import typer +import tempfile import rich.console import rich.pretty -from ..generated.parser import parse_probe_log, parse_probe_log_ctx +from .parser import parse_probe_log, parse_probe_log_ctx from . import analysis from .workflows import MakefileGenerator, NextflowGenerator -from .ssh_argparser import parse_ssh_args from . import file_closure from . import graph_utils -import subprocess -import os -import tempfile +from .ssh_argparser import parse_ssh_args import enum -from .persistent_provenance_db import Process, ProcessInputs, ProcessThatWrites, get_engine +from .persistent_provenance_db import ( + Process, + ProcessInputs, + ProcessThatWrites, + get_engine, +) from sqlalchemy.orm import Session from .analysis import ProcessNode, FileNode import shlex @@ -37,17 +42,18 @@ app.add_typer(export_app, name="export") - @app.command() def validate( - probe_log: Annotated[ - pathlib.Path, - typer.Argument(help="output file written by `probe record -o $file`."), - ] = pathlib.Path("probe_log"), - should_have_files: Annotated[ - bool, - typer.Argument(help="Whether to check that the probe_log was run with --copy-files.") - ] = False, + probe_log: Annotated[ + pathlib.Path, + typer.Argument(help="output file written by `probe record -o $file`."), + ] = pathlib.Path("probe_log"), + should_have_files: Annotated[ + bool, + typer.Argument( + help="Whether to check that the probe_log was run with --copy-files." + ), + ] = False, ) -> None: """Sanity-check probe_log and report errors.""" warning_free = True @@ -55,7 +61,9 @@ def validate( for inode, contents in parsed_probe_log.inodes.items(): content_length = contents.stat().st_size if inode.size != content_length: - console.print(f"Blob for {inode} has actual size {content_length}", style="red") + console.print( + f"Blob for {inode} has actual size {content_length}", style="red" + ) warning_free = False # At this point, the inode storage is gone, but the probe_log is already in memory if should_have_files and not parsed_probe_log.has_inodes: @@ -75,14 +83,11 @@ def validate( @export_app.command() def ops_graph( - output: Annotated[ - pathlib.Path, - typer.Argument() - ] = pathlib.Path("ops-graph.png"), - probe_log: Annotated[ - pathlib.Path, - typer.Argument(help="output file written by `probe record -o $file`."), - ] = pathlib.Path("probe_log"), + output: Annotated[pathlib.Path, typer.Argument()] = pathlib.Path("ops-graph.png"), + probe_log: Annotated[ + pathlib.Path, + typer.Argument(help="output file written by `probe record -o $file`."), + ] = pathlib.Path("probe_log"), ) -> None: """ Write a happens-before graph on the operations in probe_log. 
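# Illustrative sketch, not part of the patch: driving the same pipeline as
# `probe export ops-graph` from Python, using only functions visible in this
# diff (parse_probe_log, provlog_to_digraph, validate_hb_graph, color_hb_graph,
# serialize_graph). The exact call order inside ops_graph is an assumption
# based on the hunks shown here.
import pathlib
from probe_py import analysis, graph_utils
from probe_py.parser import parse_probe_log

prov_log = parse_probe_log(pathlib.Path("probe_log"))
hb_graph = analysis.provlog_to_digraph(prov_log)   # ops -> happens-before DiGraph
for warning in analysis.validate_hb_graph(prov_log, hb_graph):
    print(warning)
analysis.color_hb_graph(prov_log, hb_graph)        # edge colors from EdgeLabels
graph_utils.serialize_graph(hb_graph, pathlib.Path("ops-graph.png"))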
@@ -98,17 +103,16 @@ def ops_graph( analysis.color_hb_graph(prov_log, process_graph) graph_utils.serialize_graph(process_graph, output) - + @export_app.command() def dataflow_graph( - output: Annotated[ - pathlib.Path, - typer.Argument() - ] = pathlib.Path("dataflow-graph.png"), - probe_log: Annotated[ - pathlib.Path, - typer.Argument(help="output file written by `probe record -o $file`."), - ] = pathlib.Path("probe_log"), + output: Annotated[pathlib.Path, typer.Argument()] = pathlib.Path( + "dataflow-graph.png" + ), + probe_log: Annotated[ + pathlib.Path, + typer.Argument(help="output file written by `probe record -o $file`."), + ] = pathlib.Path("probe_log"), ) -> None: """ Write a dataflow graph for probe_log. @@ -119,6 +123,7 @@ def dataflow_graph( dataflow_graph = analysis.provlog_to_dataflow_graph(prov_log) graph_utils.serialize_graph(dataflow_graph, output) + def get_host_name() -> int: hostname = socket.gethostname() rng = random.Random(int(datetime.datetime.now().timestamp()) ^ hash(hostname)) @@ -127,11 +132,14 @@ def get_host_name() -> int: random_number = rng.getrandbits(bits_per_hex_digit * hex_digits) return random_number + @export_app.command() -def store_dataflow_graph(probe_log: Annotated[ - pathlib.Path, - typer.Argument(help="output file written by `probe record -o $file`."), - ] = pathlib.Path("probe_log"))->None: +def store_dataflow_graph( + probe_log: Annotated[ + pathlib.Path, + typer.Argument(help="output file written by `probe record -o $file`."), + ] = pathlib.Path("probe_log"), +) -> None: prov_log = parse_probe_log(probe_log) dataflow_graph = analysis.provlog_to_dataflow_graph(prov_log) engine = get_engine() @@ -139,10 +147,15 @@ def store_dataflow_graph(probe_log: Annotated[ for node in dataflow_graph.nodes(): if isinstance(node, ProcessNode): print(node) - new_process = Process(process_id = int(node.pid), parent_process_id = 0, cmd = shlex.join(node.cmd), time = datetime.datetime.now()) + new_process = Process( + process_id=int(node.pid), + parent_process_id=0, + cmd=shlex.join(node.cmd), + time=datetime.datetime.now(), + ) session.add(new_process) - for (node1, node2) in dataflow_graph.edges(): + for node1, node2 in dataflow_graph.edges(): if isinstance(node1, ProcessNode) and isinstance(node2, ProcessNode): parent_process_id = node1.pid child_process = session.get(Process, node2.pid) @@ -155,7 +168,16 @@ def store_dataflow_graph(probe_log: Annotated[ stat_info = os.stat(node2.file) mtime = int(stat_info.st_mtime * 1_000_000_000) size = stat_info.st_size - new_output_inode = ProcessThatWrites(inode = inode_info.inode, process_id = node1.pid, device_major = inode_info.device_major, device_minor = inode_info.device_minor, host = host, path = node2.file, mtime = mtime, size = size) + new_output_inode = ProcessThatWrites( + inode=inode_info.inode, + process_id=node1.pid, + device_major=inode_info.device_major, + device_minor=inode_info.device_minor, + host=host, + path=node2.file, + mtime=mtime, + size=size, + ) session.add(new_output_inode) elif isinstance(node1, FileNode) and isinstance(node2, ProcessNode): @@ -164,7 +186,16 @@ def store_dataflow_graph(probe_log: Annotated[ stat_info = os.stat(node1.file) mtime = int(stat_info.st_mtime * 1_000_000_000) size = stat_info.st_size - new_input_inode = ProcessInputs(inode = inode_info.inode, process_id=node2.pid, device_major=inode_info.device_major, device_minor= inode_info.device_minor, host = host, path = node1.file, mtime=mtime, size=size) + new_input_inode = ProcessInputs( + inode=inode_info.inode, + 
process_id=node2.pid, + device_major=inode_info.device_major, + device_minor=inode_info.device_minor, + host=host, + path=node1.file, + mtime=mtime, + size=size, + ) session.add(new_input_inode) root_process = None @@ -182,12 +213,13 @@ def store_dataflow_graph(probe_log: Annotated[ session.commit() + @export_app.command() def debug_text( - probe_log: Annotated[ - pathlib.Path, - typer.Argument(help="output file written by `probe record -o $file`."), - ] = pathlib.Path("probe_log"), + probe_log: Annotated[ + pathlib.Path, + typer.Argument(help="output file written by `probe record -o $file`."), + ] = pathlib.Path("probe_log"), ) -> None: """ Write the data from probe_log in a human-readable manner. @@ -208,16 +240,19 @@ def debug_text( max_string=40, ) for ivl, path in sorted(prov_log.inodes.items()): - out_console.print(f"device={ivl.device_major}.{ivl.device_minor} inode={ivl.inode} mtime={ivl.tv_sec}.{ivl.tv_nsec} -> {ivl.size} blob") + out_console.print( + f"device={ivl.device_major}.{ivl.device_minor} inode={ivl.inode} mtime={ivl.tv_sec}.{ivl.tv_nsec} -> {ivl.size} blob" + ) + @export_app.command() def docker_image( - image_name: str, - probe_log: Annotated[ - pathlib.Path, - typer.Argument(help="output file written by `probe record -o $file`."), - ] = pathlib.Path("probe_log"), - verbose: bool = True, + image_name: str, + probe_log: Annotated[ + pathlib.Path, + typer.Argument(help="output file written by `probe record -o $file`."), + ] = pathlib.Path("probe_log"), + verbose: bool = True, ) -> None: """Generate a docker image from a probe_log with --copy-files @@ -249,14 +284,15 @@ def docker_image( console, ) + @export_app.command() def oci_image( - image_name: str, - probe_log: Annotated[ - pathlib.Path, - typer.Argument(help="output file written by `probe record -o $file`."), - ] = pathlib.Path("probe_log"), - verbose: bool = True, + image_name: str, + probe_log: Annotated[ + pathlib.Path, + typer.Argument(help="output file written by `probe record -o $file`."), + ] = pathlib.Path("probe_log"), + verbose: bool = True, ) -> None: """Generate an OCI image from a probe_log with --copy-files @@ -290,8 +326,10 @@ def oci_image( ), ) def ssh( - ssh_args: list[str], - debug: bool = typer.Option(default=False, help="Run verbose & debug build of libprobe"), + ssh_args: list[str], + debug: bool = typer.Option( + default=False, help="Run verbose & debug build of libprobe" + ), ) -> None: """ Wrap SSH and record provenance of the remote command. 
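# Illustrative sketch, not part of the patch: the core trick the hunks below
# implement. libprobe is uploaded into a mktemp'd directory on the remote host,
# the remote command runs under LD_PRELOAD with __PROBE_DIR pointing into that
# directory, and the directory is then tarred up and scp'd back. `user@host`,
# the fixed remote_dir, and the `env`-style invocation are simplifying
# assumptions here.
import subprocess

destination = "user@host"
remote_dir = "/tmp/probe_log_XXXXXX"  # the real code creates this with `mktemp -d`
subprocess.run(
    [
        "ssh", destination,
        "env",
        f"LD_PRELOAD={remote_dir}/libprobe.so",
        f"__PROBE_DIR={remote_dir}/probe_dir",
        "true",  # placeholder for the user's remote command
    ],
    check=True,
)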
@@ -301,7 +339,9 @@ def ssh( ssh_cmd = ["ssh"] + flags - libprobe = pathlib.Path(os.environ["__PROBE_LIB"]) / ("libprobe-dbg.so" if debug else "libprobe.so") + libprobe = pathlib.Path(os.environ["__PROBE_LIB"]) / ( + "libprobe-dbg.so" if debug else "libprobe.so" + ) if not libprobe.exists(): typer.secho(f"Libprobe not found at {libprobe}", fg=typer.colors.RED) raise typer.Abort() @@ -320,11 +360,13 @@ def ssh( raise NotImplementedError("Remote platform is different from local platform") # Upload libprobe.so to the remote temporary directory - remote_temp_dir_cmd = ssh_cmd + [destination] + ["mktemp", "-d", "/tmp/probe_log_XXXXXX"] + remote_temp_dir_cmd = ( + ssh_cmd + [destination] + ["mktemp", "-d", "/tmp/probe_log_XXXXXX"] + ) remote_temp_dir = subprocess.check_output(remote_temp_dir_cmd).decode().strip() remote_probe_dir = f"{remote_temp_dir}/probe_dir" - ssh_g = subprocess.run(ssh_cmd + [destination] + ['-G'],stdout=subprocess.PIPE) + ssh_g = subprocess.run(ssh_cmd + [destination] + ["-G"], stdout=subprocess.PIPE) ssh_g_op = ssh_g.stdout.decode().strip().splitlines() ssh_pair = [] @@ -333,15 +375,15 @@ def ssh( scp_cmd = ["scp"] for option in ssh_g_op: - key_value = option.split(' ', 1) + key_value = option.split(" ", 1) if len(key_value) == 2: key, value = key_value scp_cmd.append(f"-o {key}={value}") - scp_args =[str(libprobe),f"{destination}:{remote_temp_dir}"] + scp_args = [str(libprobe), f"{destination}:{remote_temp_dir}"] scp_cmd.extend(scp_args) - subprocess.run(scp_cmd,check=True) + subprocess.run(scp_cmd, check=True) # Prepare the remote command with LD_PRELOAD and __PROBE_DIR ld_preload = f"{remote_temp_dir}/{libprobe.name}" @@ -352,13 +394,23 @@ def ssh( # Download the provenance log from the remote machine remote_tar_file = f"{remote_temp_dir}.tar.gz" - tar_cmd = ssh_cmd + [destination] + ["tar", "-czf", remote_tar_file, "-C", remote_temp_dir, "."] + tar_cmd = ( + ssh_cmd + + [destination] + + ["tar", "-czf", remote_tar_file, "-C", remote_temp_dir, "."] + ) subprocess.run(tar_cmd, check=True) # Download the tarball to the local machine local_tar_file = local_temp_dir / f"{remote_temp_dir.split('/')[-1]}.tar.gz" - scp_download_cmd = ["scp"] + scp_cmd[1:-2] + [f"{destination}:{remote_tar_file}", str(local_tar_file)] - typer.secho(f"PROBE log downloaded at: {scp_download_cmd[-1]}",fg=typer.colors.GREEN) + scp_download_cmd = ( + ["scp"] + + scp_cmd[1:-2] + + [f"{destination}:{remote_tar_file}", str(local_tar_file)] + ) + typer.secho( + f"PROBE log downloaded at: {scp_download_cmd[-1]}", fg=typer.colors.GREEN + ) subprocess.run(scp_download_cmd, check=True) # Clean up the remote temporary directory @@ -370,20 +422,22 @@ def ssh( raise typer.Exit(proc.returncode) + class OutputFormat(str, enum.Enum): makefile = "makefile" nextflow = "nextflow" + @export_app.command() def makefile( - output: Annotated[ - pathlib.Path, - typer.Argument(), - ] = pathlib.Path("Makefile"), - probe_log: Annotated[ - pathlib.Path, - typer.Argument(help="output file written by `probe record -o $file`."), - ] = pathlib.Path("probe_log"), + output: Annotated[ + pathlib.Path, + typer.Argument(), + ] = pathlib.Path("Makefile"), + probe_log: Annotated[ + pathlib.Path, + typer.Argument(help="output file written by `probe record -o $file`."), + ] = pathlib.Path("probe_log"), ) -> None: """ Export the probe_log to a Makefile @@ -395,16 +449,17 @@ def makefile( script = g.generate_makefile(dataflow_graph) output.write_text(script) + @export_app.command() def nextflow( - output: Annotated[ - pathlib.Path, - 
typer.Argument(), - ] = pathlib.Path("nextflow.nf"), - probe_log: Annotated[ - pathlib.Path, - typer.Argument(help="output file written by `probe record -o $file`."), - ] = pathlib.Path("probe_log"), + output: Annotated[ + pathlib.Path, + typer.Argument(), + ] = pathlib.Path("nextflow.nf"), + probe_log: Annotated[ + pathlib.Path, + typer.Argument(help="output file written by `probe record -o $file`."), + ] = pathlib.Path("probe_log"), ) -> None: """ Export the probe_log to a Nextflow workflow @@ -416,16 +471,16 @@ def nextflow( script = g.generate_workflow(dataflow_graph) output.write_text(script) + @export_app.command() def provlog_to_process_tree( - output: Annotated[ - pathlib.Path, - typer.Argument() - ] = pathlib.Path("provlog-process-tree.png"), - probe_log: Annotated[ - pathlib.Path, - typer.Argument(help="output file written by `probe record -o $file`."), - ] = pathlib.Path("probe_log"), + output: Annotated[pathlib.Path, typer.Argument()] = pathlib.Path( + "provlog-process-tree.png" + ), + probe_log: Annotated[ + pathlib.Path, + typer.Argument(help="output file written by `probe record -o $file`."), + ] = pathlib.Path("probe_log"), ) -> None: """ Write a process tree from probe_log. @@ -439,10 +494,10 @@ def provlog_to_process_tree( @export_app.command() def ops_jsonl( - probe_log: Annotated[ - pathlib.Path, - typer.Argument(help="output file written by `probe record -o $file`."), - ] = pathlib.Path("probe_log"), + probe_log: Annotated[ + pathlib.Path, + typer.Argument(help="output file written by `probe record -o $file`."), + ] = pathlib.Path("probe_log"), ) -> None: """ Export each op to a JSON line. @@ -451,47 +506,54 @@ def ops_jsonl( """ def filter_nested_dict( - dct: typing.Mapping[typing.Any, typing.Any], + dct: typing.Mapping[typing.Any, typing.Any], ) -> typing.Mapping[typing.Any, typing.Any]: """Converts the bytes in a nested dict to a string""" return { key: ( # If dict, Recurse self - filter_nested_dict(val) if isinstance(val, dict) else + filter_nested_dict(val) + if isinstance(val, dict) # If bytes, decode to string - val.decode(errors="surrogateescape") if isinstance(val, bytes) else + else val.decode(errors="surrogateescape") + if isinstance(val, bytes) # Else, do nothing - val + else val ) for key, val in dct.items() } + stdout_console = rich.console.Console() prov_log = parse_probe_log(probe_log) for pid, process in prov_log.processes.items(): for exec_epoch_no, exec_epoch in process.exec_epochs.items(): for tid, thread in exec_epoch.threads.items(): for i, op in enumerate(thread.ops): - stdout_console.print_json(json.dumps({ - "pid": pid, - "tid": tid, - "exec_epoch_no": exec_epoch_no, - "i": i, - "op": filter_nested_dict( - dataclasses.asdict(op), - ), - "op_data_type": type(op.data).__name__, - })) + stdout_console.print_json( + json.dumps( + { + "pid": pid, + "tid": tid, + "exec_epoch_no": exec_epoch_no, + "i": i, + "op": filter_nested_dict( + dataclasses.asdict(op), + ), + "op_data_type": type(op.data).__name__, + } + ) + ) # Example: scp Desktop/sample_example.txt root@136.183.142.28:/home/remote_dir @app.command( -context_settings=dict( + context_settings=dict( ignore_unknown_options=True, ), ) def scp(cmd: list[str]) -> None: scp_with_provenance(cmd) + if __name__ == "__main__": app() - diff --git a/probe_src/python/probe_py/manual/consts.py b/probe_py/probe_py/consts.py similarity index 73% rename from probe_src/python/probe_py/manual/consts.py rename to probe_py/probe_py/consts.py index fbe136df..4250413e 100644 --- 
a/probe_src/python/probe_py/manual/consts.py +++ b/probe_py/probe_py/consts.py @@ -1,6 +1,9 @@ import typing +import xdg_base_dirs +PROBE_HOME: typing.Final = xdg_base_dirs.xdg_data_home() / "PROBE" + # echo -e '#include <fcntl.h>\nAT_FDCWD' | gcc -E - | tail --lines=1 AT_FDCWD: typing.Final = -100 diff --git a/probe_src/python/probe_py/manual/file_closure.py b/probe_py/probe_py/file_closure.py similarity index 79% rename from probe_src/python/probe_py/manual/file_closure.py rename to probe_py/probe_py/file_closure.py index 419e1425..6ff81edb 100644 --- a/probe_src/python/probe_py/manual/file_closure.py +++ b/probe_py/probe_py/file_closure.py @@ -9,25 +9,29 @@ import warnings import pathlib import typing -from probe_py.generated.parser import ProvLog, InodeVersionLog -from probe_py.generated.ops import Path, ChdirOp, OpenOp, CloseOp, InitProcessOp, ExecOp +from .ptypes import ProvLog, InodeVersionLog +from .ops import Path, ChdirOp, OpenOp, CloseOp, InitProcessOp, ExecOp from .consts import AT_FDCWD def build_oci_image( - prov_log: ProvLog, - image_name: str, - push_docker: bool, - verbose: bool, - console: rich.console.Console, + prov_log: ProvLog, + image_name: str, + push_docker: bool, + verbose: bool, + console: rich.console.Console, ) -> None: root_pid = get_root_pid(prov_log) if root_pid is None: - console.print("Could not find root process; Are you sure this probe_log is valid?") + console.print( + "Could not find root process; Are you sure this probe_log is valid?" + ) raise typer.Exit(code=1) first_op = prov_log.processes[root_pid].exec_epochs[0].threads[root_pid].ops[0].data if not isinstance(first_op, InitProcessOp): - console.print("First op is not InitProcessOp. Are you sure this probe_log is valid?") + console.print( + "First op is not InitProcessOp. Are you sure this probe_log is valid?" + ) raise typer.Exit(code=1) with tempfile.TemporaryDirectory() as _tmpdir: tmpdir = pathlib.Path(_tmpdir) @@ -40,7 +44,10 @@ def build_oci_image( ) # TODO: smartly show errors when shelling out to $cmd fails. if not shutil.which("buildah"): - console.print("Buildah not found; should be included in probe-bundled? for other packages, please install Buildah separately", style="red") + console.print( + "Buildah not found; should be included in probe-bundled? for other packages, please install Buildah separately", + style="red", + ) raise typer.Exit(code=1) # Start container @@ -57,7 +64,9 @@ def build_oci_image( # Copy relevant files if verbose: - console.print(shlex.join(["buildah", "copy", container_id, str(tmpdir), "/"])) + console.print( + shlex.join(["buildah", "copy", container_id, str(tmpdir), "/"]) + ) subprocess.run( ["buildah", "copy", container_id, str(tmpdir), "/"], check=True, @@ -68,25 +77,29 @@ def build_oci_image( # Set up other config (env, cmd, entrypoint) pid = get_root_pid(prov_log) if pid is None: - console.print("Could not find root process; Are you sure this probe_log is valid?") + console.print( + "Could not find root process; Are you sure this probe_log is valid?" + ) raise typer.Exit(code=1) last_op = prov_log.processes[pid].exec_epochs[0].threads[pid].ops[-1].data if not isinstance(last_op, ExecOp): - console.print("Last op is not ExecOp. Are you sure this probe_log is valid?") + console.print( + "Last op is not ExecOp. Are you sure this probe_log is valid?" 
+ ) raise typer.Exit(code=1) - args = [ - arg.decode() for arg in last_op.argv - ] + args = [arg.decode() for arg in last_op.argv] env = [] for key_val in last_op.env: if not key_val.startswith(b"LD_PRELOAD="): if b"$" in key_val: # TODO: figure out how to escape money - console.log(f"Skipping {key_val.decode(errors='surrogate')} because $ confuses Buildah.") + console.log( + f"Skipping {key_val.decode(errors='surrogate')} because $ confuses Buildah." + ) continue env.append("--env") env.append(key_val.decode(errors="surrogate")) - #shell = pathlib.Path(os.environ["SHELL"]).resolve() + # shell = pathlib.Path(os.environ["SHELL"]).resolve() cmd = [ "buildah", "config", @@ -131,11 +144,11 @@ def build_oci_image( def copy_file_closure( - prov_log: ProvLog, - destination: pathlib.Path, - copy: bool, - verbose: bool, - console: rich.console.Console, + prov_log: ProvLog, + destination: pathlib.Path, + copy: bool, + verbose: bool, + console: rich.console.Console, ) -> None: """Extract files used by the application recorded in prov_log to destination @@ -155,11 +168,21 @@ def copy_file_closure( for exec_epoch_no, exec_epoch in process.exec_epochs.items(): root_pid = get_root_pid(prov_log) if root_pid is None: - console.print("Could not find root process; Are you sure this probe_log is valid?") + console.print( + "Could not find root process; Are you sure this probe_log is valid?" + ) raise typer.Exit(code=1) - first_op = prov_log.processes[root_pid].exec_epochs[0].threads[root_pid].ops[0].data + first_op = ( + prov_log.processes[root_pid] + .exec_epochs[0] + .threads[root_pid] + .ops[0] + .data + ) if not isinstance(first_op, InitProcessOp): - console.print("First op is not InitProcessOp. Are you sure this probe_log is valid?") + console.print( + "First op is not InitProcessOp. Are you sure this probe_log is valid?" + ) raise typer.Exit(code=1) fds = {AT_FDCWD: pathlib.Path(first_op.cwd.path.decode())} for tid, thread in exec_epoch.threads.items(): @@ -219,19 +242,24 @@ def copy_file_closure( destination_path.hardlink_to(inode_content) if verbose: console.print(f"Hardlinking {resolved_path} from prov_log") - elif any(resolved_path.is_relative_to(forbidden_path) for forbidden_path in forbidden_paths): + elif any( + resolved_path.is_relative_to(forbidden_path) + for forbidden_path in forbidden_paths + ): if verbose: console.print(f"Skipping {resolved_path}") elif resolved_path.exists(): if ivl is not None and InodeVersionLog.from_path(resolved_path) != ivl: - warnings.warn(f"{resolved_path} changed in between the time of `probe record` and now.") + warnings.warn( + f"{resolved_path} changed in between the time of `probe record` and now." 
+ ) if resolved_path.is_dir(): destination_path.mkdir(exist_ok=True, parents=True) elif copy: if verbose: console.print(f"Copying {resolved_path} from disk") shutil.copy2(resolved_path, destination_path) - else: # not directory and hardlink + else: # not directory and hardlink if verbose: console.print(f"Hardlinking {resolved_path} from disk") destination_path.hardlink_to(resolved_path) @@ -240,11 +268,11 @@ def copy_file_closure( def resolve_path( - fds: typing.Mapping[int, pathlib.Path], - path: Path, + fds: typing.Mapping[int, pathlib.Path], + path: Path, ) -> pathlib.Path: if path.path.startswith(b"/"): - return pathlib.Path(path.path.decode()) # what a mouthful + return pathlib.Path(path.path.decode()) # what a mouthful elif dir_path := fds.get(path.dirfd): return dir_path / pathlib.Path(path.path.decode()) else: @@ -262,6 +290,7 @@ def get_root_pid(prov_log: ProvLog) -> int | None: ldd_regex = re.compile(r"\s+(?P<path>/[a-zA-Z0-9./-]+)\s+") ldd = shutil.which("ldd") + def _get_dlibs(exe_or_dlib: pathlib.Path, found: set[str]) -> None: if not ldd: raise ValueError("ldd not found") diff --git a/probe_py/probe_py/graph_utils.py b/probe_py/probe_py/graph_utils.py new file mode 100644 index 00000000..f72df419 --- /dev/null +++ b/probe_py/probe_py/graph_utils.py @@ -0,0 +1,45 @@ +import typing +import pathlib +import networkx # type: ignore + + +_Node = typing.TypeVar("_Node") + + +if typing.TYPE_CHECKING: + DiGraph: typing.TypeAlias = networkx.DiGraph[_Node] +else: + + class DiGraph(typing.Generic[_Node], networkx.DiGraph): + pass + + +def serialize_graph( + graph: DiGraph[_Node], + output: pathlib.Path, +) -> None: + pydot_graph = networkx.drawing.nx_pydot.to_pydot(graph) + if output.suffix == ".dot": + pydot_graph.write_raw(output) + else: + pydot_graph.write_png(output) + + +def relax_node(graph: DiGraph[_Node], node: _Node) -> list[tuple[_Node, _Node]]: + """Remove node from graph and attach its predecessors to its successors""" + ret = list[tuple[_Node, _Node]]() + for predecessor in graph.predecessors(node): + for successor in graph.successors(node): + ret.append((predecessor, successor)) + graph.add_edge(predecessor, successor) + graph.remove_node(node) + return ret + + +def list_edges_from_start_node( + graph: DiGraph[_Node], start_node: _Node +) -> typing.Iterable[tuple[_Node, _Node]]: + all_edges = list(graph.edges()) + start_index = next(i for i, edge in enumerate(all_edges) if edge[0] == start_node) + ordered_edges = all_edges[start_index:] + all_edges[:start_index] + return ordered_edges diff --git a/probe_src/frontend/python/probe_py/generated/parser.py b/probe_py/probe_py/parser.py similarity index 58% rename from probe_src/frontend/python/probe_py/generated/parser.py rename to probe_py/probe_py/parser.py index fb46508b..7927546a 100644 --- a/probe_src/frontend/python/probe_py/generated/parser.py +++ b/probe_py/probe_py/parser.py @@ -1,62 +1,24 @@ from __future__ import annotations -import os -import contextlib -import tempfile import pathlib import typing import json import tarfile -from dataclasses import dataclass, replace +import tempfile +import contextlib from . 
import ops - -@dataclass(frozen=True) -class ThreadProvLog: - tid: int - ops: typing.Sequence[ops.Op] - -@dataclass(frozen=True) -class ExecEpochProvLog: - epoch: int - threads: typing.Mapping[int, ThreadProvLog] - - -@dataclass(frozen=True) -class ProcessProvLog: - pid: int - exec_epochs: typing.Mapping[int, ExecEpochProvLog] +from .ptypes import ( + ProvLog, + InodeVersionLog, + ThreadProvLog, + ExecEpochProvLog, + ProcessProvLog, +) +from dataclasses import replace -@dataclass(frozen=True) -class InodeVersionLog: - device_major: int - device_minor: int - inode: int - tv_sec: int - tv_nsec: int - size: int - - @staticmethod - def from_path(path: pathlib.Path) -> InodeVersionLog: - s = path.stat() - return InodeVersionLog( - os.major(s.st_dev), - os.minor(s.st_dev), - s.st_ino, - s.st_mtime_ns // int(1e9), - s.st_mtime_ns % int(1e9), - s.st_size, - ) - - -@dataclass(frozen=True) -class ProvLog: - processes: typing.Mapping[int, ProcessProvLog] - inodes: typing.Mapping[InodeVersionLog, pathlib.Path] - has_inodes: bool - @contextlib.contextmanager def parse_probe_log_ctx( - probe_log: pathlib.Path, + probe_log: pathlib.Path, ) -> typing.Iterator[ProvLog]: """Parse probe log; return provenance data and inode contents""" with tempfile.TemporaryDirectory() as _tmpdir: @@ -64,13 +26,16 @@ def parse_probe_log_ctx( with tarfile.open(probe_log, mode="r") as tar: tar.extractall(tmpdir, filter="data") has_inodes = (tmpdir / "info" / "copy_files").exists() - inodes = { - InodeVersionLog(*[ - int(segment, 16) - for segment in file.name.split("-") - ]): file - for file in (tmpdir / "inodes").iterdir() - } if (tmpdir / "inodes").exists() else {} + inodes = ( + { + InodeVersionLog( + *[int(segment, 16) for segment in file.name.split("-")] + ): file + for file in (tmpdir / "inodes").iterdir() + } + if (tmpdir / "inodes").exists() + else {} + ) processes = {} for pid_dir in (tmpdir / "pids").iterdir(): @@ -83,18 +48,22 @@ def parse_probe_log_ctx( tid = int(tid_file.name) # read, split, comprehend, deserialize, extend jsonlines = tid_file.read_text().strip().split("\n") - tids[tid] = ThreadProvLog(tid, [json.loads(x, object_hook=op_hook) for x in jsonlines]) + tids[tid] = ThreadProvLog( + tid, [json.loads(x, object_hook=op_hook) for x in jsonlines] + ) epochs[epoch] = ExecEpochProvLog(epoch, tids) processes[pid] = ProcessProvLog(pid, epochs) yield ProvLog(processes, inodes, has_inodes) + def parse_probe_log( - probe_log: pathlib.Path, + probe_log: pathlib.Path, ) -> ProvLog: """Parse probe log; return provenance data, but throw away inode contents""" with parse_probe_log_ctx(probe_log) as prov_log: return replace(prov_log, has_inodes=False, inodes={}) + def op_hook(json_map: typing.Dict[str, typing.Any]) -> typing.Any: ty: str = json_map["_type"] json_map.pop("_type") @@ -102,6 +71,7 @@ def op_hook(json_map: typing.Dict[str, typing.Any]) -> typing.Any: constructor = ops.__dict__[ty] # HACK: convert jsonlines' lists of integers into python byte types + # This is because json cannot actually represent byte strings, only unicode strings. 
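+ # Illustrative example (hypothetical values, not taken from a real probe_log):
+ # a serialized Path map like {"_type": "Path", "dirfd": -100, "path": [47, 116, 109, 112]}
+ # reaches this hook with "path" as a list of ints; the loop below sees the
+ # "bytes" annotation and rebuilds bytes([47, 116, 109, 112]) == b"/tmp".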
for ident, ty in constructor.__annotations__.items(): if ty == "bytes" and ident in json_map: json_map[ident] = bytes(json_map[ident]) diff --git a/probe_src/python/probe_py/manual/persistent_provenance.py b/probe_py/probe_py/persistent_provenance.py similarity index 81% rename from probe_src/python/probe_py/manual/persistent_provenance.py rename to probe_py/probe_py/persistent_provenance.py index 35f660bb..05c122b8 100644 --- a/probe_src/python/probe_py/manual/persistent_provenance.py +++ b/probe_py/probe_py/persistent_provenance.py @@ -54,14 +54,14 @@ class Inode: def to_dict(self) -> dict[str, int | str]: return { - 'host': self.host, - 'device_major': self.device_major, - 'device_minor': self.device_minor, - 'inode': self.inode, + "host": self.host, + "device_major": self.device_major, + "device_minor": self.device_minor, + "inode": self.inode, } def str_id(self) -> str: - hex_part = self.host.split('.')[0] + hex_part = self.host.split(".")[0] if hex_part: number = int(hex_part, 16) else: @@ -69,7 +69,9 @@ def str_id(self) -> str: return f"{number:012x}-{self.device_major:04x}-{self.device_minor:04x}-{self.inode:016x}" @staticmethod - def from_local_path(path: pathlib.Path, stat_info: None | os.stat_result = None) -> Inode: + def from_local_path( + path: pathlib.Path, stat_info: None | os.stat_result = None + ) -> Inode: if stat_info is None: stat_info = os.stat(path) device_major = os.major(stat_info.st_dev) @@ -102,14 +104,16 @@ class InodeVersion: # - Cry. def to_dict(self) -> dict[str, typing.Any]: - data = {"mtime": self.mtime, 'inode': self.inode.to_dict(), "size": self.size} + data = {"mtime": self.mtime, "inode": self.inode.to_dict(), "size": self.size} return data def str_id(self) -> str: return f"{self.inode.str_id()}-{self.mtime:016x}-{self.size:016x}" @staticmethod - def from_local_path(path: pathlib.Path, stat_info: os.stat_result | None) -> InodeVersion: + def from_local_path( + path: pathlib.Path, stat_info: os.stat_result | None + ) -> InodeVersion: if stat_info is None: stat_info = os.stat(path) mtime = int(stat_info.st_mtime * 1_000_000_000) @@ -135,7 +139,9 @@ def to_dict(self) -> dict[str, typing.Any]: } @staticmethod - def from_local_path(path: pathlib.Path, stat_info: os.stat_result | None) -> InodeMetadata: + def from_local_path( + path: pathlib.Path, stat_info: os.stat_result | None + ) -> InodeMetadata: if stat_info is None: stat_info = os.stat(path) return InodeMetadata( @@ -161,29 +167,39 @@ class Process: def to_dict(self) -> dict[str, typing.Any]: return { - 'input_inodes': [inode_version.to_dict() for inode_version in self.input_inodes], - 'input_inode_metadatas': [metadata.to_dict() for metadata in self.input_inode_metadatas], - 'output_inodes': [inode_version.to_dict() for inode_version in self.output_inodes], - 'output_inode_metadatas': [metadata.to_dict() for metadata in self.output_inode_metadatas], - 'time': self.time.isoformat(), - 'cmd': list(self.cmd), - 'pid': self.pid, - 'env': [tuple(env_item) for env_item in self.env], - 'wd': str(self.wd), + "input_inodes": [ + inode_version.to_dict() for inode_version in self.input_inodes + ], + "input_inode_metadatas": [ + metadata.to_dict() for metadata in self.input_inode_metadatas + ], + "output_inodes": [ + inode_version.to_dict() for inode_version in self.output_inodes + ], + "output_inode_metadatas": [ + metadata.to_dict() for metadata in self.output_inode_metadatas + ], + "time": self.time.isoformat(), + "cmd": list(self.cmd), + "pid": self.pid, + "env": [tuple(env_item) for env_item in self.env], + 
"wd": str(self.wd), } # TODO: implement this for remote host def get_prov_upstream( - root_inode_version: list[InodeVersion], - host: str, + root_inode_version: list[InodeVersion], + host: str, ) -> tuple[dict[int, Process], dict[InodeVersion, int | None]]: """ This function answers: What do we need to reconstruct the provenance of root_inode_version on another host? The answer is a set of Process objects and a map of InodeVersion writes. """ if host != "local": - raise NotImplementedError("scp where source is remote is not implemented, because it would be hard to copy the remote prov") + raise NotImplementedError( + "scp where source is remote is not implemented, because it would be hard to copy the remote prov" + ) inode_version_queue = list[InodeVersion]() inode_version_queue.extend(root_inode_version) @@ -194,7 +210,9 @@ def get_prov_upstream( while inode_version_queue: inode_version = inode_version_queue.pop() if inode_version not in inode_version_writes: - process_id_path = PROCESS_ID_THAT_WROTE_INODE_VERSION / inode_version.str_id() + process_id_path = ( + PROCESS_ID_THAT_WROTE_INODE_VERSION / inode_version.str_id() + ) if process_id_path.exists(): process_id = json.loads(process_id_path.read_text()) inode_version_writes[inode_version] = process_id diff --git a/probe_src/python/probe_py/manual/persistent_provenance_db.py b/probe_py/probe_py/persistent_provenance_db.py similarity index 85% rename from probe_src/python/probe_py/manual/persistent_provenance_db.py rename to probe_py/probe_py/persistent_provenance_db.py index 51da5e4e..486ad58d 100644 --- a/probe_src/python/probe_py/manual/persistent_provenance_db.py +++ b/probe_py/probe_py/persistent_provenance_db.py @@ -5,23 +5,28 @@ import pathlib from datetime import datetime + class Base(DeclarativeBase): pass -_engine = None -def get_engine()->Engine: + +_engine: Engine | None = None + + +def get_engine() -> Engine: global _engine if _engine is None: home = pathlib.Path(xdg_base_dirs.xdg_data_home()) home.mkdir(parents=True, exist_ok=True) database_path = home / "probe_log.db" - _engine = create_engine(f'sqlite:///{database_path}', echo=True) + _engine = create_engine(f"sqlite:///{database_path}", echo=True) Base.metadata.create_all(_engine) return _engine + class ProcessThatWrites(Base): - __tablename__ = 'process_that_writes' + __tablename__ = "process_that_writes" id: Mapped[int] = mapped_column(primary_key=True, auto_increment=True) inode: Mapped[int] @@ -36,7 +41,7 @@ class ProcessThatWrites(Base): class Process(Base): - __tablename__ = 'process' + __tablename__ = "process" process_id: Mapped[int] = mapped_column(primary_key=True) parent_process_id: Mapped[int] @@ -45,7 +50,7 @@ class Process(Base): class ProcessInputs(Base): - __tablename__ = 'process_inputs' + __tablename__ = "process_inputs" id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True) inode: Mapped[int] diff --git a/probe_py/probe_py/ptypes.py b/probe_py/probe_py/ptypes.py new file mode 100644 index 00000000..f83ab26f --- /dev/null +++ b/probe_py/probe_py/ptypes.py @@ -0,0 +1,62 @@ +from __future__ import annotations +import pathlib +from . 
import ops +import os +from dataclasses import dataclass +import enum +import typing + + +@dataclass(frozen=True) +class ThreadProvLog: + tid: int + ops: typing.Sequence[ops.Op] + + +@dataclass(frozen=True) +class ExecEpochProvLog: + epoch: int + threads: typing.Mapping[int, ThreadProvLog] + + +@dataclass(frozen=True) +class ProcessProvLog: + pid: int + exec_epochs: typing.Mapping[int, ExecEpochProvLog] + + +@dataclass(frozen=True) +class InodeVersionLog: + device_major: int + device_minor: int + inode: int + tv_sec: int + tv_nsec: int + size: int + + @staticmethod + def from_path(path: pathlib.Path) -> InodeVersionLog: + s = path.stat() + return InodeVersionLog( + os.major(s.st_dev), + os.minor(s.st_dev), + s.st_ino, + s.st_mtime_ns // int(1e9), + s.st_mtime_ns % int(1e9), + s.st_size, + ) + + +@dataclass(frozen=True) +class ProvLog: + processes: typing.Mapping[int, ProcessProvLog] + inodes: typing.Mapping[InodeVersionLog, pathlib.Path] + has_inodes: bool + + +# TODO: implement this in probe_py.ops +class TaskType(enum.IntEnum): + TASK_PID = 0 + TASK_TID = 1 + TASK_ISO_C_THREAD = 2 + TASK_PTHREAD = 3 diff --git a/probe_src/frontend/python/probe_py/generated/py.typed b/probe_py/probe_py/py.typed similarity index 100% rename from probe_src/frontend/python/probe_py/generated/py.typed rename to probe_py/probe_py/py.typed diff --git a/probe_src/python/probe_py/manual/remote_access.py b/probe_py/probe_py/remote_access.py similarity index 75% rename from probe_src/python/probe_py/manual/remote_access.py rename to probe_py/probe_py/remote_access.py index d08b9c2e..59827883 100644 --- a/probe_src/python/probe_py/manual/remote_access.py +++ b/probe_py/probe_py/remote_access.py @@ -1,6 +1,6 @@ import dataclasses -from probe_py.manual.persistent_provenance import ( +from probe_py.persistent_provenance import ( Inode, InodeVersion, InodeMetadata, @@ -23,6 +23,7 @@ PROCESS_ID_THAT_WROTE_INODE_VERSION = PROBE_HOME / "process_id_that_wrote_inode_version" PROCESSES_BY_ID = PROBE_HOME / "processes_by_id" + @dataclasses.dataclass(frozen=True) class Host: network_name: str | None @@ -51,10 +52,14 @@ class HostPath: path: pathlib.Path -def copy_provenance(source: HostPath, destination: HostPath, cmd: tuple[str, ...]) -> None: +def copy_provenance( + source: HostPath, destination: HostPath, cmd: tuple[str, ...] 
+) -> None: provenance_info_source = lookup_provenance_source(source) provenance_info_destination = lookup_provenance_destination(source, destination) - provenance_info = augment_provenance(provenance_info_source, provenance_info_destination, cmd) + provenance_info = augment_provenance( + provenance_info_source, provenance_info_destination, cmd + ) # TODO: Support uploading all the provenance_info from multiple sources at once # Either copy_provenance should take multiple sources # Or it should return the provenance rather than uploading it @@ -80,7 +85,10 @@ def lookup_provenance_source(source: HostPath) -> ProvenanceInfo: else: return lookup_provenance_remote(source.host, source.path, True) -def lookup_provenance_destination(source: HostPath, destination: HostPath) -> ProvenanceInfo: + +def lookup_provenance_destination( + source: HostPath, destination: HostPath +) -> ProvenanceInfo: source_path = source.path if source_path.is_dir(): source_files = get_descendants(source_path, False) @@ -92,22 +100,34 @@ def lookup_provenance_destination(source: HostPath, destination: HostPath) -> Pr for path in source_files: destination_path = destination.path / path.name if destination.host.local: - inode_version, inode_metadata, _process_map, _inode_map = lookup_provenance_local(destination_path, False) + inode_version, inode_metadata, _process_map, _inode_map = ( + lookup_provenance_local(destination_path, False) + ) else: - inode_version, inode_metadata, _process_map, _inode_map = lookup_provenance_remote(destination.host, destination_path, False) + inode_version, inode_metadata, _process_map, _inode_map = ( + lookup_provenance_remote(destination.host, destination_path, False) + ) inode_versions.extend(inode_version) inode_metadatas.extend(inode_metadata) return inode_versions, inode_metadatas, {}, {} + def augment_provenance( - source_provenance_info: ProvenanceInfo, - destination_provenance_info: ProvenanceInfo, - cmd: tuple[str, ...], + source_provenance_info: ProvenanceInfo, + destination_provenance_info: ProvenanceInfo, + cmd: tuple[str, ...], ) -> ProvenanceInfo: """Given provenance_info of files on a previous host, insert nodes to represent a remote transfer to destination.""" - source_inode_versions, source_inode_metadatas, process_closure, inode_writes = source_provenance_info - destination_inode_versions, destination_inode_metadatas, _process_closure, _inode_writes = destination_provenance_info + source_inode_versions, source_inode_metadatas, process_closure, inode_writes = ( + source_provenance_info + ) + ( + destination_inode_versions, + destination_inode_metadatas, + _process_closure, + _inode_writes, + ) = destination_provenance_info scp_process_id = generate_random_pid() time = datetime.datetime.today() env: tuple[tuple[str, str], ...] 
= () @@ -132,7 +152,13 @@ def augment_provenance( for destination_inode_version in destination_inode_versions: inode_writes[destination_inode_version] = scp_process_id - return destination_inode_versions, destination_inode_metadatas , process_closure, inode_writes + return ( + destination_inode_versions, + destination_inode_metadatas, + process_closure, + inode_writes, + ) + def upload_provenance(dest: Host, provenance_info: ProvenanceInfo) -> None: if dest.local: @@ -140,7 +166,10 @@ def upload_provenance(dest: Host, provenance_info: ProvenanceInfo) -> None: else: upload_provenance_remote(dest, provenance_info) -def create_directories_on_remote(remote_home: pathlib.Path, remote: Host, ssh_options: list[str]) -> None: + +def create_directories_on_remote( + remote_home: pathlib.Path, remote: Host, ssh_options: list[str] +) -> None: remote_directories = [ f"{remote_home}/processes_by_id", f"{remote_home}/process_id_that_wrote_inode_version", # Add more directories as needed @@ -155,12 +184,16 @@ def create_directories_on_remote(remote_home: pathlib.Path, remote: Host, ssh_op mkdir_command.insert(-1, option) for directory in remote_directories: - mkdir_command.append(f"mkdir -p {directory}", ) + mkdir_command.append( + f"mkdir -p {directory}", + ) subprocess.run(mkdir_command, check=True) mkdir_command.pop() -def get_stat_results_remote(remote: Host, file_path: pathlib.Path, ssh_options: list[str]) -> bytes: +def get_stat_results_remote( + remote: Host, file_path: pathlib.Path, ssh_options: list[str] +) -> bytes: remote_scp_address = remote.get_address() ssh_command = [ "ssh", @@ -171,7 +204,9 @@ def get_stat_results_remote(remote: Host, file_path: pathlib.Path, ssh_options: ssh_command.append(f'stat -c "size: %s\nmode: 0x%f\n" {file_path}') try: - result = subprocess.run(ssh_command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + result = subprocess.run( + ssh_command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) output = result.stdout stats = yaml.safe_load(output) except subprocess.CalledProcessError as e: @@ -180,6 +215,7 @@ def get_stat_results_remote(remote: Host, file_path: pathlib.Path, ssh_options: file_size = stats["size"] return bytes(file_size) + def generate_random_pid() -> int: min_pid = 1 max_pid = 32767 @@ -187,7 +223,9 @@ def generate_random_pid() -> int: return random_pid -def get_descendants(root: pathlib.Path, include_directories: bool) -> list[pathlib.Path]: +def get_descendants( + root: pathlib.Path, include_directories: bool +) -> list[pathlib.Path]: queue = [root] ret = [] while queue: @@ -201,7 +239,9 @@ def get_descendants(root: pathlib.Path, include_directories: bool) -> list[pathl return ret -def lookup_provenance_local(path: pathlib.Path, get_persistent_provenance: bool) -> ProvenanceInfo: +def lookup_provenance_local( + path: pathlib.Path, get_persistent_provenance: bool +) -> ProvenanceInfo: if path.is_dir(): inode_versions = [ InodeVersion.from_local_path(descendant, None) @@ -220,8 +260,9 @@ def lookup_provenance_local(path: pathlib.Path, get_persistent_provenance: bool) return inode_versions, inode_metadatas, {}, {} - -def lookup_provenance_remote(host: Host, path: pathlib.Path, get_persistent_provenance: bool) -> ProvenanceInfo: +def lookup_provenance_remote( + host: Host, path: pathlib.Path, get_persistent_provenance: bool +) -> ProvenanceInfo: address = host.get_address() assert address is not None commands = [ @@ -256,35 +297,52 @@ def lookup_provenance_remote(host: Host, path: pathlib.Path, get_persistent_prov fields = 
proc.stdout.split("|") node_name = fields[0] - #cwd = pathlib.Path(fields[1]) + # cwd = pathlib.Path(fields[1]) inode_metadatas = [] inode_versions = [] - for _child_path, device, inode, mtime, size, mode, nlink, uid, gid in itertools.batched(fields[2:11], 10): - inode_object = Inode(node_name, os.major(int(device)), os.minor(int(device)), int(inode)) + for ( + _child_path, + device, + inode, + mtime, + size, + mode, + nlink, + uid, + gid, + ) in itertools.batched(fields[2:11], 10): + inode_object = Inode( + node_name, os.major(int(device)), os.minor(int(device)), int(inode) + ) inode_versions.append(InodeVersion(inode_object, int(float(mtime)), int(size))) - inode_metadatas.append(InodeMetadata(inode_object, int(mode), int(nlink), int(uid), int(gid))) + inode_metadatas.append( + InodeMetadata(inode_object, int(mode), int(nlink), int(uid), int(gid)) + ) if not get_persistent_provenance: return inode_versions, inode_metadatas, {}, {} - files_to_read:list[str] = [] + files_to_read: list[str] = [] # TODO: Implement this subprocess.run( [ "ssh", *host.ssh_options, address, - "sh", "-c", ";".join([ - "probe_data=${XDG_DATA_HOME:-$HOME/.local/share}/PROBE", - "processes_by_id=${probe_data}/process_id_that_wrote_inode_version", - "process_that_wrote=${probe_data}/processes_by_id", - - # cat the relevant stuff - *[ - f"cat $processes_by_id/{inode}.json && echo '\0'" - for inode in files_to_read - ], - ]), + "sh", + "-c", + ";".join( + [ + "probe_data=${XDG_DATA_HOME:-$HOME/.local/share}/PROBE", + "processes_by_id=${probe_data}/process_id_that_wrote_inode_version", + "process_that_wrote=${probe_data}/processes_by_id", + # cat the relevant stuff + *[ + f"cat $processes_by_id/{inode}.json && echo '\0'" + for inode in files_to_read + ], + ] + ), ], capture_output=True, check=True, @@ -296,10 +354,17 @@ def lookup_provenance_remote(host: Host, path: pathlib.Path, get_persistent_prov def upload_provenance_local(provenance_info: ProvenanceInfo) -> None: - destination_inode_versions, destination_inode_metadatas, augmented_process_closure, augmented_inode_writes = provenance_info + ( + destination_inode_versions, + destination_inode_metadatas, + augmented_process_closure, + augmented_inode_writes, + ) = provenance_info for inode_version, process_id in augmented_inode_writes.items(): - inode_version_path = PROCESS_ID_THAT_WROTE_INODE_VERSION / f"{inode_version.str_id()}.json" + inode_version_path = ( + PROCESS_ID_THAT_WROTE_INODE_VERSION / f"{inode_version.str_id()}.json" + ) with inode_version_path.open("w") as f: json.dump(process_id if process_id is not None else None, f) @@ -310,12 +375,19 @@ def upload_provenance_local(provenance_info: ProvenanceInfo) -> None: def upload_provenance_remote(dest: Host, provenance_info: ProvenanceInfo) -> None: - destination_inode_versions, destination_inode_metadatas, augmented_process_closure, augmented_inode_writes = provenance_info + ( + destination_inode_versions, + destination_inode_metadatas, + augmented_process_closure, + augmented_inode_writes, + ) = provenance_info for inode_version, process_id in augmented_inode_writes.items(): if inode_version not in destination_inode_versions: continue - inode_version_path = PROCESS_ID_THAT_WROTE_INODE_VERSION / f"{inode_version.str_id()}.json" + inode_version_path = ( + PROCESS_ID_THAT_WROTE_INODE_VERSION / f"{inode_version.str_id()}.json" + ) os.makedirs(inode_version_path.parent, exist_ok=True) with inode_version_path.open("w") as f: json.dump(process_id if process_id is not None else None, f) @@ -325,12 +397,12 @@ def 
upload_provenance_remote(dest: Host, provenance_info: ProvenanceInfo) -> Non for inode_version, process_id in augmented_inode_writes.items(): inode_version_str_id = inode_version.str_id() echo_commands.append( - f"echo {shlex.quote(json.dumps(process_id))} > \"${{process_that_wrote}}/{inode_version_str_id}.json\"" + f'echo {shlex.quote(json.dumps(process_id))} > "${{process_that_wrote}}/{inode_version_str_id}.json"' ) for process_id, process in augmented_process_closure.items(): echo_commands.append( - f"echo {(json.dumps(process.to_dict()))} > \"${{processes_by_id}}/{str(process_id)}.json\"" + f'echo {(json.dumps(process.to_dict()))} > "${{processes_by_id}}/{str(process_id)}.json"' ) commands = [ @@ -360,6 +432,7 @@ def upload_provenance_remote(dest: Host, provenance_info: ProvenanceInfo) -> Non text=True, ) + # Notes: # - scp.py is the driver and remote_access.py is the library. This way, remote_access.py can be re-imported into ssh. It makes more sense to me. # - Parse options completely in 1 function. Rather than have parsing scattered in different functions. diff --git a/probe_src/python/probe_py/manual/scp.py b/probe_py/probe_py/scp.py similarity index 66% rename from probe_src/python/probe_py/manual/scp.py rename to probe_py/probe_py/scp.py index d3c65d24..8a069d99 100644 --- a/probe_src/python/probe_py/manual/scp.py +++ b/probe_py/probe_py/scp.py @@ -2,7 +2,7 @@ import re import itertools import subprocess -from probe_py.manual.remote_access import Host, HostPath, copy_provenance +from probe_py.remote_access import Host, HostPath, copy_provenance def scp_with_provenance(scp_args: list[str]) -> int: @@ -35,12 +35,12 @@ def parse_scp_args(scp_args: list[str]) -> tuple[list[HostPath], HostPath]: Note that the Host type contains the instructions/options needed to connect to it. """ - scp_no_arg_options = {'-3', '-B', '-O', '-p', '-q', '-R', '-r', '-T'} - scp_one_arg_options = {'-3', '-B', '-D', '-l', '-S', '-X'} - common_no_arg_options = {'-4', '-6', '-A', '-C', '-v', '-q', '-v'} - common_one_arg_options = {'-c', '-F', '-i', '-J', '-o', '-v', '-q'} + scp_no_arg_options = {"-3", "-B", "-O", "-p", "-q", "-R", "-r", "-T"} + scp_one_arg_options = {"-3", "-B", "-D", "-l", "-S", "-X"} + common_no_arg_options = {"-4", "-6", "-A", "-C", "-v", "-q", "-v"} + common_one_arg_options = {"-c", "-F", "-i", "-J", "-o", "-v", "-q"} mapped_one_arg_options = { - '-P': '-p', + "-P": "-p", } scp_options = [] @@ -53,16 +53,22 @@ def parse_scp_args(scp_args: list[str]) -> tuple[list[HostPath], HostPath]: # TODO: This is not strictly accurate. # I would need to read SCP's source code or play around with it a bit to know for sure. 
# I believe "scp -oProxyCommand=foobar source dest" is valid - scp_args = list(itertools.chain.from_iterable([ - [f"-{option}" for option in arg[1:]] if arg.startswith("-") else [arg] - for arg in scp_args - ])) + scp_args = list( + itertools.chain.from_iterable( + [ + [f"-{option}" for option in arg[1:]] if arg.startswith("-") else [arg] + for arg in scp_args + ] + ) + ) i = 0 while i < len(scp_args): arg = scp_args[i] if arg.startswith("-"): - assert len(arg) == 2, f"We should have already replaced -abc with -a -b -c, yet we have {arg}" + assert ( + len(arg) == 2 + ), f"We should have already replaced -abc with -a -b -c, yet we have {arg}" if arg in scp_no_arg_options: scp_options.append(arg) elif arg in scp_one_arg_options: @@ -98,20 +104,31 @@ def parse_scp_args(scp_args: list[str]) -> tuple[list[HostPath], HostPath]: this_scp_options.append(match.group("port")) this_ssh_options.append("-P") this_ssh_options.append(match.group("port")) - sources.append(HostPath( - Host(match.group("host"), match.group("user"), this_ssh_options, this_scp_options), - Path(match.group("path") if match.group("path") else "") - )) + sources.append( + HostPath( + Host( + match.group("host"), + match.group("user"), + this_ssh_options, + this_scp_options, + ), + Path(match.group("path") if match.group("path") else ""), + ) + ) elif match := re.match(scp_path_regex, arg): - sources.append(HostPath( - Host(None, None, [], []), - Path(arg) - )) + sources.append(HostPath(Host(None, None, [], []), Path(arg))) elif match := re.match(scp_address_regex, arg): - sources.append(HostPath( - Host(match.group("host"), match.group("user"), ssh_options, scp_options), - Path(match.group("path") if match.group("path") else "") - )) + sources.append( + HostPath( + Host( + match.group("host"), + match.group("user"), + ssh_options, + scp_options, + ), + Path(match.group("path") if match.group("path") else ""), + ) + ) else: print(scp_url_regex) print(scp_address_regex) @@ -120,18 +137,24 @@ def parse_scp_args(scp_args: list[str]) -> tuple[list[HostPath], HostPath]: return sources[:-1], sources[-1] - # Define some regexp helpers. # In my opinion, "spelling" the regexp this way makes it much more readable. def concat(*args: str) -> str: return "".join(args) + + def optional(arg: str) -> str: return f"(?:{arg})?" + + def named_group(name: str, arg: str) -> str: return f"(?P<{name}>{arg})" + + def whole_string(arg: str) -> str: return "^" + arg + "$" + # TODO: do options only apply to the host following it? # E.g., does scp -J host-A host-B host-C only apply a jump-host to the host-B? 
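+# A quick sketch of how these helpers compose (illustrative only: `demo_regex` and
+# `m` are hypothetical names; `unix_username_regex` and `host_regex` are defined below):
+#
+#     demo_regex = whole_string(concat(
+#         optional(concat(named_group("user", unix_username_regex), "@")),
+#         named_group("host", host_regex),
+#     ))
+#     m = re.match(demo_regex, "alice@example.org")
+#     assert m and m.group("user") == "alice" and m.group("host") == "example.org"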
@@ -139,18 +162,22 @@ def whole_string(arg: str) -> str: unix_username_regex = "[a-z][-a-z0-9_]*" host_regex = r"[a-zA-Z0-9\.-]{1,63}" # scp://[user@]host[:port][/path] -scp_url_regex = whole_string(concat( - "scp://", - optional(concat(named_group("user", unix_username_regex), "@")), - named_group("host", host_regex), - optional(concat(":", named_group("port", r"\d+"))), - optional(concat("/", named_group("path", ".*"))), -)) +scp_url_regex = whole_string( + concat( + "scp://", + optional(concat(named_group("user", unix_username_regex), "@")), + named_group("host", host_regex), + optional(concat(":", named_group("port", r"\d+"))), + optional(concat("/", named_group("path", ".*"))), + ) +) path_regex = "[^:@]*" scp_path_regex = whole_string(path_regex) # [user@]host[:path] -scp_address_regex = whole_string(concat( - optional(concat(named_group("user", unix_username_regex), "@")), - named_group("host", host_regex), - optional(concat(":", named_group("path", path_regex))), -)) +scp_address_regex = whole_string( + concat( + optional(concat(named_group("user", unix_username_regex), "@")), + named_group("host", host_regex), + optional(concat(":", named_group("path", path_regex))), + ) +) diff --git a/probe_src/python/probe_py/manual/ssh_argparser.py b/probe_py/probe_py/ssh_argparser.py similarity index 68% rename from probe_src/python/probe_py/manual/ssh_argparser.py rename to probe_py/probe_py/ssh_argparser.py index 9c024e86..5489ff91 100644 --- a/probe_src/python/probe_py/manual/ssh_argparser.py +++ b/probe_py/probe_py/ssh_argparser.py @@ -1,9 +1,8 @@ def parse_ssh_args(ssh_args: list[str]) -> tuple[list[str], str, list[str]]: - one_arg_options = set("BbcDEeFIiJLlmOoPpRSWw") no_arg_options = set("46AaCfGgKkMNnqsTtVvXxYy") - state = 'start' + state = "start" i = 0 flags = [] destination = None @@ -12,41 +11,40 @@ def parse_ssh_args(ssh_args: list[str]) -> tuple[list[str], str, list[str]]: while i < len(ssh_args): curr_arg = ssh_args[i] - if state == 'start': + if state == "start": if curr_arg.startswith("-"): - state = 'flag' + state = "flag" elif destination is not None: - state = 'cmd' + state = "cmd" else: - state = 'destination' + state = "destination" - elif state == 'flag': + elif state == "flag": opt = curr_arg[-1] if opt in one_arg_options: - state = 'one_arg' + state = "one_arg" elif opt in no_arg_options: flags.append(curr_arg) - state = 'start' + state = "start" i += 1 - elif state == 'one_arg': + elif state == "one_arg": flags.extend([ssh_args[i - 1], curr_arg]) - state = 'start' + state = "start" i += 1 - elif state == 'destination': + elif state == "destination": if destination is None: destination = curr_arg - state = 'start' + state = "start" else: - state = 'cmd' + state = "cmd" continue i += 1 - elif state == 'cmd': + elif state == "cmd": remote_host.extend(ssh_args[i:]) break assert destination is not None return flags, destination, remote_host - diff --git a/probe_src/python/probe_py/manual/util.py b/probe_py/probe_py/util.py similarity index 86% rename from probe_src/python/probe_py/manual/util.py rename to probe_py/probe_py/util.py index b7acb659..71e2455a 100644 --- a/probe_src/python/probe_py/manual/util.py +++ b/probe_py/probe_py/util.py @@ -24,8 +24,11 @@ def default_tarinfo(path: pathlib.Path | str) -> tarfile.TarInfo: ) -def filter_relative_to(path: pathlib.Path) -> typing.Callable[[tarfile.TarInfo], tarfile.TarInfo]: +def filter_relative_to( + path: pathlib.Path, +) -> typing.Callable[[tarfile.TarInfo], tarfile.TarInfo]: def filter(member: tarfile.TarInfo) -> 
tarfile.TarInfo: member_path = pathlib.Path(member.name) return member.replace(name=str(member_path.relative_to(path))) + return filter diff --git a/probe_src/python/probe_py/manual/workflows.py b/probe_py/probe_py/workflows.py similarity index 72% rename from probe_src/python/probe_py/manual/workflows.py rename to probe_py/probe_py/workflows.py index db6f131a..9949c653 100644 --- a/probe_src/python/probe_py/manual/workflows.py +++ b/probe_py/probe_py/workflows.py @@ -1,5 +1,5 @@ -from probe_py.manual.analysis import ProcessNode, FileNode -import networkx as nx # type: ignore +from probe_py.analysis import ProcessNode, FileNode +import networkx as nx # type: ignore import abc from typing import List, Set, Optional import pathlib @@ -9,6 +9,7 @@ import subprocess from filecmp import cmp import re + """ All the cases we should take care of: 1- One Input, One Output [x] @@ -22,16 +23,19 @@ 9- File and Directory Structure Assumptions (Scripts that assume a specific directory structure, Commands that change the working directory (cd)) ... """ + + class WorkflowGenerator(abc.ABC): @abc.abstractmethod def generate_workflow(self, graph: nx.DiGraph) -> str: pass + class NextflowGenerator(WorkflowGenerator): def __init__(self) -> None: self.visited: Set[ProcessNode] = set() - self.process_counter: dict[ProcessNode, int] = {} - self.nextflow_script: list[str] = [] + self.process_counter: dict[ProcessNode, int] = {} + self.nextflow_script: list[str] = [] self.workflow: list[str] = [] def escape_filename_for_nextflow(self, filename: str) -> str: @@ -46,19 +50,20 @@ def escape_filename_for_nextflow(self, filename: str) -> str: if char.isalnum(): # Keep letters and numbers unchanged escaped_filename.append(char) else: # Replace other characters with their ASCII code in hex - escaped_filename.append(f'_{ord(char):02x}') + escaped_filename.append(f"_{ord(char):02x}") # Ensure the filename doesn't start with a number by prepending an escape code if escaped_filename and escaped_filename[0].isdigit(): - escaped_filename.insert(0, '_num_') - - return ''.join(escaped_filename) + escaped_filename.insert(0, "_num_") + return "".join(escaped_filename) - def handle_standard_case(self, process: ProcessNode, inputs: List[FileNode], outputs: List[FileNode]) -> str: + def handle_standard_case( + self, process: ProcessNode, inputs: List[FileNode], outputs: List[FileNode] + ) -> str: input_files = " ".join([f'path "{file.file}"\n ' for file in inputs]) output_files = " ".join([f'path "{file.file}"\n ' for file in outputs]) - + return f""" process process_{id(process)} {{ input: @@ -73,13 +78,18 @@ def handle_standard_case(self, process: ProcessNode, inputs: List[FileNode], out \"\"\" }}""" - - - def handle_inline_case(self, process: ProcessNode, inputs: List[FileNode], outputs: List[FileNode]) -> str: - input_files = " ".join([f'path "{os.path.basename(file.file)}"' for file in inputs]) + def handle_inline_case( + self, process: ProcessNode, inputs: List[FileNode], outputs: List[FileNode] + ) -> str: + input_files = " ".join( + [f'path "{os.path.basename(file.file)}"' for file in inputs] + ) output_files = " ".join( - [f'path "{os.path.splitext(os.path.basename(file.file))[0]}_modified{os.path.splitext(file.file)[1]}"' for - file in inputs]) + [ + f'path "{os.path.splitext(os.path.basename(file.file))[0]}_modified{os.path.splitext(file.file)[1]}"' + for file in inputs + ] + ) # Build inline commands for each file to perform copy, edit, and rename steps script_commands = [] @@ -92,14 +102,18 @@ def 
handle_inline_case(self, process: ProcessNode, inputs: List[FileNode], outpu modified_cmd = [] for cmd in process.cmd: # Substitute all occurrences of the original filename in each command - cmd_modified = re.sub(r"/(?:[a-zA-Z0-9_\-./]+/)*([a-zA-Z0-9_\-]+\.txt)", temp_name, cmd) + cmd_modified = re.sub( + r"/(?:[a-zA-Z0-9_\-./]+/)*([a-zA-Z0-9_\-]+\.txt)", temp_name, cmd + ) modified_cmd.append(cmd_modified) - script_commands.extend([ - f'cp {file.file} {temp_name}', # Copy to temp file - " ".join(modified_cmd), # Apply inline edit with temp filename - f'mv {temp_name} {final_name}' # Rename temp file to final output - ]) + script_commands.extend( + [ + f"cp {file.file} {temp_name}", # Copy to temp file + " ".join(modified_cmd), # Apply inline edit with temp filename + f"mv {temp_name} {final_name}", # Rename temp file to final output + ] + ) # Join script commands with newline and indentation for Nextflow process script_block = "\n ".join(script_commands) @@ -119,9 +133,13 @@ def handle_inline_case(self, process: ProcessNode, inputs: List[FileNode], outpu \"\"\" }}""" - def handle_dynamic_filenames(self, process: ProcessNode, inputs: List[FileNode], outputs: List[FileNode]) -> str: + def handle_dynamic_filenames( + self, process: ProcessNode, inputs: List[FileNode], outputs: List[FileNode] + ) -> str: input_files = " ".join([f'path "{file.file}"\n ' for file in inputs]) - output_files = " ".join([f'path "{file.file}"\n ' for file in outputs if file.file]) + output_files = " ".join( + [f'path "{file.file}"\n ' for file in outputs if file.file] + ) return f""" process process_{id(process)} {{ @@ -168,7 +186,9 @@ def handle_custom_shells(self, process: ProcessNode) -> str: \"\"\" }}""" - def is_inline_editing_command_sandbox(self, command: str, input_files: list[FileNode]) -> bool: + def is_inline_editing_command_sandbox( + self, command: str, input_files: list[FileNode] + ) -> bool: """ Determine if a command modifies any of the input files in-place, even if the content remains the same. 
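A hypothetical instance: `sed -i s/foo/bar/g data.txt` rewrites data.txt in place, so it should count here even when no line actually contains "foo".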
""" @@ -211,17 +231,27 @@ def is_inline_editing_command_sandbox(self, command: str, input_files: list[File # Return False if none of the files were modified return False - def is_standard_case(self, process: ProcessNode, inputs: List[FileNode], outputs: List[FileNode]) -> bool: + def is_standard_case( + self, process: ProcessNode, inputs: List[FileNode], outputs: List[FileNode] + ) -> bool: return len(inputs) >= 1 and len(outputs) == 1 - def is_inline_case(self, process: ProcessNode, inputs: List[FileNode], outputs: List[FileNode]) -> bool: - return self.is_inline_editing_command_sandbox(' '.join(process.cmd), inputs) + def is_inline_case( + self, process: ProcessNode, inputs: List[FileNode], outputs: List[FileNode] + ) -> bool: + return self.is_inline_editing_command_sandbox(" ".join(process.cmd), inputs) - def is_multiple_output_case(self, process: ProcessNode, inputs: List[FileNode], outputs: List[FileNode]) -> bool: + def is_multiple_output_case( + self, process: ProcessNode, inputs: List[FileNode], outputs: List[FileNode] + ) -> bool: return len(inputs) >= 1 and len(outputs) >= 1 - - def is_dynamic_filename_case(self, process: ProcessNode, outputs: List[FileNode]) -> bool: - return any("*" in file.file or "v*" in file.file for file in outputs if file.file) + + def is_dynamic_filename_case( + self, process: ProcessNode, outputs: List[FileNode] + ) -> bool: + return any( + "*" in file.file or "v*" in file.file for file in outputs if file.file + ) def is_parallel_execution(self, process: ProcessNode) -> bool: return len(process.cmd) > 1 and "parallel" in process.cmd @@ -232,31 +262,43 @@ def create_processes(self) -> None: """ for node in self.graph.nodes: if isinstance(node, ProcessNode) and node not in self.visited: - inputs = [n for n in self.graph.predecessors(node) if isinstance(n, FileNode)] - outputs = [n for n in self.graph.successors(node) if isinstance(n, FileNode)] + inputs = [ + n for n in self.graph.predecessors(node) if isinstance(n, FileNode) + ] + outputs = [ + n for n in self.graph.successors(node) if isinstance(n, FileNode) + ] if self.is_standard_case(node, inputs, outputs): process_script = self.handle_standard_case(node, inputs, outputs) self.nextflow_script.append(process_script) - self.workflow.append(f"{self.escape_filename_for_nextflow(outputs[0].label)} = process_{id(node)}({', '.join([self.escape_filename_for_nextflow(i.label) for i in inputs])})") - elif self.is_multiple_output_case(node,inputs,outputs) : - raise NotImplementedError("Handling multiple outputs not implemented yet.") + self.workflow.append( + f"{self.escape_filename_for_nextflow(outputs[0].label)} = process_{id(node)}({', '.join([self.escape_filename_for_nextflow(i.label) for i in inputs])})" + ) + elif self.is_multiple_output_case(node, inputs, outputs): + raise NotImplementedError( + "Handling multiple outputs not implemented yet." 
+ ) elif self.is_dynamic_filename_case(node, outputs): - process_script = self.handle_dynamic_filenames(node, inputs, outputs) + process_script = self.handle_dynamic_filenames( + node, inputs, outputs + ) elif self.is_parallel_execution(node): process_script = self.handle_parallel_execution(node) - elif self.is_inline_case(node, inputs, outputs): + elif self.is_inline_case(node, inputs, outputs): process_script = self.handle_inline_case(node, inputs, outputs) self.nextflow_script.append(process_script) - self.workflow.append(f"process_{id(node)}({', '.join([self.escape_filename_for_nextflow(i.label) for i in inputs])})") + self.workflow.append( + f"process_{id(node)}({', '.join([self.escape_filename_for_nextflow(i.label) for i in inputs])})" + ) else: process_script = self.handle_custom_shells(node) self.nextflow_script.append(process_script) self.workflow.append(f"process_{id(node)}()") self.visited.add(node) - - def generate_workflow(self, graph: nx.DiGraph) -> str: + + def generate_workflow(self, graph: nx.DiGraph) -> str: """ Generate the complete Nextflow workflow script from the graph. """ @@ -274,10 +316,11 @@ def generate_workflow(self, graph: nx.DiGraph) -> str: escaped_name = self.escape_filename_for_nextflow(node.label) if node.inodeOnDevice not in filenames: if pathlib.Path(node.file).exists(): - self.nextflow_script.append(f" {escaped_name}=file(\"{node.file}\")") + self.nextflow_script.append( + f' {escaped_name}=file("{node.file}")' + ) filenames.add(node.inodeOnDevice) - for step in self.workflow: self.nextflow_script.append(f" {step}") self.nextflow_script.append("}") @@ -296,14 +339,19 @@ def escape_filename_for_makefile(self, filename: str) -> str: Escape special characters in a filename for Makefile. Replace spaces and other special characters with underscores. """ - return filename.replace(" ", "_").replace("(", "_").replace(")", "_").replace(",", "_") + return ( + filename.replace(" ", "_") + .replace("(", "_") + .replace(")", "_") + .replace(",", "_") + ) def is_hidden_file(self, filename: str) -> bool: """ Determine if a file is hidden. Hidden files start with '.' or '._'. """ - return filename.startswith('.') or filename.startswith('._') + return filename.startswith(".") or filename.startswith("._") def create_experiment_folder_command(self, process: ProcessNode) -> str: """ @@ -312,7 +360,9 @@ def create_experiment_folder_command(self, process: ProcessNode) -> str: folder_name = f"process_{id(process)}" return f"mkdir -p {folder_name}" - def copy_input_files_command(self, process: ProcessNode, inputs: List[FileNode]) -> Optional[str]: + def copy_input_files_command( + self, process: ProcessNode, inputs: List[FileNode] + ) -> Optional[str]: """ Generate the command to copy input files into the experiment folder. Returns None if there are no input files. 
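For example (hypothetical values), an input labeled "my data.txt" destined for folder process_42 yields `cp my_data.txt process_42/`, since labels pass through escape_filename_for_makefile.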
@@ -327,7 +377,7 @@ def copy_input_files_command(self, process: ProcessNode, inputs: List[FileNode]) continue # Skip hidden files escaped_file = self.escape_filename_for_makefile(file.label) commands.append(f"cp {escaped_file} {folder_name}/") - + if commands: return "\n\t".join(commands) return None @@ -346,26 +396,32 @@ def run_command_command(self, process: ProcessNode, outputs: List[FileNode]) -> # Execute command within the folder return f"(cd {folder_name} && {cmd})" - def handle_process_node(self, process: ProcessNode, inputs: List[FileNode], outputs: List[FileNode]) -> None: + def handle_process_node( + self, process: ProcessNode, inputs: List[FileNode], outputs: List[FileNode] + ) -> None: """ Generate all necessary Makefile commands for a given process node. Handles different cases based on presence of inputs and outputs. """ # Create experiment folder - self.makefile_commands.append(f"# Process {id(ProcessNode)}: {' '.join(process.cmd)}") + self.makefile_commands.append( + f"# Process {id(process)}: {' '.join(process.cmd)}" + ) self.makefile_commands.append(f"\tmkdir -p process_{id(process)}") - + # Copy input files copy_inputs = self.copy_input_files_command(process, inputs) if copy_inputs: - self.makefile_commands.append(f"# Copy input files for process {id(ProcessNode)}") + self.makefile_commands.append( + f"# Copy input files for process {id(process)}" + ) self.makefile_commands.append(f"\t{copy_inputs}") - + # Run the command self.makefile_commands.append(f"# Run command for process {id(process)}") run_cmd = self.run_command_command(process, outputs) self.makefile_commands.append(f"\t{run_cmd}") - + # No copying of output files since they are inside the folder def create_rules(self) -> None: @@ -374,13 +430,17 @@ def create_rules(self) -> None: """ # Ensure the output directory exists self.makefile_commands.append(f"\tmkdir -p {self.output_dir}\n") - + # Traverse the graph in topological order to respect dependencies for node in nx.topological_sort(self.graph): if isinstance(node, ProcessNode): - inputs = [n for n in self.graph.predecessors(node) if isinstance(n, FileNode)] - outputs = [n for n in self.graph.successors(node) if isinstance(n, FileNode)] - + inputs = [ + n for n in self.graph.predecessors(node) if isinstance(n, FileNode) + ] + outputs = [ + n for n in self.graph.successors(node) if isinstance(n, FileNode) + ] + self.handle_process_node(node, inputs, outputs) def generate_makefile(self, graph: nx.DiGraph) -> str: @@ -389,7 +449,7 @@ def generate_makefile(self, graph: nx.DiGraph) -> str: """ self.graph = graph self.create_rules() - + # Assemble the Makefile makefile = [] makefile.append("all:") @@ -397,5 +457,5 @@ def generate_makefile(self, graph: nx.DiGraph) -> str: # Ensure each command line is properly indented with a tab # Makefile syntax requires tabs, not spaces makefile.append(f"\t{command}") - + return "\n".join(makefile) diff --git a/probe_src/python/pyproject.toml b/probe_py/pyproject.toml similarity index 71% rename from probe_src/python/pyproject.toml rename to probe_py/pyproject.toml index 10af7eaf..ad019ea9 100644 --- a/probe_src/python/pyproject.toml +++ b/probe_py/pyproject.toml @@ -3,17 +3,20 @@ requires = ["flit_core >=3.2,<4"] build-backend = "flit_core.buildapi" [project] -name = "probe_py.manual" +name = "probe_py" +version = "0.1.0" +description = "Python library and CLI extensions for PROBE" authors = [ {name = "Samuel Grayson", email = "sam@samgrayson.me"}, {name = "Shofiya Bootwala"}, {name = "Saleha Muzammil"}, + {name 
= "Asif Zubayer Palak"}, + {name = "Kyrillos Ishak"}, + {name = "Jenna Fligor"}, ] license = {file = "LICENSE"} classifiers = ["License :: OSI Approved :: MIT License"] -dynamic = ["version", "description"] dependencies = [ - "probe_py.generated", "networkx", "pydot", "rich", diff --git a/probe_src/tests/test_graph.py b/probe_py/tests/test_graph.py similarity index 62% rename from probe_src/tests/test_graph.py rename to probe_py/tests/test_graph.py index 60632cb9..c4e76734 100644 --- a/probe_src/tests/test_graph.py +++ b/probe_py/tests/test_graph.py @@ -1,8 +1,18 @@ import pytest import typing -from probe_py.generated.parser import ProvLog, parse_probe_log -from probe_py.generated.ops import OpenOp, CloneOp, ExecOp, InitProcessOp, InitExecEpochOp, CloseOp, WaitOp, Op -from probe_py.manual.analysis import provlog_to_digraph, validate_hb_graph +from probe_py.parser import parse_probe_log +from probe_py.ptypes import ProvLog +from probe_py.ops import ( + OpenOp, + CloneOp, + ExecOp, + InitProcessOp, + InitExecEpochOp, + CloseOp, + WaitOp, + Op, +) +from probe_py.analysis import provlog_to_digraph, validate_hb_graph import pathlib import networkx as nx # type: ignore import subprocess @@ -17,7 +27,7 @@ def test_diff_cmd() -> None: paths = [str(project_root / "flake.nix"), str(project_root / "flake.lock")] - command = ['diff', *paths] + command = ["diff", *paths] process_tree_prov_log = execute_command(command, 1) process_graph = provlog_to_digraph(process_tree_prov_log) assert not validate_hb_graph(process_tree_prov_log, process_graph) @@ -27,49 +37,77 @@ def test_diff_cmd() -> None: def test_bash_in_bash() -> None: - command = ["bash", "-c", f"head {project_root}/flake.nix ; head {project_root}/flake.lock"] + command = [ + "bash", + "-c", + f"head {project_root}/flake.nix ; head {project_root}/flake.lock", + ] process_tree_prov_log = execute_command(command) process_graph = provlog_to_digraph(process_tree_prov_log) assert not validate_hb_graph(process_tree_prov_log, process_graph) - paths = [f'{project_root}/flake.nix'.encode(), f'{project_root}/flake.lock'.encode()] + paths = [ + f"{project_root}/flake.nix".encode(), + f"{project_root}/flake.lock".encode(), + ] process_file_map = {} start_node = [node for node, degree in process_graph.in_degree() if degree == 0][0] - dfs_edges = list(nx.dfs_edges(process_graph,source=start_node)) + dfs_edges = list(nx.dfs_edges(process_graph, source=start_node)) parent_process_id = dfs_edges[0][0][0] process_file_map[f"{project_root}/flake.lock".encode()] = parent_process_id process_file_map[f"{project_root}/flake.nix".encode()] = parent_process_id - check_for_clone_and_open(dfs_edges, process_tree_prov_log, 1, process_file_map, paths) + check_for_clone_and_open( + dfs_edges, process_tree_prov_log, 1, process_file_map, paths + ) + def test_bash_in_bash_pipe() -> None: command = ["bash", "-c", f"head {project_root}/flake.nix | tail"] process_tree_prov_log = execute_command(command) process_graph = provlog_to_digraph(process_tree_prov_log) assert not validate_hb_graph(process_tree_prov_log, process_graph) - paths = [f'{project_root}/flake.nix'.encode(), b'stdout'] + paths = [f"{project_root}/flake.nix".encode(), b"stdout"] start_node = [node for node, degree in process_graph.in_degree() if degree == 0][0] - dfs_edges = list(nx.dfs_edges(process_graph,source=start_node)) + dfs_edges = list(nx.dfs_edges(process_graph, source=start_node)) check_for_clone_and_open(dfs_edges, process_tree_prov_log, len(paths), {}, paths) @pytest.mark.xfail def test_pthreads() -> 
None: - process_tree_prov_log = execute_command([f"{project_root}/probe_src/tests/c/createFile.exe"]) + process_tree_prov_log = execute_command( + [f"{project_root}/probe_src/tests/c/createFile.exe"] + ) process_graph = provlog_to_digraph(process_tree_prov_log) assert not validate_hb_graph(process_tree_prov_log, process_graph) - root_node = [n for n in process_graph.nodes() if process_graph.out_degree(n) > 0 and process_graph.in_degree(n) == 0][0] - bfs_nodes = [node for layer in nx.bfs_layers(process_graph, root_node) for node in layer] - root_node = [n for n in process_graph.nodes() if process_graph.out_degree(n) > 0 and process_graph.in_degree(n) == 0][0] - dfs_edges = list(nx.dfs_edges(process_graph,source=root_node)) + root_node = [ + n + for n in process_graph.nodes() + if process_graph.out_degree(n) > 0 and process_graph.in_degree(n) == 0 + ][0] + bfs_nodes = [ + node for layer in nx.bfs_layers(process_graph, root_node) for node in layer + ] + root_node = [ + n + for n in process_graph.nodes() + if process_graph.out_degree(n) > 0 and process_graph.in_degree(n) == 0 + ][0] + dfs_edges = list(nx.dfs_edges(process_graph, source=root_node)) total_pthreads = 3 - paths = [b'/tmp/0.txt', b'/tmp/1.txt', b'/tmp/2.txt'] - check_pthread_graph(bfs_nodes, dfs_edges, process_tree_prov_log, total_pthreads, paths) - + paths = [b"/tmp/0.txt", b"/tmp/1.txt", b"/tmp/2.txt"] + check_pthread_graph( + bfs_nodes, dfs_edges, process_tree_prov_log, total_pthreads, paths + ) + + def execute_command(command: list[str], return_code: int = 0) -> ProvLog: input = pathlib.Path("probe_log") if input.exists(): input.unlink() result = subprocess.run( - ['probe', 'record'] + (["--debug"] if DEBUG_LIBPROBE else []) + (["--make"] if REMAKE_LIBPROBE else []) + command, + ["probe", "record"] + + (["--debug"] if DEBUG_LIBPROBE else []) + + (["--make"] if REMAKE_LIBPROBE else []) + + command, capture_output=True, text=True, check=False, @@ -86,11 +124,11 @@ def execute_command(command: list[str], return_code: int = 0) -> ProvLog: def check_for_clone_and_open( - dfs_edges: typing.Sequence[tuple[Node, Node]], - process_tree_prov_log: ProvLog, - number_of_child_process: int, - process_file_map: dict[bytes, int], - paths: list[bytes], + dfs_edges: typing.Sequence[tuple[Node, Node]], + process_tree_prov_log: ProvLog, + number_of_child_process: int, + process_file_map: dict[bytes, int], + paths: list[bytes], ) -> None: # to ensure files which are opened are closed file_descriptors = [] @@ -106,49 +144,55 @@ def check_for_clone_and_open( for edge in dfs_edges: curr_pid, curr_epoch_idx, curr_tid, curr_op_idx = edge[0] - - curr_node_op = get_op_from_provlog(process_tree_prov_log, curr_pid, curr_epoch_idx, curr_tid, curr_op_idx) + + curr_node_op = get_op_from_provlog( + process_tree_prov_log, curr_pid, curr_epoch_idx, curr_tid, curr_op_idx + ) if curr_node_op is not None: curr_node_op_data = curr_node_op.data - if(isinstance(curr_node_op_data,CloneOp)): - next_op = get_op_from_provlog(process_tree_prov_log, edge[1][0], edge[1][1], edge[1][2], edge[1][3]) + if isinstance(curr_node_op_data, CloneOp): + next_op = get_op_from_provlog( + process_tree_prov_log, edge[1][0], edge[1][1], edge[1][2], edge[1][3] + ) if next_op is not None: next_op_data = next_op.data - if isinstance(next_op_data,ExecOp): + if isinstance(next_op_data, ExecOp): assert edge[1][0] == curr_node_op_data.task_id check_child_processes.append(curr_node_op_data.task_id) continue - if isinstance(next_op_data,InitProcessOp): + if isinstance(next_op_data, InitProcessOp): 
assert edge[1][0] == curr_node_op_data.task_id check_child_processes.append(curr_node_op_data.task_id) continue - if isinstance(next_op_data,CloseOp) and edge[0][0]!=edge[1][0]: + if isinstance(next_op_data, CloseOp) and edge[0][0] != edge[1][0]: assert edge[1][0] == curr_node_op_data.task_id check_child_processes.append(curr_node_op_data.task_id) continue if edge[1][3] == -1: continue - current_child_process+=1 + current_child_process += 1 check_wait.append(curr_node_op_data.task_id) - if len(paths)!=0: - process_file_map[paths[current_child_process-1]] = curr_node_op_data.task_id - elif(isinstance(curr_node_op_data,WaitOp)): + if len(paths) != 0: + process_file_map[paths[current_child_process - 1]] = ( + curr_node_op_data.task_id + ) + elif isinstance(curr_node_op_data, WaitOp): ret_pid = curr_node_op_data.task_id wait_option = curr_node_op_data.options if wait_option == 0: assert ret_pid in check_wait check_wait.remove(ret_pid) - if(isinstance(curr_node_op_data,OpenOp)) and curr_node_op_data.ferrno == 0: + if (isinstance(curr_node_op_data, OpenOp)) and curr_node_op_data.ferrno == 0: file_descriptors.append(curr_node_op_data.fd) path = curr_node_op_data.path.path if path in paths: - if len(process_file_map.keys())!=0: + if len(process_file_map.keys()) != 0: # ensure the right cloned process has OpenOp for the path assert curr_pid == process_file_map[path] - if curr_pid!=parent_process_id: + if curr_pid != parent_process_id: assert curr_pid in check_child_processes check_child_processes.remove(curr_pid) - elif(isinstance(curr_node_op_data,CloseOp)): + elif isinstance(curr_node_op_data, CloseOp): fd = curr_node_op_data.low_fd if fd in reserved_file_descriptors: continue @@ -156,16 +200,18 @@ def check_for_clone_and_open( continue if fd in file_descriptors: file_descriptors.remove(fd) - elif(isinstance(curr_node_op_data,ExecOp)): + elif isinstance(curr_node_op_data, ExecOp): # check if stdout is read in right child process - if(edge[1][3]==-1): + if edge[1][3] == -1: continue - next_init_op = get_op_from_provlog(process_tree_prov_log,curr_pid,1,curr_pid,0) + next_init_op = get_op_from_provlog( + process_tree_prov_log, curr_pid, 1, curr_pid, 0 + ) if next_init_op is not None: next_init_op_data = next_init_op.data assert isinstance(next_init_op_data, InitExecEpochOp) - if next_init_op_data.program_name == b'tail': - assert process_file_map[b'stdout'] == curr_pid + if next_init_op_data.program_name == b"tail": + assert process_file_map[b"stdout"] == curr_pid check_child_processes.remove(curr_pid) # check number of cloneOps @@ -178,23 +224,25 @@ def check_for_clone_and_open( def match_open_and_close_fd( - dfs_edges: typing.Sequence[tuple[Node, Node]], - process_tree_prov_log: ProvLog, - paths: list[bytes], + dfs_edges: typing.Sequence[tuple[Node, Node]], + process_tree_prov_log: ProvLog, + paths: list[bytes], ) -> None: reserved_file_descriptors = [0, 1, 2] file_descriptors = set[int]() for edge in dfs_edges: curr_pid, curr_epoch_idx, curr_tid, curr_op_idx = edge[0] - curr_node_op = get_op_from_provlog(process_tree_prov_log, curr_pid, curr_epoch_idx, curr_tid, curr_op_idx) + curr_node_op = get_op_from_provlog( + process_tree_prov_log, curr_pid, curr_epoch_idx, curr_tid, curr_op_idx + ) if curr_node_op is not None: curr_node_op_data = curr_node_op.data - if(isinstance(curr_node_op_data,OpenOp)): + if isinstance(curr_node_op_data, OpenOp): file_descriptors.add(curr_node_op_data.fd) path = curr_node_op_data.path.path if path in paths: paths.remove(path) - 
elif(isinstance(curr_node_op_data,CloseOp)): + elif isinstance(curr_node_op_data, CloseOp): fd = curr_node_op_data.low_fd if fd in reserved_file_descriptors: continue @@ -206,12 +254,13 @@ def match_open_and_close_fd( assert len(file_descriptors) == 0 assert len(paths) == 0 + def check_pthread_graph( - bfs_nodes: typing.Sequence[Node], - dfs_edges: typing.Sequence[tuple[Node, Node]], - process_tree_prov_log: ProvLog, - total_pthreads: int, - paths: list[bytes], + bfs_nodes: typing.Sequence[Node], + dfs_edges: typing.Sequence[tuple[Node, Node]], + process_tree_prov_log: ProvLog, + total_pthreads: int, + paths: list[bytes], ) -> None: check_wait = [] process_file_map = {} @@ -219,19 +268,25 @@ def check_pthread_graph( file_descriptors = set[int]() reserved_file_descriptors = [1, 2, 3] edge = dfs_edges[0] - parent_pthread_id = get_op_from_provlog(process_tree_prov_log, edge[0][0], edge[0][1], edge[0][2], edge[0][3]).pthread_id + parent_pthread_id = get_op_from_provlog( + process_tree_prov_log, edge[0][0], edge[0][1], edge[0][2], edge[0][3] + ).pthread_id for edge in dfs_edges: curr_pid, curr_epoch_idx, curr_tid, curr_op_idx = edge[0] - curr_node_op = get_op_from_provlog(process_tree_prov_log, curr_pid, curr_epoch_idx, curr_tid, curr_op_idx) - if(isinstance(curr_node_op.data,CloneOp)): + curr_node_op = get_op_from_provlog( + process_tree_prov_log, curr_pid, curr_epoch_idx, curr_tid, curr_op_idx + ) + if isinstance(curr_node_op.data, CloneOp): if edge[1][2] != curr_tid: - continue + continue check_wait.append(curr_node_op.data.task_id) - if len(paths)!=0: - process_file_map[paths[current_child_process]] = curr_node_op.data.task_id - current_child_process+=1 - if isinstance(curr_node_op.data,WaitOp): + if len(paths) != 0: + process_file_map[paths[current_child_process]] = ( + curr_node_op.data.task_id + ) + current_child_process += 1 + if isinstance(curr_node_op.data, WaitOp): ret_pid = curr_node_op.data.task_id wait_option = curr_node_op.data.options if wait_option == 0: @@ -241,13 +296,18 @@ def check_pthread_graph( assert len(set(bfs_nodes)) == len(bfs_nodes) for node in bfs_nodes: curr_pid, curr_epoch_idx, curr_tid, curr_op_idx = node - curr_node_op = get_op_from_provlog(process_tree_prov_log, curr_pid, curr_epoch_idx, curr_tid, curr_op_idx) - if curr_node_op is not None and (isinstance(curr_node_op.data,OpenOp)): + curr_node_op = get_op_from_provlog( + process_tree_prov_log, curr_pid, curr_epoch_idx, curr_tid, curr_op_idx + ) + if curr_node_op is not None and (isinstance(curr_node_op.data, OpenOp)): file_descriptors.add(curr_node_op.data.fd) path = curr_node_op.data.path.path print("open", curr_tid, curr_node_op.pthread_id, curr_node_op.data.fd) if path in paths: - if len(process_file_map.keys())!=0 and parent_pthread_id!=curr_node_op.pthread_id: + if ( + len(process_file_map.keys()) != 0 + and parent_pthread_id != curr_node_op.pthread_id + ): # ensure the right cloned process has OpenOp for the path assert process_file_map[path] == curr_node_op.pthread_id elif curr_node_op is not None and (isinstance(curr_node_op.data, CloseOp)): @@ -268,13 +328,19 @@ def check_pthread_graph( assert len(process_file_map.items()) == len(paths) assert len(file_descriptors) == 0 + def get_op_from_provlog( - process_tree_prov_log: ProvLog, - pid: int, - exec_epoch_id: int, - tid: int, - op_idx: int, + process_tree_prov_log: ProvLog, + pid: int, + exec_epoch_id: int, + tid: int, + op_idx: int, ) -> Op: if op_idx == -1 or exec_epoch_id == -1: raise ValueError() - return 
process_tree_prov_log.processes[pid].exec_epochs[exec_epoch_id].threads[tid].ops[op_idx] + return ( + process_tree_prov_log.processes[pid] + .exec_epochs[exec_epoch_id] + .threads[tid] + .ops[op_idx] + ) diff --git a/probe_py/tests/test_ssh_arg_parse.py b/probe_py/tests/test_ssh_arg_parse.py new file mode 100644 index 00000000..c1ecdd11 --- /dev/null +++ b/probe_py/tests/test_ssh_arg_parse.py @@ -0,0 +1,36 @@ +from probe_py.ssh_argparser import parse_ssh_args + +# List of test cases +test_cases = [ + (["-v"], (["-v"], None, [])), + (["-p", "22"], (["-p", "22"], None, [])), + (["-v", "-A", "-q"], (["-v", "-A", "-q"], None, [])), + (["-p", "22", "user@host.com"], (["-p", "22"], "user@host.com", [])), + (["user@host.com", "uptime"], ([], "user@host.com", ["uptime"])), + ( + ["-p", "22", "user@host.com", "ls", "-la"], + (["-p", "22"], "user@host.com", ["ls", "-la"]), + ), + ( + ["-A", "user@host.com", "echo", '"Hello World"'], + (["-A"], "user@host.com", ["echo", '"Hello World"']), + ), + ( + ["-o", "StrictHostKeyChecking=no", "user@host.com"], + (["-o", "StrictHostKeyChecking=no"], "user@host.com", []), + ), + ( + ["-v", "-p", "22", "-A", "user@host.com", "uptime"], + (["-v", "-p", "22", "-A"], "user@host.com", ["uptime"]), + ), +] + + +def run_test_cases() -> None: + for i, (input_args, expected_output) in enumerate(test_cases): + result = parse_ssh_args(input_args) + assert result == expected_output + + +if __name__ == "__main__": + run_test_cases() diff --git a/probe_src/tests/test_workflow.py b/probe_py/tests/test_workflow.py similarity index 68% rename from probe_src/tests/test_workflow.py rename to probe_py/tests/test_workflow.py index 00c6cb35..9f35e631 100644 --- a/probe_src/tests/test_workflow.py +++ b/probe_py/tests/test_workflow.py @@ -1,9 +1,9 @@ import re import pytest import pathlib -import networkx as nx -from probe_py.manual.analysis import FileNode, ProcessNode, InodeOnDevice -from probe_py.manual.workflows import NextflowGenerator +import networkx as nx # type: ignore +from probe_py.analysis import FileNode, ProcessNode, InodeOnDevice +from probe_py.workflows import NextflowGenerator tmpdir = pathlib.Path(__file__).resolve().parent / "tmp" @@ -18,8 +18,8 @@ def test_dataflow_graph_to_nextflow_script() -> None: b_file_path.write_text("This is A.txt") dataflow_graph = nx.DiGraph() - A = FileNode(InodeOnDevice(0,0,0), 0, "A.txt") - B = FileNode(InodeOnDevice(0,0,1), 0, "B.txt") + A = FileNode(InodeOnDevice(0, 0, 0), (0, 0), "A.txt") + B = FileNode(InodeOnDevice(0, 0, 1), (0, 0), "B.txt") W = ProcessNode(0, ("cp", "A.txt", "B.txt")) dataflow_graph.add_nodes_from([A, B], color="red") dataflow_graph.add_nodes_from([W], color="blue") @@ -53,34 +53,35 @@ def test_dataflow_graph_to_nextflow_script() -> None: generator = NextflowGenerator() script = generator.generate_workflow(dataflow_graph) - script = re.sub(r'process_\d+', 'process_*', script) - expected_script = re.sub(r'process_\d+', 'process_*', expected_script) + script = re.sub(r"process_\d+", "process_*", script) + expected_script = re.sub(r"process_\d+", "process_*", expected_script) assert script == expected_script - A = FileNode(InodeOnDevice(0,0,0), 0, "A.txt") - B0 = FileNode(InodeOnDevice(0,0,1), 0, "B.txt") - B1 = FileNode(InodeOnDevice(0,0,1), 1, "B.txt") - C = FileNode(InodeOnDevice(0,0,3), 0, "C.txt") - W = ProcessNode(0,("cp", "A.txt", "B.txt")) - X = ProcessNode(1,("sed", "s/foo/bar/g", "-i", "B.txt")) + A = FileNode(InodeOnDevice(0, 0, 0), (0, 0), "A.txt") + B0 = FileNode(InodeOnDevice(0, 0, 1), (0, 0), "B.txt") 
+ B1 = FileNode(InodeOnDevice(0, 0, 1), (1, 0), "B.txt") + C = FileNode(InodeOnDevice(0, 0, 3), (0, 0), "C.txt") + W = ProcessNode(0, ("cp", "A.txt", "B.txt")) + X = ProcessNode(1, ("sed", "s/foo/bar/g", "-i", "B.txt")) # Note, the filename in FileNode will not always appear in the cmd of ProcessNode! - Y = ProcessNode(2,("analyze", "-i", "-k")) - + Y = ProcessNode(2, ("analyze", "-i", "-k")) example_dataflow_graph = nx.DiGraph() # FileNodes will be red and ProcessNodes will be blue in the visualization # Code can distinguish between the two using isinstance(node, ProcessNode) or likewise with FileNode example_dataflow_graph.add_nodes_from([A, B0, B1, C], color="red") example_dataflow_graph.add_nodes_from([W, X, Y], color="blue") - example_dataflow_graph.add_edges_from([ - (A, W), - (W, B0), - (B0, X), - (X, B1), - (A, Y), - (B1, Y), - (Y, C), - ]) + example_dataflow_graph.add_edges_from( + [ + (A, W), + (W, B0), + (B0, X), + (X, B1), + (A, Y), + (B1, Y), + (Y, C), + ] + ) expected_script = '''nextflow.enable.dsl=2 @@ -143,6 +144,6 @@ def test_dataflow_graph_to_nextflow_script() -> None: generator = NextflowGenerator() script = generator.generate_workflow(example_dataflow_graph) - script = re.sub(r'process_\d+', 'process_*', script) - expected_script = re.sub(r'process_\d+', 'process_*', expected_script) + script = re.sub(r"process_\d+", "process_*", script) + expected_script = re.sub(r"process_\d+", "process_*", expected_script) assert script == expected_script diff --git a/probe_src/.gitignore b/probe_src/.gitignore deleted file mode 100644 index 4e6311be..00000000 --- a/probe_src/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -probe_log -.prov -__pycache__/ diff --git a/probe_src/frontend/frontend.nix b/probe_src/frontend/frontend.nix deleted file mode 100644 index f82fce6e..00000000 --- a/probe_src/frontend/frontend.nix +++ /dev/null @@ -1,164 +0,0 @@ -{ - pkgs, - craneLib, - rust-target, - advisory-db, - system, - python, - lib, -}: rec { - src = lib.cleanSource ./.; - filter = name: type: - !(builtins.any (x: x) [ - (lib.hasSuffix ".nix" name) - (lib.hasPrefix "." (builtins.baseNameOf name)) - ]); - - # Common arguments can be set here to avoid repeating them later - commonArgs = { - inherit src; - strictDeps = true; - - # all the crates in this workspace either use rust-bindgen or depend - # on local crate that does. - nativeBuildInputs = [ - pkgs.rustPlatform.bindgenHook - ]; - - # pygen needs to know where to write the python file - preConfigurePhases = [ - "pygenConfigPhase" - ]; - pygenConfigPhase = '' - export PYGEN_OUTFILE="$(realpath ./python/probe_py/generated/ops.py)" - ''; - - CARGO_BUILD_TARGET = rust-target; - CARGO_BUILD_RUSTFLAGS = "-C target-feature=+crt-static"; - CPATH = ../libprobe/include; - }; - - individualCrateArgs = - commonArgs - // { - # inherit cargoArtifacts; - inherit (craneLib.crateNameFromCargoToml {inherit src;}) version; - # disable tests since we'll run them all via cargo-nextest - doCheck = false; - }; - - packages = rec { - # Build *just* the cargo dependencies (of the entire workspace), - # so we can reuse all of that work (e.g. via cachix) when running in CI - # It is *highly* recommended to use something like cargo-hakari to avoid - # cache misses when building individual top-level-crates - cargoArtifacts = craneLib.buildDepsOnly commonArgs; - - # Build the top-level crates of the workspace as individual derivations. - # This allows consumers to only depend on (and build) only what they need. 
- # Though it is possible to build the entire workspace as a single derivation, - # so this is left up to you on how to organize things - probe-frontend = craneLib.buildPackage (individualCrateArgs - // { - pname = "probe-frontend"; - cargoExtraArgs = "-p probe_frontend"; - installPhase = '' - cp -r ./python/ $out - cp ./LICENSE $out/LICENSE - ''; - }); - probe-py-generated = let - workspace = (builtins.fromTOML (builtins.readFile ./Cargo.toml)).workspace; - # TODO: Simplify this - # Perhaps by folding the substituteAllFiles into probe-py-generated (upstream) or probe-py-frontend (downstream) - # Could we combine all the packages? - in - python.pkgs.buildPythonPackage rec { - src = pkgs.substituteAllFiles rec { - src = probe-frontend; - files = [ - "./pyproject.toml" - "./LICENSE" - "./probe_py/generated/__init__.py" - "./probe_py/generated/ops.py" - "./probe_py/generated/parser.py" - "./probe_py/generated/py.typed" - ]; - authors = builtins.concatStringsSep "" (builtins.map (match: let - name = builtins.elemAt match 0; - email = builtins.elemAt match 1; - in "\n {name = \"${name}\", email = \"${email}\"},") ( - builtins.map - (author-str: builtins.match "(.+) <(.+)>" author-str) - (workspace.package.authors) - )); - version = workspace.package.version; - }; - pname = "probe_py.generated"; - version = workspace.package.version; - pyproject = true; - build-system = [ - python.pkgs.flit-core - ]; - nativeCheckInputs = [ - python.pkgs.mypy - pkgs.ruff - ]; - # ruff, mypy - checkPhase = '' - runHook preCheck - python -c 'import probe_py.generated' - mypy --strict --package probe_py.generated - runHook postCheck - ''; - }; - - probe-cli = craneLib.buildPackage (individualCrateArgs - // { - pname = "probe-cli"; - cargoExtraArgs = "-p probe_cli"; - }); - probe-macros = craneLib.buildPackage (individualCrateArgs - // { - pname = "probe-macros"; - cargoExtraArgs = "-p probe_macros"; - }); - }; - checks = { - probe-workspace-clippy = craneLib.cargoClippy (commonArgs - // { - inherit (packages) cargoArtifacts; - cargoClippyExtraArgs = "--all-targets -- --deny warnings"; - }); - - probe-workspace-doc = craneLib.cargoDoc (commonArgs - // { - inherit (packages) cargoArtifacts; - }); - - # Check formatting - probe-workspace-fmt = craneLib.cargoFmt { - inherit src; - }; - - # Audit dependencies - probe-workspace-audit = craneLib.cargoAudit { - inherit src advisory-db; - }; - - # Audit licenses - probe-workspace-deny = craneLib.cargoDeny { - inherit src; - }; - - # Run tests with cargo-nextest - # this is why `doCheck = false` on the crate derivations, so as to not - # run the tests twice. - probe-workspace-nextest = craneLib.cargoNextest (commonArgs - // { - inherit (packages) cargoArtifacts; - partitions = 1; - partitionType = "count"; - }); - }; -} diff --git a/probe_src/frontend/python/probe_py/generated/__init__.py b/probe_src/frontend/python/probe_py/generated/__init__.py deleted file mode 100644 index 9f8e34d3..00000000 --- a/probe_src/frontend/python/probe_py/generated/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -""" -Generated code for reading with PROBE logs. 
- -See https://github.com/charmoniumQ/PROBE -""" - -__version__ = "@version@" diff --git a/probe_src/frontend/python/probe_py/generated/ops.py b/probe_src/frontend/python/probe_py/generated/ops.py deleted file mode 100644 index e4f97fdd..00000000 --- a/probe_src/frontend/python/probe_py/generated/ops.py +++ /dev/null @@ -1,233 +0,0 @@ -# This file was @generated by probe_macros -from __future__ import annotations -import typing -from dataclasses import dataclass - -# https://github.com/torvalds/linux/blob/73e931504f8e0d42978bfcda37b323dbbd1afc08/include/uapi/linux/fcntl.h#L98 -AT_FDCWD: int = -100 - -@dataclass(init=True, frozen=True) -class Timespec: - sec: int - nsec: int - - -@dataclass(init=True, frozen=True) -class StatxTimestamp: - sec: int - nsec: int - - -@dataclass(init=True, frozen=True) -class Timeval: - sec: int - usec: int - - -@dataclass(init=True, frozen=True) -class Rusage: - utime: Timeval - stime: Timeval - maxrss: int - ixrss: int - idrss: int - isrss: int - minflt: int - majflt: int - nswap: int - inblock: int - oublock: int - msgsnd: int - msgrcv: int - nsignals: int - nvcsw: int - nivcsw: int - - -@dataclass(init=True, frozen=True) -class Path: - dirfd_minus_at_fdcwd: int - path: bytes - device_major: int - device_minor: int - inode: int - mtime: StatxTimestamp - ctime: StatxTimestamp - size: int - stat_valid: bool - dirfd_valid: bool - - @property - def dirfd(self) -> int: - return self.dirfd_minus_at_fdcwd + AT_FDCWD - - -@dataclass(init=True, frozen=True) -class InitProcessOp: - pid: int - is_root: bool - cwd: Path - - -@dataclass(init=True, frozen=True) -class InitExecEpochOp: - epoch: int - program_name: bytes - - -@dataclass(init=True, frozen=True) -class InitThreadOp: - tid: int - - -@dataclass(init=True, frozen=True) -class OpenOp: - path: Path - flags: int - mode: int - fd: int - ferrno: int - - -@dataclass(init=True, frozen=True) -class CloseOp: - low_fd: int - high_fd: int - ferrno: int - - -@dataclass(init=True, frozen=True) -class ChdirOp: - path: Path - ferrno: int - - -@dataclass(init=True, frozen=True) -class ExecOp: - path: Path - ferrno: int - argc: int - argv: list[bytes, ] - envc: int - env: list[bytes, ] - - -@dataclass(init=True, frozen=True) -class CloneOp: - flags: int - run_pthread_atfork_handlers: bool - task_type: int - task_id: int - ferrno: int - - -@dataclass(init=True, frozen=True) -class ExitOp: - status: int - run_atexit_handlers: bool - - -@dataclass(init=True, frozen=True) -class AccessOp: - path: Path - mode: int - flags: int - ferrno: int - - -@dataclass(init=True, frozen=True) -class StatResult: - mask: int - nlink: int - uid: int - gid: int - mode: int - ino: int - size: int - blocks: int - blksize: int - atime: StatxTimestamp - btime: StatxTimestamp - ctime: StatxTimestamp - mtime: StatxTimestamp - dev_major: int - dev_minor: int - - -@dataclass(init=True, frozen=True) -class StatOp: - path: Path - flags: int - ferrno: int - stat_result: StatResult - - -@dataclass(init=True, frozen=True) -class ReaddirOp: - dir: Path - child: bytes - all_children: bool - ferrno: int - - -@dataclass(init=True, frozen=True) -class WaitOp: - task_type: int - task_id: int - options: int - status: int - ferrno: int - - -@dataclass(init=True, frozen=True) -class GetRUsageOp: - waitpid_arg: int - getrusage_arg: int - usage: Rusage - ferrno: int - - -@dataclass(init=True, frozen=True) -class ReadLinkOp: - path: Path - resolved: bytes - ferrno: int - - -@dataclass(init=True, frozen=True) -class UpdateMetadataOp: - path: Path - flags: int - metadata: 
Metadata - ferrno: int - - -@dataclass(init=True, frozen=True) -class Op: - data: OpInternal - time: Timespec - pthread_id: int - iso_c_thread_id: int - - -@dataclass(init=True, frozen=True) -class Mode: - mode: int - - -@dataclass(init=True, frozen=True) -class Ownership: - uid: int - gid: int - - -@dataclass(init=True, frozen=True) -class Times: - is_null: bool - atime: Timeval - mtime: Timeval - - -Metadata: typing.TypeAlias = Mode | Ownership | Times -OpInternal: typing.TypeAlias = InitProcessOp | InitExecEpochOp | InitThreadOp | OpenOp | CloseOp | ChdirOp | ExecOp | CloneOp | ExitOp | AccessOp | StatOp | ReaddirOp | WaitOp | GetRUsageOp | UpdateMetadataOp | ReadLinkOp - diff --git a/probe_src/libprobe/.gitignore b/probe_src/libprobe/.gitignore deleted file mode 100644 index 567609b1..00000000 --- a/probe_src/libprobe/.gitignore +++ /dev/null @@ -1 +0,0 @@ -build/ diff --git a/probe_src/libprobe/generated/libc_hooks.c b/probe_src/libprobe/generated/libc_hooks.c deleted file mode 100644 index e1610207..00000000 --- a/probe_src/libprobe/generated/libc_hooks.c +++ /dev/null @@ -1,2381 +0,0 @@ -void init_function_pointers() -{ - unwrapped_fopen = dlsym(RTLD_NEXT, "fopen"); - unwrapped_freopen = dlsym(RTLD_NEXT, "freopen"); - unwrapped_fclose = dlsym(RTLD_NEXT, "fclose"); - unwrapped_fcloseall = dlsym(RTLD_NEXT, "fcloseall"); - unwrapped_openat = dlsym(RTLD_NEXT, "openat"); - unwrapped_open = dlsym(RTLD_NEXT, "open"); - unwrapped_creat = dlsym(RTLD_NEXT, "creat"); - unwrapped_close = dlsym(RTLD_NEXT, "close"); - unwrapped_close_range = dlsym(RTLD_NEXT, "close_range"); - unwrapped_closefrom = dlsym(RTLD_NEXT, "closefrom"); - unwrapped_dup = dlsym(RTLD_NEXT, "dup"); - unwrapped_dup2 = dlsym(RTLD_NEXT, "dup2"); - unwrapped_dup3 = dlsym(RTLD_NEXT, "dup3"); - unwrapped_fcntl = dlsym(RTLD_NEXT, "fcntl"); - unwrapped_chdir = dlsym(RTLD_NEXT, "chdir"); - unwrapped_fchdir = dlsym(RTLD_NEXT, "fchdir"); - unwrapped_opendir = dlsym(RTLD_NEXT, "opendir"); - unwrapped_fdopendir = dlsym(RTLD_NEXT, "fdopendir"); - unwrapped_readdir = dlsym(RTLD_NEXT, "readdir"); - unwrapped_readdir_r = dlsym(RTLD_NEXT, "readdir_r"); - unwrapped_readdir64 = dlsym(RTLD_NEXT, "readdir64"); - unwrapped_readdir64_r = dlsym(RTLD_NEXT, "readdir64_r"); - unwrapped_closedir = dlsym(RTLD_NEXT, "closedir"); - unwrapped_rewinddir = dlsym(RTLD_NEXT, "rewinddir"); - unwrapped_telldir = dlsym(RTLD_NEXT, "telldir"); - unwrapped_seekdir = dlsym(RTLD_NEXT, "seekdir"); - unwrapped_scandir = dlsym(RTLD_NEXT, "scandir"); - unwrapped_scandir64 = dlsym(RTLD_NEXT, "scandir64"); - unwrapped_scandirat = dlsym(RTLD_NEXT, "scandirat"); - unwrapped_getdents64 = dlsym(RTLD_NEXT, "getdents64"); - unwrapped_ftw = dlsym(RTLD_NEXT, "ftw"); - unwrapped_ftw64 = dlsym(RTLD_NEXT, "ftw64"); - unwrapped_nftw = dlsym(RTLD_NEXT, "nftw"); - unwrapped_nftw64 = dlsym(RTLD_NEXT, "nftw64"); - unwrapped_link = dlsym(RTLD_NEXT, "link"); - unwrapped_linkat = dlsym(RTLD_NEXT, "linkat"); - unwrapped_symlink = dlsym(RTLD_NEXT, "symlink"); - unwrapped_symlinkat = dlsym(RTLD_NEXT, "symlinkat"); - unwrapped_readlink = dlsym(RTLD_NEXT, "readlink"); - unwrapped_readlinkat = dlsym(RTLD_NEXT, "readlinkat"); - unwrapped_canonicalize_file_name = dlsym(RTLD_NEXT, "canonicalize_file_name"); - unwrapped_realpath = dlsym(RTLD_NEXT, "realpath"); - unwrapped_unlink = dlsym(RTLD_NEXT, "unlink"); - unwrapped_rmdir = dlsym(RTLD_NEXT, "rmdir"); - unwrapped_remove = dlsym(RTLD_NEXT, "remove"); - unwrapped_rename = dlsym(RTLD_NEXT, "rename"); - unwrapped_mkdir = dlsym(RTLD_NEXT, "mkdir"); - 
unwrapped_mkdirat = dlsym(RTLD_NEXT, "mkdirat"); - unwrapped_stat = dlsym(RTLD_NEXT, "stat"); - unwrapped_stat64 = dlsym(RTLD_NEXT, "stat64"); - unwrapped_fstat = dlsym(RTLD_NEXT, "fstat"); - unwrapped_fstat64 = dlsym(RTLD_NEXT, "fstat64"); - unwrapped_lstat = dlsym(RTLD_NEXT, "lstat"); - unwrapped_lstat64 = dlsym(RTLD_NEXT, "lstat64"); - unwrapped_statx = dlsym(RTLD_NEXT, "statx"); - unwrapped_fstatat = dlsym(RTLD_NEXT, "fstatat"); - unwrapped_fstatat64 = dlsym(RTLD_NEXT, "fstatat64"); - unwrapped_chown = dlsym(RTLD_NEXT, "chown"); - unwrapped_fchown = dlsym(RTLD_NEXT, "fchown"); - unwrapped_lchown = dlsym(RTLD_NEXT, "lchown"); - unwrapped_fchownat = dlsym(RTLD_NEXT, "fchownat"); - unwrapped_chmod = dlsym(RTLD_NEXT, "chmod"); - unwrapped_fchmod = dlsym(RTLD_NEXT, "fchmod"); - unwrapped_fchmodat = dlsym(RTLD_NEXT, "fchmodat"); - unwrapped_access = dlsym(RTLD_NEXT, "access"); - unwrapped_faccessat = dlsym(RTLD_NEXT, "faccessat"); - unwrapped_utime = dlsym(RTLD_NEXT, "utime"); - unwrapped_utimes = dlsym(RTLD_NEXT, "utimes"); - unwrapped_lutimes = dlsym(RTLD_NEXT, "lutimes"); - unwrapped_futimes = dlsym(RTLD_NEXT, "futimes"); - unwrapped_truncate = dlsym(RTLD_NEXT, "truncate"); - unwrapped_truncate64 = dlsym(RTLD_NEXT, "truncate64"); - unwrapped_ftruncate = dlsym(RTLD_NEXT, "ftruncate"); - unwrapped_ftruncate64 = dlsym(RTLD_NEXT, "ftruncate64"); - unwrapped_mknod = dlsym(RTLD_NEXT, "mknod"); - unwrapped_tmpfile = dlsym(RTLD_NEXT, "tmpfile"); - unwrapped_tmpfile64 = dlsym(RTLD_NEXT, "tmpfile64"); - unwrapped_tmpnam = dlsym(RTLD_NEXT, "tmpnam"); - unwrapped_tmpnam_r = dlsym(RTLD_NEXT, "tmpnam_r"); - unwrapped_tempnam = dlsym(RTLD_NEXT, "tempnam"); - unwrapped_mktemp = dlsym(RTLD_NEXT, "mktemp"); - unwrapped_mkstemp = dlsym(RTLD_NEXT, "mkstemp"); - unwrapped_mkdtemp = dlsym(RTLD_NEXT, "mkdtemp"); - unwrapped_execv = dlsym(RTLD_NEXT, "execv"); - unwrapped_execl = dlsym(RTLD_NEXT, "execl"); - unwrapped_execve = dlsym(RTLD_NEXT, "execve"); - unwrapped_fexecve = dlsym(RTLD_NEXT, "fexecve"); - unwrapped_execle = dlsym(RTLD_NEXT, "execle"); - unwrapped_execvp = dlsym(RTLD_NEXT, "execvp"); - unwrapped_execlp = dlsym(RTLD_NEXT, "execlp"); - unwrapped_execvpe = dlsym(RTLD_NEXT, "execvpe"); - unwrapped_fork = dlsym(RTLD_NEXT, "fork"); - unwrapped__Fork = dlsym(RTLD_NEXT, "_Fork"); - unwrapped_vfork = dlsym(RTLD_NEXT, "vfork"); - unwrapped_clone = dlsym(RTLD_NEXT, "clone"); - unwrapped_waitpid = dlsym(RTLD_NEXT, "waitpid"); - unwrapped_wait = dlsym(RTLD_NEXT, "wait"); - unwrapped_wait4 = dlsym(RTLD_NEXT, "wait4"); - unwrapped_wait3 = dlsym(RTLD_NEXT, "wait3"); - unwrapped_waitid = dlsym(RTLD_NEXT, "waitid"); - unwrapped_thrd_create = dlsym(RTLD_NEXT, "thrd_create"); - unwrapped_thrd_join = dlsym(RTLD_NEXT, "thrd_join"); - unwrapped_pthread_create = dlsym(RTLD_NEXT, "pthread_create"); - unwrapped_pthread_join = dlsym(RTLD_NEXT, "pthread_join"); - unwrapped_fopen64 = dlsym(RTLD_NEXT, "fopen64"); - unwrapped_freopen64 = dlsym(RTLD_NEXT, "freopen64"); - unwrapped_openat64 = dlsym(RTLD_NEXT, "openat64"); - unwrapped_open64 = dlsym(RTLD_NEXT, "open64"); - unwrapped_create64 = dlsym(RTLD_NEXT, "create64"); -} - -FILE * fopen(const char *filename, const char *opentype) -{ - maybe_init_thread(); - struct Op op = {open_op_code, {.open = {.path = create_path_lazy(AT_FDCWD, filename, 0), .flags = fopen_to_flags(opentype), .mode = 0, .fd = -1, .ferrno = 0}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - } - FILE * ret = unwrapped_fopen(filename, opentype); - int saved_errno = errno; - if 
(likely(prov_log_is_enabled())) - { - if (ret == NULL) - { - op.data.open.ferrno = saved_errno; - } - else - { - op.data.open.fd = fileno(ret); - } - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -FILE * freopen(const char *filename, const char *opentype, FILE *stream) -{ - maybe_init_thread(); - int original_fd = fileno(stream); - struct Op open_op = {open_op_code, {.open = {.path = create_path_lazy(AT_FDCWD, filename, 0), .flags = fopen_to_flags(opentype), .mode = 0, .fd = -1, .ferrno = 0}}, {0}, 0, 0}; - struct Op close_op = {close_op_code, {.close = {original_fd, original_fd, 0}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(open_op); - prov_log_try(close_op); - } - FILE * ret = unwrapped_freopen(filename, opentype, stream); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - if (ret == NULL) - { - open_op.data.open.ferrno = saved_errno; - close_op.data.close.ferrno = saved_errno; - } - else - { - open_op.data.open.fd = fileno(ret); - } - prov_log_record(open_op); - prov_log_record(close_op); - } - errno = saved_errno; - return ret; -} - -int fclose(FILE *stream) -{ - maybe_init_thread(); - int fd = fileno(stream); - struct Op op = {close_op_code, {.close = {fd, fd, 0}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - } - int ret = unwrapped_fclose(stream); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - op.data.close.ferrno = (ret == 0) ? (0) : (errno); - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -int fcloseall() -{ - maybe_init_thread(); - struct Op op = {close_op_code, {.close = {0, INT_MAX, 0}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - } - int ret = unwrapped_fcloseall(); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - op.data.close.ferrno = (ret == 0) ? (0) : (errno); - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -int openat(int dirfd, const char *filename, int flags, ...) -{ - maybe_init_thread(); - bool has_mode_arg = ((flags & O_CREAT) != 0) || ((flags & __O_TMPFILE) == __O_TMPFILE); - struct Op op = {open_op_code, {.open = {.path = create_path_lazy(dirfd, filename, (flags & O_NOFOLLOW) ? (AT_SYMLINK_NOFOLLOW) : (0)), .flags = flags, .mode = 0, .fd = -1, .ferrno = 0}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - if (has_mode_arg) - { - va_list ap; - va_start(ap, flags); - op.data.open.mode = va_arg(ap, __type_mode_t); - va_end(ap); - } - prov_log_try(op); - } - size_t varargs_size = (((sizeof(dirfd)) + (sizeof(filename))) + (sizeof(flags))) + ((has_mode_arg) ? (sizeof(mode_t)) : (0)); - int ret = *((int *) __builtin_apply((void (*)()) unwrapped_openat, __builtin_apply_args(), varargs_size)); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - op.data.open.ferrno = (unlikely(ret == (-1))) ? (errno) : (0); - op.data.open.fd = ret; - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -int open(const char *filename, int flags, ...) -{ - maybe_init_thread(); - bool has_mode_arg = ((flags & O_CREAT) != 0) || ((flags & __O_TMPFILE) == __O_TMPFILE); - struct Op op = {open_op_code, {.open = {.path = create_path_lazy(AT_FDCWD, filename, (flags & O_NOFOLLOW) ? 
(AT_SYMLINK_NOFOLLOW) : (0)), .flags = flags, .mode = 0, .fd = -1, .ferrno = 0}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - if (has_mode_arg) - { - va_list ap; - va_start(ap, flags); - op.data.open.mode = va_arg(ap, __type_mode_t); - va_end(ap); - } - prov_log_try(op); - } - size_t varargs_size = ((sizeof(filename)) + (sizeof(flags))) + ((has_mode_arg) ? (sizeof(mode_t)) : (0)); - int ret = *((int *) __builtin_apply((void (*)()) unwrapped_open, __builtin_apply_args(), varargs_size)); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - op.data.open.ferrno = (unlikely(ret == (-1))) ? (errno) : (0); - op.data.open.fd = ret; - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -int creat(const char *filename, mode_t mode) -{ - maybe_init_thread(); - struct Op op = {open_op_code, {.open = {.path = create_path_lazy(AT_FDCWD, filename, 0), .flags = (O_WRONLY | O_CREAT) | O_TRUNC, .mode = mode, .fd = -1, .ferrno = 0}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - } - int ret = unwrapped_creat(filename, mode); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - op.data.open.ferrno = (unlikely(ret == (-1))) ? (errno) : (0); - op.data.open.fd = ret; - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -int close(int filedes) -{ - maybe_init_thread(); - struct Op op = {close_op_code, {.close = {filedes, filedes, 0}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - } - int ret = unwrapped_close(filedes); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - op.data.close.ferrno = (ret == 0) ? (0) : (errno); - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -int close_range(unsigned int lowfd, unsigned int maxfd, int flags) -{ - maybe_init_thread(); - if (flags != 0) - { - NOT_IMPLEMENTED("I don't know how to handle close_range flags yet"); - } - struct Op op = {close_op_code, {.close = {lowfd, maxfd, 0}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - } - int ret = unwrapped_close_range(lowfd, maxfd, flags); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - op.data.close.ferrno = (ret == 0) ? (0) : (errno); - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -void closefrom(int lowfd) -{ - maybe_init_thread(); - struct Op op = {close_op_code, {.close = {lowfd, INT_MAX, 0}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - } - unwrapped_closefrom(lowfd); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - prov_log_record(op); - } - errno = saved_errno; -} - -int dup(int old) -{ - maybe_init_thread(); - int ret = unwrapped_dup(old); - return ret; -} - -int dup2(int old, int new) -{ - maybe_init_thread(); - int ret = unwrapped_dup2(old, new); - return ret; -} - -int dup3(int old, int new, int flags) -{ - maybe_init_thread(); - int ret = unwrapped_dup3(old, new, flags); - return ret; -} - -int fcntl(int filedes, int command, ...)
-{ - maybe_init_thread(); - bool int_arg = (((((((((command == F_DUPFD) || (command == F_DUPFD_CLOEXEC)) || (command == F_SETFD)) || (command == F_SETFL)) || (command == F_SETOWN)) || (command == F_SETSIG)) || (command == F_SETLEASE)) || (command == F_NOTIFY)) || (command == F_SETPIPE_SZ)) || (command == F_ADD_SEALS); - bool ptr_arg = ((((((((command == F_SETLK) || (command == F_SETLKW)) || (command == F_GETLK)) || (command == F_GETOWN_EX)) || (command == F_SETOWN_EX)) || (command == F_GET_RW_HINT)) || (command == F_SET_RW_HINT)) || (command == F_GET_FILE_RW_HINT)) || (command == F_SET_FILE_RW_HINT); - assert((!int_arg) || (!ptr_arg)); - size_t varargs_size = ((sizeof(filedes)) + (sizeof(command))) + ((int_arg) ? (sizeof(int)) : ((ptr_arg) ? (sizeof(void *)) : (0))); - int ret = *((int *) __builtin_apply((void (*)()) unwrapped_fcntl, __builtin_apply_args(), varargs_size)); - return ret; -} - -int chdir(const char *filename) -{ - maybe_init_thread(); - struct Op op = {chdir_op_code, {.chdir = {.path = create_path_lazy(AT_FDCWD, filename, 0), .ferrno = 0}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - } - int ret = unwrapped_chdir(filename); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - op.data.chdir.ferrno = (ret == 0) ? (0) : (errno); - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -int fchdir(int filedes) -{ - maybe_init_thread(); - struct Op op = {chdir_op_code, {.chdir = {.path = create_path_lazy(filedes, "", AT_EMPTY_PATH), .ferrno = 0}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - } - int ret = unwrapped_fchdir(filedes); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - op.data.chdir.ferrno = (ret == 0) ? (0) : (errno); - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -DIR * opendir(const char *dirname) -{ - maybe_init_thread(); - struct Op op = {open_op_code, {.open = {.path = create_path_lazy(AT_FDCWD, dirname, 0), .flags = (O_RDONLY | O_DIRECTORY) | O_CLOEXEC, .mode = 0, .fd = -1, .ferrno = 0}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - } - DIR * ret = unwrapped_opendir(dirname); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - op.data.open.ferrno = (ret == NULL) ? (errno) : (0); - op.data.open.fd = try_dirfd(ret); - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -DIR * fdopendir(int fd) -{ - maybe_init_thread(); - struct Op op = {open_op_code, {.open = {.path = create_path_lazy(fd, "", AT_EMPTY_PATH), .flags = (O_RDONLY | O_DIRECTORY) | O_CLOEXEC, .mode = 0, .fd = -1, .ferrno = 0}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - } - DIR * ret = unwrapped_fdopendir(fd); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - op.data.open.ferrno = (ret == NULL) ? 
(errno) : (0); - op.data.open.fd = try_dirfd(ret); - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -struct dirent * readdir(DIR *dirstream) -{ - maybe_init_thread(); - int fd = try_dirfd(dirstream); - struct Op op = {readdir_op_code, {.readdir = {.dir = create_path_lazy(fd, "", AT_EMPTY_PATH), .child = NULL, .all_children = false, .ferrno = 0}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - } - struct dirent * ret = unwrapped_readdir(dirstream); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - if (ret == NULL) - { - op.data.readdir.ferrno = saved_errno; - } - else - { - op.data.readdir.child = arena_strndup(get_data_arena(), ret->d_name, sizeof(ret->d_name)); - } - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -int readdir_r(DIR *dirstream, struct dirent *entry, struct dirent **result) -{ - maybe_init_thread(); - int fd = try_dirfd(dirstream); - struct Op op = {readdir_op_code, {.readdir = {.dir = create_path_lazy(fd, "", AT_EMPTY_PATH), .child = NULL, .all_children = false, .ferrno = 0}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - } - int ret = unwrapped_readdir_r(dirstream, entry, result); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - if ((*result) == NULL) - { - op.data.readdir.ferrno = saved_errno; - } - else - { - op.data.readdir.child = arena_strndup(get_data_arena(), entry->d_name, sizeof(entry->d_name)); - } - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -struct dirent64 * readdir64(DIR *dirstream) -{ - maybe_init_thread(); - int fd = try_dirfd(dirstream); - struct Op op = {readdir_op_code, {.readdir = {.dir = create_path_lazy(fd, "", AT_EMPTY_PATH), .child = NULL, .all_children = false, .ferrno = 0}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - } - struct dirent64 * ret = unwrapped_readdir64(dirstream); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - if (ret == NULL) - { - op.data.readdir.ferrno = saved_errno; - } - else - { - op.data.readdir.child = arena_strndup(get_data_arena(), ret->d_name, sizeof(ret->d_name)); - } - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -int readdir64_r(DIR *dirstream, struct dirent64 *entry, struct dirent64 **result) -{ - maybe_init_thread(); - int fd = try_dirfd(dirstream); - struct Op op = {readdir_op_code, {.readdir = {.dir = create_path_lazy(fd, "", AT_EMPTY_PATH), .child = NULL, .all_children = false, .ferrno = 0}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - } - int ret = unwrapped_readdir64_r(dirstream, entry, result); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - if ((*result) == NULL) - { - op.data.readdir.ferrno = saved_errno; - } - else - { - op.data.readdir.child = arena_strndup(get_data_arena(), entry->d_name, sizeof(entry->d_name)); - } - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -int closedir(DIR *dirstream) -{ - maybe_init_thread(); - int fd = try_dirfd(dirstream); - struct Op op = {close_op_code, {.close = {fd, fd, 0}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - } - int ret = unwrapped_closedir(dirstream); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - op.data.close.ferrno = (ret == 0) ? 
(0) : (errno); - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -void rewinddir(DIR *dirstream) -{ - maybe_init_thread(); - unwrapped_rewinddir(dirstream); -} - -long int telldir(DIR *dirstream) -{ - maybe_init_thread(); - long int ret = unwrapped_telldir(dirstream); - return ret; -} - -void seekdir(DIR *dirstream, long int pos) -{ - maybe_init_thread(); - unwrapped_seekdir(dirstream, pos); -} - -int scandir(const char *dir, struct dirent ***namelist, int (*selector)(const struct dirent *), int (*cmp)(const struct dirent **, const struct dirent **)) -{ - maybe_init_thread(); - struct Op op = {readdir_op_code, {.readdir = {.dir = create_path_lazy(AT_FDCWD, dir, 0), .child = NULL, .all_children = true}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - } - int ret = unwrapped_scandir(dir, namelist, selector, cmp); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - if (ret != 0) - { - op.data.readdir.ferrno = saved_errno; - } - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -int scandir64(const char *dir, struct dirent64 ***namelist, int (*selector)(const struct dirent64 *), int (*cmp)(const struct dirent64 **, const struct dirent64 **)) -{ - maybe_init_thread(); - struct Op op = {readdir_op_code, {.readdir = {.dir = create_path_lazy(AT_FDCWD, dir, 0), .child = NULL, .all_children = true}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - } - int ret = unwrapped_scandir64(dir, namelist, selector, cmp); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - if (ret != 0) - { - op.data.readdir.ferrno = saved_errno; - } - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -int scandirat(int dirfd, const char * restrict dirp, struct dirent *** restrict namelist, int (*filter)(const struct dirent *), int (*compar)(const struct dirent **, const struct dirent **)) -{ - maybe_init_thread(); - struct Op op = {readdir_op_code, {.readdir = {.dir = create_path_lazy(dirfd, dirp, 0), .child = NULL, .all_children = true}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - } - int ret = unwrapped_scandirat(dirfd, dirp, namelist, filter, compar); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - if (ret != 0) - { - op.data.readdir.ferrno = saved_errno; - } - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -ssize_t getdents64(int fd, void *buffer, size_t length) -{ - maybe_init_thread(); - struct Op op = {readdir_op_code, {.readdir = {.dir = create_path_lazy(fd, "", AT_EMPTY_PATH), .child = NULL, .all_children = true}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - } - ssize_t ret = unwrapped_getdents64(fd, buffer, length); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - if (unlikely(ret == (-1))) - { - op.data.readdir.ferrno = saved_errno; - } - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -int ftw(const char *filename, __ftw_func_t func, int descriptors) -{ - maybe_init_thread(); - struct Op op = {readdir_op_code, {.readdir = {.dir = create_path_lazy(AT_FDCWD, filename, 0), .child = NULL, .all_children = true}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - } - int ret = unwrapped_ftw(filename, func, descriptors); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - if (ret != 0) - { - op.data.readdir.ferrno = saved_errno; - } - prov_log_record(op); - } - errno = saved_errno; - return ret; 
-} - -int ftw64(const char *filename, __ftw64_func_t func, int descriptors) -{ - maybe_init_thread(); - struct Op op = {readdir_op_code, {.readdir = {.dir = create_path_lazy(AT_FDCWD, filename, 0), .child = NULL, .all_children = true}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - } - int ret = unwrapped_ftw64(filename, func, descriptors); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - if (ret != 0) - { - op.data.readdir.ferrno = saved_errno; - } - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -int nftw(const char *filename, __nftw_func_t func, int descriptors, int flag) -{ - maybe_init_thread(); - struct Op op = {readdir_op_code, {.readdir = {.dir = create_path_lazy(AT_FDCWD, filename, 0), .child = NULL, .all_children = true}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - } - int ret = unwrapped_nftw(filename, func, descriptors, flag); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - if (ret != 0) - { - op.data.readdir.ferrno = saved_errno; - } - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -int nftw64(const char *filename, __nftw64_func_t func, int descriptors, int flag) -{ - maybe_init_thread(); - struct Op op = {readdir_op_code, {.readdir = {.dir = create_path_lazy(AT_FDCWD, filename, 0), .child = NULL, .all_children = true}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - } - int ret = unwrapped_nftw64(filename, func, descriptors, flag); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - if (ret != 0) - { - op.data.readdir.ferrno = saved_errno; - } - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -int link(const char *oldname, const char *newname) -{ - maybe_init_thread(); - int ret = unwrapped_link(oldname, newname); - return ret; -} - -int linkat(int oldfd, const char *oldname, int newfd, const char *newname, int flags) -{ - maybe_init_thread(); - int ret = unwrapped_linkat(oldfd, oldname, newfd, newname, flags); - return ret; -} - -int symlink(const char *oldname, const char *newname) -{ - maybe_init_thread(); - int ret = unwrapped_symlink(oldname, newname); - return ret; -} - -int symlinkat(const char *target, int newdirfd, const char *linkpath) -{ - maybe_init_thread(); - int ret = unwrapped_symlinkat(target, newdirfd, linkpath); - return ret; -} - -ssize_t readlink(const char *filename, char *buffer, size_t size) -{ - maybe_init_thread(); - ssize_t ret = unwrapped_readlink(filename, buffer, size); - return ret; -} - -ssize_t readlinkat(int dirfd, const char *filename, char *buffer, size_t size) -{ - maybe_init_thread(); - ssize_t ret = unwrapped_readlinkat(dirfd, filename, buffer, size); - return ret; -} - -char * canonicalize_file_name(const char *name) -{ - maybe_init_thread(); - char * ret = unwrapped_canonicalize_file_name(name); - return ret; -} - -char * realpath(const char * restrict name, char * restrict resolved) -{ - maybe_init_thread(); - char * ret = unwrapped_realpath(name, resolved); - return ret; -} - -int unlink(const char *filename) -{ - maybe_init_thread(); - int ret = unwrapped_unlink(filename); - return ret; -} - -int rmdir(const char *filename) -{ - maybe_init_thread(); - int ret = unwrapped_rmdir(filename); - return ret; -} - -int remove(const char *filename) -{ - maybe_init_thread(); - int ret = unwrapped_remove(filename); - return ret; -} - -int rename(const char *oldname, const char *newname) -{ - maybe_init_thread(); - int ret = 
unwrapped_rename(oldname, newname); - return ret; -} - -int mkdir(const char *filename, mode_t mode) -{ - maybe_init_thread(); - int ret = unwrapped_mkdir(filename, mode); - return ret; -} - -int mkdirat(int dirfd, const char *pathname, mode_t mode) -{ - maybe_init_thread(); - int ret = unwrapped_mkdirat(dirfd, pathname, mode); - return ret; -} - -int stat(const char *filename, struct stat *buf) -{ - maybe_init_thread(); - struct Op op = {stat_op_code, {.stat = {.path = create_path_lazy(AT_FDCWD, filename, 0), .flags = 0, .ferrno = 0, .stat_result = {0}}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - } - int ret = unwrapped_stat(filename, buf); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - if (ret != 0) - { - op.data.stat.ferrno = saved_errno; - } - else - { - stat_result_from_stat(&op.data.stat.stat_result, buf); - } - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -int stat64(const char *filename, struct stat64 *buf) -{ - maybe_init_thread(); - struct Op op = {stat_op_code, {.stat = {.path = create_path_lazy(AT_FDCWD, filename, 0), .flags = 0, .stat_result = {0}, .ferrno = 0}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - } - int ret = unwrapped_stat64(filename, buf); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - if (ret != 0) - { - op.data.stat.ferrno = saved_errno; - } - else - { - stat_result_from_stat64(&op.data.stat.stat_result, buf); - } - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -int fstat(int filedes, struct stat *buf) -{ - maybe_init_thread(); - struct Op op = {stat_op_code, {.stat = {.path = create_path_lazy(filedes, "", AT_EMPTY_PATH), .flags = 0, .stat_result = {0}, .ferrno = 0}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - } - int ret = unwrapped_fstat(filedes, buf); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - if (ret != 0) - { - op.data.stat.ferrno = saved_errno; - } - else - { - stat_result_from_stat(&op.data.stat.stat_result, buf); - } - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -int fstat64(int filedes, struct stat64 * restrict buf) -{ - maybe_init_thread(); - struct Op op = {stat_op_code, {.stat = {.path = create_path_lazy(filedes, "", AT_EMPTY_PATH), .flags = 0, .stat_result = {0}, .ferrno = 0}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - } - int ret = unwrapped_fstat64(filedes, buf); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - if (ret != 0) - { - op.data.stat.ferrno = saved_errno; - } - else - { - stat_result_from_stat64(&op.data.stat.stat_result, buf); - } - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -int lstat(const char *filename, struct stat *buf) -{ - maybe_init_thread(); - struct Op op = {stat_op_code, {.stat = {.path = create_path_lazy(AT_FDCWD, filename, AT_SYMLINK_NOFOLLOW), .flags = AT_SYMLINK_NOFOLLOW, .stat_result = {0}, .ferrno = 0}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - } - int ret = unwrapped_lstat(filename, buf); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - if (ret != 0) - { - op.data.stat.ferrno = saved_errno; - } - else - { - stat_result_from_stat(&op.data.stat.stat_result, buf); - } - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -int lstat64(const char *filename, struct stat64 *buf) -{ - maybe_init_thread(); - struct Op op = {stat_op_code, {.stat = 
{.path = create_path_lazy(AT_FDCWD, filename, AT_SYMLINK_NOFOLLOW), .flags = AT_SYMLINK_NOFOLLOW, .stat_result = {0}, .ferrno = 0}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - } - int ret = unwrapped_lstat64(filename, buf); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - if (ret != 0) - { - op.data.stat.ferrno = saved_errno; - } - else - { - stat_result_from_stat64(&op.data.stat.stat_result, buf); - } - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -int statx(int dirfd, const char * restrict pathname, int flags, unsigned int mask, struct statx * restrict statxbuf) -{ - maybe_init_thread(); - struct Op op = {stat_op_code, {.stat = {.path = create_path_lazy(dirfd, pathname, flags), .flags = flags, .stat_result = {0}, .ferrno = 0}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - } - int ret = unwrapped_statx(dirfd, pathname, flags, mask, statxbuf); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - if (ret != 0) - { - op.data.stat.ferrno = saved_errno; - } - else - { - stat_result_from_statx(&op.data.stat.stat_result, statxbuf); - } - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -int fstatat(int dirfd, const char * restrict pathname, struct stat * restrict buf, int flags) -{ - maybe_init_thread(); - struct Op op = {stat_op_code, {.stat = {.path = create_path_lazy(dirfd, pathname, flags), .flags = flags, .stat_result = {0}, .ferrno = 0}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - } - int ret = unwrapped_fstatat(dirfd, pathname, buf, flags); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - if (ret != 0) - { - op.data.stat.ferrno = saved_errno; - } - else - { - stat_result_from_stat(&op.data.stat.stat_result, buf); - } - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -int fstatat64(int fd, const char * restrict file, struct stat64 * restrict buf, int flags) -{ - maybe_init_thread(); - struct Op op = {stat_op_code, {.stat = {.path = create_path_lazy(fd, file, flags), .flags = flags, .stat_result = {0}, .ferrno = 0}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - } - int ret = unwrapped_fstatat64(fd, file, buf, flags); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - if (ret != 0) - { - op.data.stat.ferrno = saved_errno; - } - else - { - stat_result_from_stat64(&op.data.stat.stat_result, buf); - } - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -int chown(const char *filename, uid_t owner, gid_t group) -{ - maybe_init_thread(); - struct Op op = {update_metadata_op_code, {.update_metadata = {.path = create_path_lazy(AT_FDCWD, filename, 0), .flags = 0, .kind = MetadataOwnership, .value = {.ownership = {.uid = owner, .gid = group}}, .ferrno = 0}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - } - int ret = unwrapped_chown(filename, owner, group); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - if (ret != 0) - { - op.data.update_metadata.ferrno = saved_errno; - } - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -int fchown(int filedes, uid_t owner, gid_t group) -{ - maybe_init_thread(); - struct Op op = {update_metadata_op_code, {.update_metadata = {.path = create_path_lazy(filedes, "", AT_EMPTY_PATH), .flags = AT_EMPTY_PATH, .kind = MetadataOwnership, .value = {.ownership = {.uid = owner, .gid = group}}, .ferrno = 0}}, {0}, 0, 0}; - if 
(likely(prov_log_is_enabled())) - { - prov_log_try(op); - } - int ret = unwrapped_fchown(filedes, owner, group); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - if (ret != 0) - { - op.data.update_metadata.ferrno = saved_errno; - } - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -int lchown(const char *pathname, uid_t owner, gid_t group) -{ - maybe_init_thread(); - struct Op op = {update_metadata_op_code, {.update_metadata = {.path = create_path_lazy(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW), .flags = AT_SYMLINK_NOFOLLOW, .kind = MetadataOwnership, .value = {.ownership = {.uid = owner, .gid = group}}, .ferrno = 0}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - } - int ret = unwrapped_lchown(pathname, owner, group); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - if (ret != 0) - { - op.data.update_metadata.ferrno = saved_errno; - } - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -int fchownat(int dirfd, const char *pathname, uid_t owner, gid_t group, int flags) -{ - maybe_init_thread(); - struct Op op = {update_metadata_op_code, {.update_metadata = {.path = create_path_lazy(dirfd, pathname, flags), .flags = flags, .kind = MetadataOwnership, .value = {.ownership = {.uid = owner, .gid = group}}, .ferrno = 0}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - } - int ret = unwrapped_fchownat(dirfd, pathname, owner, group, flags); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - if (ret != 0) - { - op.data.update_metadata.ferrno = saved_errno; - } - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -int chmod(const char *filename, mode_t mode) -{ - maybe_init_thread(); - struct Op op = {update_metadata_op_code, {.update_metadata = {.path = create_path_lazy(AT_FDCWD, filename, 0), .flags = 0, .kind = MetadataMode, .value = {.mode = mode}, .ferrno = 0}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - } - int ret = unwrapped_chmod(filename, mode); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - if (ret != 0) - { - op.data.update_metadata.ferrno = saved_errno; - } - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -int fchmod(int filedes, mode_t mode) -{ - maybe_init_thread(); - struct Op op = {update_metadata_op_code, {.update_metadata = {.path = create_path_lazy(filedes, "", AT_EMPTY_PATH), .flags = AT_EMPTY_PATH, .kind = MetadataMode, .value = {.mode = mode}, .ferrno = 0}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - } - int ret = unwrapped_fchmod(filedes, mode); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - if (ret != 0) - { - op.data.update_metadata.ferrno = saved_errno; - } - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -int fchmodat(int dirfd, const char *pathname, mode_t mode, int flags) -{ - maybe_init_thread(); - struct Op op = {update_metadata_op_code, {.update_metadata = {.path = create_path_lazy(dirfd, pathname, flags), .flags = flags, .kind = MetadataMode, .value = {.mode = mode}, .ferrno = 0}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - } - int ret = unwrapped_fchmodat(dirfd, pathname, mode, flags); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - if (ret != 0) - { - op.data.update_metadata.ferrno = saved_errno; - } - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -int access(const char *filename, int how) -{ - maybe_init_thread(); - struct 
Op op = {access_op_code, {.access = {create_path_lazy(AT_FDCWD, filename, 0), how, 0, 0}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - } - int ret = unwrapped_access(filename, how); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - op.data.access.ferrno = (ret == 0) ? (0) : (errno); - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -int faccessat(int dirfd, const char *pathname, int mode, int flags) -{ - maybe_init_thread(); - struct Op op = {access_op_code, {.access = {.path = create_path_lazy(dirfd, pathname, 0), .mode = mode, .flags = flags, .ferrno = 0}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - } - int ret = unwrapped_faccessat(dirfd, pathname, mode, flags); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - op.data.access.ferrno = (ret == 0) ? (0) : (errno); - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -int utime(const char *filename, const struct utimbuf *times) -{ - maybe_init_thread(); - struct Op op = {update_metadata_op_code, {.update_metadata = {.path = create_path_lazy(AT_FDCWD, filename, 0), .flags = 0, .kind = MetadataTimes, .value = {0}, .ferrno = 0}}, {0}, 0, 0}; - if (times) - { - op.data.update_metadata.value.times.is_null = false; - op.data.update_metadata.value.times.atime.tv_sec = times->actime; - op.data.update_metadata.value.times.mtime.tv_sec = times->modtime; - } - else - { - op.data.update_metadata.value.times.is_null = true; - } - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - } - int ret = unwrapped_utime(filename, times); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - if (ret != 0) - { - op.data.update_metadata.ferrno = saved_errno; - } - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -int utimes(const char *filename, const struct timeval tvp[2]) -{ - maybe_init_thread(); - struct Op op = {update_metadata_op_code, {.update_metadata = {.path = create_path_lazy(AT_FDCWD, filename, 0), .flags = 0, .kind = MetadataTimes, .value = {0}, .ferrno = 0}}, {0}, 0, 0}; - if (tvp) - { - op.data.update_metadata.value.times.is_null = false; - op.data.update_metadata.value.times.atime = tvp[0]; - op.data.update_metadata.value.times.mtime = tvp[1]; - } - else - { - op.data.update_metadata.value.times.is_null = true; - } - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - } - int ret = unwrapped_utimes(filename, tvp); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - if (ret != 0) - { - op.data.update_metadata.ferrno = saved_errno; - } - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -int lutimes(const char *filename, const struct timeval tvp[2]) -{ - maybe_init_thread(); - struct Op op = {update_metadata_op_code, {.update_metadata = {.path = create_path_lazy(AT_FDCWD, filename, AT_SYMLINK_NOFOLLOW), .flags = AT_SYMLINK_NOFOLLOW, .kind = MetadataTimes, .value = {0}, .ferrno = 0}}, {0}, 0, 0}; - if (tvp) - { - op.data.update_metadata.value.times.is_null = false; - op.data.update_metadata.value.times.atime = tvp[0]; - op.data.update_metadata.value.times.mtime = tvp[1]; - } - else - { - op.data.update_metadata.value.times.is_null = true; - } - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - } - int ret = unwrapped_lutimes(filename, tvp); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - if (ret != 0) - { - op.data.update_metadata.ferrno = saved_errno; - } - prov_log_record(op); - } - errno = saved_errno; - return ret; -}
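/* Editor's sketch: every wrapper in this generated file follows the same
   interposition pattern: build an Op describing the intended call, log it
   with prov_log_try() if provenance is enabled, invoke the real libc symbol
   previously resolved via dlsym(RTLD_NEXT, ...), then save errno around the
   prov_log_record() call and restore it before returning, so that logging
   can never clobber the error code the caller will inspect. Below is a
   minimal, self-contained illustration of that pattern; the fprintf logger
   is a hypothetical stand-in for prov_log_try/prov_log_record and is not
   part of this file. */

#define _GNU_SOURCE
#include <dlfcn.h>
#include <errno.h>
#include <stdio.h>
#include <unistd.h>

static int (*real_unlink)(const char *) = NULL;

int unlink(const char *path)
{
    /* Resolve the next definition of unlink in link order (the real libc one). */
    if (real_unlink == NULL)
    {
        real_unlink = (int (*)(const char *)) dlsym(RTLD_NEXT, "unlink");
    }
    int ret = real_unlink(path);
    int saved_errno = errno; /* the logging call below may modify errno */
    fprintf(stderr, "unlink(\"%s\") -> %d\n", path, ret);
    errno = saved_errno; /* restore the error code the caller expects to see */
    return ret;
}

/* Built and used as a preload library, e.g.:
     cc -shared -fPIC -o libinterpose.so interpose.c
     LD_PRELOAD=$PWD/libinterpose.so rm some-file
   (older glibc may additionally need -ldl for dlsym). */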
- -int futimes(int fd, const struct timeval tvp[2]) -{ - maybe_init_thread(); - struct Op op = {update_metadata_op_code, {.update_metadata = {.path = create_path_lazy(fd, "", AT_EMPTY_PATH), .flags = AT_EMPTY_PATH, .kind = MetadataTimes, .value = {0}, .ferrno = 0}}, {0}, 0, 0}; - if (tvp) - { - op.data.update_metadata.value.times.is_null = false; - op.data.update_metadata.value.times.atime = tvp[0]; - op.data.update_metadata.value.times.mtime = tvp[1]; - } - else - { - op.data.update_metadata.value.times.is_null = true; - } - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - } - int ret = unwrapped_futimes(fd, tvp); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - if (ret != 0) - { - op.data.update_metadata.ferrno = saved_errno; - } - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -int truncate(const char *filename, off_t length) -{ - maybe_init_thread(); - int ret = unwrapped_truncate(filename, length); - return ret; -} - -int truncate64(const char *name, off64_t length) -{ - maybe_init_thread(); - int ret = unwrapped_truncate64(name, length); - return ret; -} - -int ftruncate(int fd, off_t length) -{ - maybe_init_thread(); - int ret = unwrapped_ftruncate(fd, length); - return ret; -} - -int ftruncate64(int id, off64_t length) -{ - maybe_init_thread(); - int ret = unwrapped_ftruncate64(id, length); - return ret; -} - -int mknod(const char *filename, mode_t mode, dev_t dev) -{ - maybe_init_thread(); - int ret = unwrapped_mknod(filename, mode, dev); - return ret; -} - -FILE * tmpfile() -{ - maybe_init_thread(); - FILE * ret = unwrapped_tmpfile(); - return ret; -} - -FILE * tmpfile64() -{ - maybe_init_thread(); - FILE * ret = unwrapped_tmpfile64(); - return ret; -} - -char * tmpnam(char *result) -{ - maybe_init_thread(); - char * ret = unwrapped_tmpnam(result); - return ret; -} - -char * tmpnam_r(char *result) -{ - maybe_init_thread(); - char * ret = unwrapped_tmpnam_r(result); - return ret; -} - -char * tempnam(const char *dir, const char *prefix) -{ - maybe_init_thread(); - char * ret = unwrapped_tempnam(dir, prefix); - return ret; -} - -char * mktemp(char *template) -{ - maybe_init_thread(); - char * ret = unwrapped_mktemp(template); - return ret; -} - -int mkstemp(char *template) -{ - maybe_init_thread(); - int ret = unwrapped_mkstemp(template); - return ret; -} - -char * mkdtemp(char *template) -{ - maybe_init_thread(); - char * ret = unwrapped_mkdtemp(template); - return ret; -} - -int execv(const char *filename, char * const argv[]) -{ - maybe_init_thread(); - size_t argc = 0; - char * const *copied_argv = arena_copy_argv(get_data_arena(), argv, &argc); - size_t envc = 0; - char * const *updated_env = update_env_with_probe_vars(environ, &envc); - char * const *copied_updated_env = arena_copy_argv(get_data_arena(), updated_env, &envc); - struct Op op = {exec_op_code, {.exec = {.path = create_path_lazy(0, filename, 0), .ferrno = 0, .argc = argc, .argv = copied_argv, .envc = envc, .env = copied_updated_env}}, {0}, 0, 0}; - op.data.exec.argc = argc; - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - prov_log_save(); - } - else - { - prov_log_save(); - } - int ret = unwrapped_execvpe(filename, argv, updated_env); - int saved_errno = errno; - free((char **) updated_env); - if (likely(prov_log_is_enabled())) - { - assert(errno > 0); - op.data.exec.ferrno = saved_errno; - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -int execl(const char *filename, const char *arg0, ...) 
-{ - maybe_init_thread(); - size_t argc = COUNT_NONNULL_VARARGS(arg0); - char **argv = malloc((argc + 1) * (sizeof(char *))); - va_list ap; - va_start(ap, arg0); - for (size_t i = 0; i < argc; ++i) - { - argv[i] = va_arg(ap, __type_charp); - } - - va_end(ap); - argv[argc] = NULL; - char * const *copied_argv = arena_copy_argv(get_data_arena(), argv, &argc); - size_t envc = 0; - char * const *updated_env = update_env_with_probe_vars(environ, &envc); - char * const *copied_updated_env = arena_copy_argv(get_data_arena(), updated_env, &envc); - struct Op op = {exec_op_code, {.exec = {.path = create_path_lazy(0, filename, 0), .ferrno = 0, .argc = argc, .argv = copied_argv, .envc = envc, .env = copied_updated_env}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - prov_log_save(); - } - else - { - prov_log_save(); - } - int ret = unwrapped_execvpe(filename, argv, updated_env); - int saved_errno = errno; - free((char **) updated_env); - free((char **) argv); - if (likely(prov_log_is_enabled())) - { - assert(errno > 0); - op.data.exec.ferrno = saved_errno; - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -int execve(const char *filename, char * const argv[], char * const env[]) -{ - maybe_init_thread(); - size_t argc = 0; - char * const *copied_argv = arena_copy_argv(get_data_arena(), argv, &argc); - size_t envc = 0; - char * const *updated_env = update_env_with_probe_vars(env, &envc); - char * const *copied_updated_env = arena_copy_argv(get_data_arena(), updated_env, &envc); - struct Op op = {exec_op_code, {.exec = {.path = create_path_lazy(0, filename, 0), .ferrno = 0, .argc = argc, .argv = copied_argv, .envc = envc, .env = copied_updated_env}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - prov_log_save(); - } - else - { - prov_log_save(); - } - int ret = unwrapped_execvpe(filename, argv, updated_env); - int saved_errno = errno; - free((char **) updated_env); - if (likely(prov_log_is_enabled())) - { - assert(errno > 0); - op.data.exec.ferrno = saved_errno; - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -int fexecve(int fd, char * const argv[], char * const env[]) -{ - maybe_init_thread(); - size_t argc = 0; - char * const *copied_argv = arena_copy_argv(get_data_arena(), argv, &argc); - size_t envc = 0; - char * const *updated_env = update_env_with_probe_vars(env, &envc); - char * const *copied_updated_env = arena_copy_argv(get_data_arena(), updated_env, &envc); - struct Op op = {exec_op_code, {.exec = {.path = create_path_lazy(fd, "", AT_EMPTY_PATH), .ferrno = 0, .argc = argc, .argv = copied_argv, .envc = envc, .env = copied_updated_env}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - prov_log_save(); - } - else - { - prov_log_save(); - } - int ret = unwrapped_fexecve(fd, argv, updated_env); - int saved_errno = errno; - free((char **) updated_env); - if (likely(prov_log_is_enabled())) - { - assert(errno > 0); - op.data.exec.ferrno = saved_errno; - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -int execle(const char *filename, const char *arg0, ...) 
-{ - maybe_init_thread(); - size_t argc = COUNT_NONNULL_VARARGS(arg0) - 1; - char **argv = malloc((argc + 1) * (sizeof(char *))); - va_list ap; - va_start(ap, arg0); - for (size_t i = 0; i < argc; ++i) - { - argv[i] = va_arg(ap, __type_charp); - } - - argv[argc] = NULL; - char * const *copied_argv = arena_copy_argv(get_data_arena(), argv, &argc); - char **env = va_arg(ap, __type_charpp); - va_end(ap); - size_t envc = 0; - char * const *updated_env = update_env_with_probe_vars(env, &envc); - char * const *copied_updated_env = arena_copy_argv(get_data_arena(), updated_env, &envc); - struct Op op = {exec_op_code, {.exec = {.path = create_path_lazy(0, filename, 0), .ferrno = 0, .argc = argc, .argv = copied_argv, .envc = envc, .env = copied_updated_env}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - prov_log_save(); - } - else - { - prov_log_save(); - } - ERROR("Not implemented; I need to figure out how to update the environment."); - int ret = unwrapped_execvpe(filename, argv, updated_env); - int saved_errno = errno; - free((char **) updated_env); - free((char **) argv); - if (likely(prov_log_is_enabled())) - { - assert(errno > 0); - op.data.exec.ferrno = saved_errno; - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -int execvp(const char *filename, char * const argv[]) -{ - maybe_init_thread(); - char *bin_path = arena_calloc(get_data_arena(), PATH_MAX + 1, sizeof(char)); - bool found = lookup_on_path(filename, bin_path); - size_t argc = 0; - char * const *copied_argv = arena_copy_argv(get_data_arena(), argv, &argc); - size_t envc = 0; - char * const *updated_env = update_env_with_probe_vars(environ, &envc); - char * const *copied_updated_env = arena_copy_argv(get_data_arena(), updated_env, &envc); - struct Op op = {exec_op_code, {.exec = {.path = (found) ? (create_path_lazy(0, bin_path, 0)) : (null_path), .ferrno = 0, .argc = argc, .argv = copied_argv, .envc = envc, .env = copied_updated_env}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - prov_log_save(); - } - else - { - prov_log_save(); - } - int ret = unwrapped_execvpe(filename, argv, updated_env); - int saved_errno = errno; - free((char **) updated_env); - if (likely(prov_log_is_enabled())) - { - assert(errno > 0); - op.data.exec.ferrno = saved_errno; - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -int execlp(const char *filename, const char *arg0, ...) -{ - maybe_init_thread(); - char *bin_path = arena_calloc(get_data_arena(), PATH_MAX + 1, sizeof(char)); - bool found = lookup_on_path(filename, bin_path); - size_t argc = COUNT_NONNULL_VARARGS(arg0); - char **argv = malloc((argc + 1) * (sizeof(char *))); - va_list ap; - va_start(ap, arg0); - for (size_t i = 0; i < argc; ++i) - { - argv[i] = va_arg(ap, __type_charp); - } - - argv[argc] = NULL; - va_end(ap); - char * const *copied_argv = arena_copy_argv(get_data_arena(), argv, &argc); - size_t envc = 0; - char * const *updated_env = update_env_with_probe_vars(environ, &envc); - char * const *copied_updated_env = arena_copy_argv(get_data_arena(), updated_env, &envc); - struct Op op = {exec_op_code, {.exec = {.path = (found) ? 
(create_path_lazy(0, bin_path, 0)) : (null_path), .ferrno = 0, .argc = argc, .argv = copied_argv, .envc = envc, .env = copied_updated_env}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - prov_log_save(); - } - else - { - prov_log_save(); - } - int ret = unwrapped_execvpe(filename, argv, updated_env); - int saved_errno = errno; - free((char **) updated_env); - free((char **) argv); - if (likely(prov_log_is_enabled())) - { - assert(errno > 0); - op.data.exec.ferrno = saved_errno; - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -int execvpe(const char *filename, char * const argv[], char * const envp[]) -{ - maybe_init_thread(); - char *bin_path = arena_calloc(get_data_arena(), PATH_MAX + 1, sizeof(char)); - bool found = lookup_on_path(filename, bin_path); - size_t argc = 0; - char * const *copied_argv = arena_copy_argv(get_data_arena(), argv, &argc); - size_t envc = 0; - char * const *updated_env = update_env_with_probe_vars(envp, &envc); - char * const *copied_updated_env = arena_copy_argv(get_data_arena(), updated_env, &envc); - struct Op op = {exec_op_code, {.exec = {.path = (found) ? (create_path_lazy(0, bin_path, 0)) : (null_path), .ferrno = 0, .argc = argc, .argv = copied_argv, .envc = envc, .env = copied_updated_env}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - prov_log_save(); - } - else - { - prov_log_save(); - } - int ret = unwrapped_execvpe(filename, argv, updated_env); - int saved_errno = errno; - free((char **) updated_env); - if (likely(prov_log_is_enabled())) - { - assert(errno > 0); - op.data.exec.ferrno = saved_errno; - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -pid_t fork() -{ - maybe_init_thread(); - struct Op op = {clone_op_code, {.clone = {.flags = 0, .run_pthread_atfork_handlers = true, .task_type = TASK_PID, .task_id = -1, .ferrno = 0}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - prov_log_save(); - } - else - { - prov_log_save(); - } - pid_t ret = unwrapped_fork(); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - if (unlikely(ret == (-1))) - { - op.data.clone.ferrno = saved_errno; - prov_log_record(op); - } - else - if (ret == 0) - { - reinit_process(); - } - else - { - op.data.clone.task_id = ret; - prov_log_record(op); - } - } - errno = saved_errno; - return ret; -} - -pid_t _Fork() -{ - maybe_init_thread(); - struct Op op = {clone_op_code, {.clone = {.flags = 0, .run_pthread_atfork_handlers = false, .task_type = TASK_PID, .task_id = 0, .ferrno = 0}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - prov_log_save(); - } - else - { - prov_log_save(); - } - pid_t ret = unwrapped__Fork(); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - if (unlikely(ret == (-1))) - { - op.data.clone.ferrno = saved_errno; - prov_log_record(op); - } - else - if (ret == 0) - { - reinit_process(); - } - else - { - op.data.clone.task_id = ret; - prov_log_record(op); - } - } - errno = saved_errno; - return ret; -} - -pid_t vfork() -{ - maybe_init_thread(); - struct Op op = {clone_op_code, {.clone = {.flags = 0, .run_pthread_atfork_handlers = true, .task_type = TASK_PID, .task_id = 0, .ferrno = 0}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - prov_log_save(); - } - else - { - prov_log_save(); - } - int ret = unwrapped_fork(); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - if (unlikely(ret == (-1))) - { - op.data.clone.ferrno = 
saved_errno; - prov_log_record(op); - } - else - if (ret == 0) - { - reinit_process(); - } - else - { - op.data.clone.task_id = ret; - prov_log_record(op); - } - } - errno = saved_errno; - return ret; -} - -int clone(fn_ptr_int_void_ptr fn, void *stack, int flags, void *arg, ...) -{ - maybe_init_thread(); - (void) fn; - (void) stack; - (void) arg; - flags = flags & (~CLONE_VFORK); - struct Op op = {clone_op_code, {.clone = {.flags = flags, .run_pthread_atfork_handlers = false, .task_type = (flags & CLONE_THREAD) ? (TASK_TID) : (TASK_PID), .task_id = 0, .ferrno = 0}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - prov_log_save(); - if ((flags & CLONE_THREAD) != (flags & CLONE_VM)) - { - NOT_IMPLEMENTED("I conflate cloning a new thread (resulting in a process with the same PID, new TID) with sharing the memory space. If CLONE_SIGHAND is set, then Linux asserts CLONE_THREAD == CLONE_VM; If it is not set and CLONE_THREAD != CLONE_VM, by a real application, I will consider disentangling the assumptions (required to support this combination)."); - } - } - else - { - prov_log_save(); - } - size_t varargs_size = ((((((sizeof(void *)) + (sizeof(void *))) + (sizeof(int))) + ((COUNT_NONNULL_VARARGS(arg) + 1) * (sizeof(void *)))) + (sizeof(pid_t *))) + (sizeof(void *))) + (sizeof(pid_t *)); - int ret = *((int *) __builtin_apply((void (*)()) unwrapped_clone, __builtin_apply_args(), varargs_size)); - int saved_errno = errno; - if (unlikely(ret == (-1))) - { - if (likely(prov_log_is_enabled())) - { - op.data.clone.ferrno = saved_errno; - prov_log_record(op); - } - } - else - if (ret == 0) - { - if (flags & CLONE_THREAD) - { - maybe_init_thread(); - } - else - { - reinit_process(); - } - } - else - { - if (likely(prov_log_is_enabled())) - { - op.data.clone.task_id = ret; - prov_log_record(op); - } - } - errno = saved_errno; - return ret; -} - -pid_t waitpid(pid_t pid, int *status_ptr, int options) -{ - maybe_init_thread(); - int status; - if (status_ptr == NULL) - { - status_ptr = &status; - } - struct Op op = {wait_op_code, {.wait = {.task_type = TASK_PID, .task_id = 0, .options = options, .status = 0, .ferrno = 0}}, {0}, 0, 0}; - prov_log_try(op); - pid_t ret = unwrapped_waitpid(pid, status_ptr, options); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - if (unlikely(ret == (-1))) - { - op.data.wait.ferrno = saved_errno; - } - else - { - op.data.wait.task_id = ret; - op.data.wait.status = *status_ptr; - } - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -pid_t wait(int *status_ptr) -{ - maybe_init_thread(); - int status; - if (status_ptr == NULL) - { - status_ptr = &status; - } - struct Op op = {wait_op_code, {.wait = {.task_type = TASK_PID, .task_id = -1, .options = 0, .status = 0, .ferrno = 0}}, {0}, 0, 0}; - prov_log_try(op); - pid_t ret = unwrapped_wait(status_ptr); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - if (unlikely(ret == (-1))) - { - op.data.wait.ferrno = saved_errno; - } - else - { - op.data.wait.task_id = ret; - op.data.wait.status = *status_ptr; - } - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -pid_t wait4(pid_t pid, int *status_ptr, int options, struct rusage *usage) -{ - maybe_init_thread(); - struct Op wait_op = {wait_op_code, {.wait = {.task_type = TASK_TID, .task_id = 0, .options = options, .status = 0, .ferrno = 0}}, {0}, 0, 0}; - prov_log_try(wait_op); - struct Op getrusage_op = {getrusage_op_code, {.getrusage = {.waitpid_arg = pid, .getrusage_arg = 0, .usage = 
{{0}}, .ferrno = 0}}, {0}, 0, 0}; - if (usage) - { - prov_log_try(getrusage_op); - } - pid_t ret = unwrapped_wait4(pid, status_ptr, options, usage); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - if (unlikely(ret == (-1))) - { - wait_op.data.wait.ferrno = saved_errno; - if (usage) - { - getrusage_op.data.getrusage.ferrno = saved_errno; - } - } - else - { - wait_op.data.wait.task_id = ret; - if (status_ptr) - { - wait_op.data.wait.status = *status_ptr; - } - if (usage) - { - memcpy(&getrusage_op.data.getrusage.usage, usage, sizeof(struct rusage)); - } - } - prov_log_record(wait_op); - if (usage) - { - prov_log_record(getrusage_op); - } - } - errno = saved_errno; - return ret; -} - -pid_t wait3(int *status_ptr, int options, struct rusage *usage) -{ - maybe_init_thread(); - struct Op wait_op = {wait_op_code, {.wait = {.task_type = TASK_PID, .task_id = 0, .options = options, .status = 0, .ferrno = 0}}, {0}, 0, 0}; - prov_log_try(wait_op); - struct Op getrusage_op = {getrusage_op_code, {.getrusage = {.waitpid_arg = -1, .getrusage_arg = 0, .usage = {{0}}, .ferrno = 0}}, {0}, 0, 0}; - if (usage) - { - prov_log_try(getrusage_op); - } - pid_t ret = unwrapped_wait3(status_ptr, options, usage); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - if (unlikely(ret == (-1))) - { - wait_op.data.wait.ferrno = saved_errno; - if (usage) - { - getrusage_op.data.getrusage.ferrno = saved_errno; - } - } - else - { - wait_op.data.wait.task_id = ret; - if (status_ptr) - { - wait_op.data.wait.status = *status_ptr; - } - if (usage) - { - memcpy(&getrusage_op.data.getrusage.usage, usage, sizeof(struct rusage)); - } - } - prov_log_record(wait_op); - if (usage) - { - prov_log_record(getrusage_op); - } - } - errno = saved_errno; - return ret; -} - -int waitid(idtype_t idtype, id_t id, siginfo_t *infop, int options) -{ - maybe_init_thread(); - struct Op wait_op = {wait_op_code, {.wait = {.task_type = TASK_TID, .task_id = 0, .options = options, .status = 0, .ferrno = 0}}, {0}, 0, 0}; - prov_log_try(wait_op); - int ret = unwrapped_waitid(idtype, id, infop, options); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - if (unlikely(ret == (-1))) - { - wait_op.data.wait.ferrno = saved_errno; - } - else - { - wait_op.data.wait.task_id = infop->si_pid; - wait_op.data.wait.status = infop->si_status; - } - prov_log_record(wait_op); - } - errno = saved_errno; - return ret; -} - -int thrd_create(thrd_t *thr, thrd_start_t func, void *arg) -{ - maybe_init_thread(); - struct Op op = {clone_op_code, {.clone = {.flags = (((((CLONE_FILES | CLONE_FS) | CLONE_IO) | CLONE_PARENT) | CLONE_SIGHAND) | CLONE_THREAD) | CLONE_VM, .task_type = TASK_ISO_C_THREAD, .task_id = 0, .run_pthread_atfork_handlers = false, .ferrno = 0}}, {0}, 0, 0}; - int ret = unwrapped_thrd_create(thr, func, arg); - int saved_errno = errno; - if (unlikely(ret != thrd_success)) - { - if (likely(prov_log_is_enabled())) - { - op.data.clone.ferrno = saved_errno; - prov_log_record(op); - } - } - else - { - if (likely(prov_log_is_enabled())) - { - op.data.clone.task_id = *thr; - prov_log_record(op); - } - } - errno = saved_errno; - return ret; -} - -int thrd_join(thrd_t thr, int *res) -{ - maybe_init_thread(); - struct Op op = {wait_op_code, {.wait = {.task_type = TASK_ISO_C_THREAD, .task_id = thr, .options = 0, .status = 0, .ferrno = 0}}, {0}, 0, 0}; - int ret = unwrapped_thrd_join(thr, res); - int saved_errno = errno; - if (unlikely(ret != thrd_success)) - { - if (likely(prov_log_is_enabled())) - { - op.data.wait.ferrno = saved_errno; - prov_log_record(op); - } - } - 
else - { - if (res) - { - op.data.wait.status = *res; - } - if (likely(prov_log_is_enabled())) - { - prov_log_record(op); - } - } - errno = saved_errno; - return ret; -} - -int pthread_create(pthread_t * restrict thread, const pthread_attr_t * restrict attr, void *(*start_routine)(void *), void * restrict arg) -{ - maybe_init_thread(); - struct Op op = {clone_op_code, {.clone = {.flags = (((((CLONE_FILES | CLONE_FS) | CLONE_IO) | CLONE_PARENT) | CLONE_SIGHAND) | CLONE_THREAD) | CLONE_VM, .task_type = TASK_PTHREAD, .task_id = 0, .run_pthread_atfork_handlers = false, .ferrno = 0}}, {0}, 0, 0}; - int ret = unwrapped_pthread_create(thread, attr, start_routine, arg); - int saved_errno = errno; - if (unlikely(ret != 0)) - { - if (likely(prov_log_is_enabled())) - { - op.data.clone.ferrno = saved_errno; - prov_log_record(op); - } - } - else - { - if (likely(prov_log_is_enabled())) - { - op.data.clone.task_id = *thread; - prov_log_record(op); - } - } - errno = saved_errno; - return ret; -} - -int pthread_join(pthread_t thread, void **retval) -{ - maybe_init_thread(); - struct Op op = {wait_op_code, {.wait = {.task_type = TASK_PTHREAD, .task_id = thread, .options = 0, .status = 0, .ferrno = 0}}, {0}, 0, 0}; - int ret = unwrapped_pthread_join(thread, retval); - int saved_errno = errno; - if (unlikely(ret != 0)) - { - if (likely(prov_log_is_enabled())) - { - op.data.wait.ferrno = saved_errno; - prov_log_record(op); - } - } - else - { - if (likely(prov_log_is_enabled())) - { - prov_log_record(op); - } - } - errno = saved_errno; - return ret; -} - -FILE * fopen64(const char *filename, const char *opentype) -{ - maybe_init_thread(); - struct Op op = {open_op_code, {.open = {.path = create_path_lazy(AT_FDCWD, filename, 0), .flags = fopen_to_flags(opentype), .mode = 0, .fd = -1, .ferrno = 0}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - } - FILE * ret = unwrapped_fopen64(filename, opentype); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - if (ret == NULL) - { - op.data.open.ferrno = saved_errno; - } - else - { - op.data.open.fd = fileno(ret); - } - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -FILE * freopen64(const char *filename, const char *opentype, FILE *stream) -{ - maybe_init_thread(); - int original_fd = fileno(stream); - struct Op open_op = {open_op_code, {.open = {.path = create_path_lazy(AT_FDCWD, filename, 0), .flags = fopen_to_flags(opentype), .mode = 0, .fd = -1, .ferrno = 0}}, {0}, 0, 0}; - struct Op close_op = {close_op_code, {.close = {original_fd, original_fd, 0}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(open_op); - prov_log_try(close_op); - } - FILE * ret = unwrapped_freopen64(filename, opentype, stream); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - if (ret == NULL) - { - open_op.data.open.ferrno = saved_errno; - close_op.data.close.ferrno = saved_errno; - } - else - { - open_op.data.open.fd = fileno(ret); - } - prov_log_record(open_op); - prov_log_record(close_op); - } - errno = saved_errno; - return ret; -} - -int openat64(int dirfd, const char *filename, int flags, ...) -{ - maybe_init_thread(); - bool has_mode_arg = ((flags & O_CREAT) != 0) || ((flags & __O_TMPFILE) == __O_TMPFILE); - struct Op op = {open_op_code, {.open = {.path = create_path_lazy(dirfd, filename, (flags & O_NOFOLLOW) ? 
(AT_SYMLINK_NOFOLLOW) : (0)), .flags = flags, .mode = 0, .fd = -1, .ferrno = 0}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - if (has_mode_arg) - { - va_list ap; - va_start(ap, flags); - op.data.open.mode = va_arg(ap, __type_mode_t); - va_end(ap); - } - prov_log_try(op); - } - size_t varargs_size = (((sizeof(dirfd)) + (sizeof(filename))) + (sizeof(flags))) + ((has_mode_arg) ? (sizeof(mode_t)) : (0)); - int ret = *((int *) __builtin_apply((void (*)()) unwrapped_openat64, __builtin_apply_args(), varargs_size)); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - op.data.open.ferrno = (unlikely(ret == (-1))) ? (errno) : (0); - op.data.open.fd = ret; - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -int open64(const char *filename, int flags, ...) -{ - maybe_init_thread(); - bool has_mode_arg = ((flags & O_CREAT) != 0) || ((flags & __O_TMPFILE) == __O_TMPFILE); - struct Op op = {open_op_code, {.open = {.path = create_path_lazy(AT_FDCWD, filename, (flags & O_NOFOLLOW) ? (AT_SYMLINK_NOFOLLOW) : (0)), .flags = flags, .mode = 0, .fd = -1, .ferrno = 0}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - if (has_mode_arg) - { - va_list ap; - va_start(ap, flags); - op.data.open.mode = va_arg(ap, __type_mode_t); - va_end(ap); - } - prov_log_try(op); - } - size_t varargs_size = ((sizeof(filename)) + (sizeof(flags))) + ((has_mode_arg) ? (sizeof(mode_t)) : (0)); - int ret = *((int *) __builtin_apply((void (*)()) unwrapped_open64, __builtin_apply_args(), varargs_size)); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - op.data.open.ferrno = (unlikely(ret == (-1))) ? (errno) : (0); - op.data.open.fd = ret; - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - -int create64(const char *filename, mode_t mode) -{ - maybe_init_thread(); - struct Op op = {open_op_code, {.open = {.path = create_path_lazy(AT_FDCWD, filename, 0), .flags = (O_WRONLY | O_CREAT) | O_TRUNC, .mode = mode, .fd = -1, .ferrno = 0}}, {0}, 0, 0}; - if (likely(prov_log_is_enabled())) - { - prov_log_try(op); - } - int ret = unwrapped_create64(filename, mode); - int saved_errno = errno; - if (likely(prov_log_is_enabled())) - { - op.data.open.ferrno = (unlikely(ret == (-1))) ? 
(errno) : (0); - op.data.open.fd = ret; - prov_log_record(op); - } - errno = saved_errno; - return ret; -} - diff --git a/probe_src/libprobe/generated/libc_hooks.h b/probe_src/libprobe/generated/libc_hooks.h deleted file mode 100644 index c85c983f..00000000 --- a/probe_src/libprobe/generated/libc_hooks.h +++ /dev/null @@ -1,109 +0,0 @@ -static FILE * (*unwrapped_fopen)(const char *filename, const char *opentype); -static FILE * (*unwrapped_freopen)(const char *filename, const char *opentype, FILE *stream); -static int (*unwrapped_fclose)(FILE *stream); -static int (*unwrapped_fcloseall)(); -static int (*unwrapped_openat)(int dirfd, const char *filename, int flags, ...); -static int (*unwrapped_open)(const char *filename, int flags, ...); -static int (*unwrapped_creat)(const char *filename, mode_t mode); -static int (*unwrapped_close)(int filedes); -static int (*unwrapped_close_range)(unsigned int lowfd, unsigned int maxfd, int flags); -static void (*unwrapped_closefrom)(int lowfd); -static int (*unwrapped_dup)(int old); -static int (*unwrapped_dup2)(int old, int new); -static int (*unwrapped_dup3)(int old, int new, int flags); -static int (*unwrapped_fcntl)(int filedes, int command, ...); -static int (*unwrapped_chdir)(const char *filename); -static int (*unwrapped_fchdir)(int filedes); -static DIR * (*unwrapped_opendir)(const char *dirname); -static DIR * (*unwrapped_fdopendir)(int fd); -static struct dirent * (*unwrapped_readdir)(DIR *dirstream); -static int (*unwrapped_readdir_r)(DIR *dirstream, struct dirent *entry, struct dirent **result); -static struct dirent64 * (*unwrapped_readdir64)(DIR *dirstream); -static int (*unwrapped_readdir64_r)(DIR *dirstream, struct dirent64 *entry, struct dirent64 **result); -static int (*unwrapped_closedir)(DIR *dirstream); -static void (*unwrapped_rewinddir)(DIR *dirstream); -static long int (*unwrapped_telldir)(DIR *dirstream); -static void (*unwrapped_seekdir)(DIR *dirstream, long int pos); -static int (*unwrapped_scandir)(const char *dir, struct dirent ***namelist, int (*selector)(const struct dirent *), int (*cmp)(const struct dirent **, const struct dirent **)); -static int (*unwrapped_scandir64)(const char *dir, struct dirent64 ***namelist, int (*selector)(const struct dirent64 *), int (*cmp)(const struct dirent64 **, const struct dirent64 **)); -static int (*unwrapped_scandirat)(int dirfd, const char * restrict dirp, struct dirent *** restrict namelist, int (*filter)(const struct dirent *), int (*compar)(const struct dirent **, const struct dirent **)); -static ssize_t (*unwrapped_getdents64)(int fd, void *buffer, size_t length); -static int (*unwrapped_ftw)(const char *filename, __ftw_func_t func, int descriptors); -static int (*unwrapped_ftw64)(const char *filename, __ftw64_func_t func, int descriptors); -static int (*unwrapped_nftw)(const char *filename, __nftw_func_t func, int descriptors, int flag); -static int (*unwrapped_nftw64)(const char *filename, __nftw64_func_t func, int descriptors, int flag); -static int (*unwrapped_link)(const char *oldname, const char *newname); -static int (*unwrapped_linkat)(int oldfd, const char *oldname, int newfd, const char *newname, int flags); -static int (*unwrapped_symlink)(const char *oldname, const char *newname); -static int (*unwrapped_symlinkat)(const char *target, int newdirfd, const char *linkpath); -static ssize_t (*unwrapped_readlink)(const char *filename, char *buffer, size_t size); -static ssize_t (*unwrapped_readlinkat)(int dirfd, const char *filename, char *buffer, size_t size); -static 
char * (*unwrapped_canonicalize_file_name)(const char *name); -static char * (*unwrapped_realpath)(const char * restrict name, char * restrict resolved); -static int (*unwrapped_unlink)(const char *filename); -static int (*unwrapped_rmdir)(const char *filename); -static int (*unwrapped_remove)(const char *filename); -static int (*unwrapped_rename)(const char *oldname, const char *newname); -static int (*unwrapped_mkdir)(const char *filename, mode_t mode); -static int (*unwrapped_mkdirat)(int dirfd, const char *pathname, mode_t mode); -static int (*unwrapped_stat)(const char *filename, struct stat *buf); -static int (*unwrapped_stat64)(const char *filename, struct stat64 *buf); -static int (*unwrapped_fstat)(int filedes, struct stat *buf); -static int (*unwrapped_fstat64)(int filedes, struct stat64 * restrict buf); -static int (*unwrapped_lstat)(const char *filename, struct stat *buf); -static int (*unwrapped_lstat64)(const char *filename, struct stat64 *buf); -static int (*unwrapped_statx)(int dirfd, const char * restrict pathname, int flags, unsigned int mask, struct statx * restrict statxbuf); -static int (*unwrapped_fstatat)(int dirfd, const char * restrict pathname, struct stat * restrict buf, int flags); -static int (*unwrapped_fstatat64)(int fd, const char * restrict file, struct stat64 * restrict buf, int flags); -static int (*unwrapped_chown)(const char *filename, uid_t owner, gid_t group); -static int (*unwrapped_fchown)(int filedes, uid_t owner, gid_t group); -static int (*unwrapped_lchown)(const char *pathname, uid_t owner, gid_t group); -static int (*unwrapped_fchownat)(int dirfd, const char *pathname, uid_t owner, gid_t group, int flags); -static int (*unwrapped_chmod)(const char *filename, mode_t mode); -static int (*unwrapped_fchmod)(int filedes, mode_t mode); -static int (*unwrapped_fchmodat)(int dirfd, const char *pathname, mode_t mode, int flags); -static int (*unwrapped_access)(const char *filename, int how); -static int (*unwrapped_faccessat)(int dirfd, const char *pathname, int mode, int flags); -static int (*unwrapped_utime)(const char *filename, const struct utimbuf *times); -static int (*unwrapped_utimes)(const char *filename, const struct timeval tvp[2]); -static int (*unwrapped_lutimes)(const char *filename, const struct timeval tvp[2]); -static int (*unwrapped_futimes)(int fd, const struct timeval tvp[2]); -static int (*unwrapped_truncate)(const char *filename, off_t length); -static int (*unwrapped_truncate64)(const char *name, off64_t length); -static int (*unwrapped_ftruncate)(int fd, off_t length); -static int (*unwrapped_ftruncate64)(int id, off64_t length); -static int (*unwrapped_mknod)(const char *filename, mode_t mode, dev_t dev); -static FILE * (*unwrapped_tmpfile)(); -static FILE * (*unwrapped_tmpfile64)(); -static char * (*unwrapped_tmpnam)(char *result); -static char * (*unwrapped_tmpnam_r)(char *result); -static char * (*unwrapped_tempnam)(const char *dir, const char *prefix); -static char * (*unwrapped_mktemp)(char *template); -static int (*unwrapped_mkstemp)(char *template); -static char * (*unwrapped_mkdtemp)(char *template); -static int (*unwrapped_execv)(const char *filename, char * const argv[]); -static int (*unwrapped_execl)(const char *filename, const char *arg0, ...); -static int (*unwrapped_execve)(const char *filename, char * const argv[], char * const env[]); -static int (*unwrapped_fexecve)(int fd, char * const argv[], char * const env[]); -static int (*unwrapped_execle)(const char *filename, const char *arg0, ...); -static int 
(*unwrapped_execvp)(const char *filename, char * const argv[]); -static int (*unwrapped_execlp)(const char *filename, const char *arg0, ...); -static int (*unwrapped_execvpe)(const char *filename, char * const argv[], char * const envp[]); -static pid_t (*unwrapped_fork)(); -static pid_t (*unwrapped__Fork)(); -static pid_t (*unwrapped_vfork)(); -static int (*unwrapped_clone)(fn_ptr_int_void_ptr fn, void *stack, int flags, void *arg, ...); -static pid_t (*unwrapped_waitpid)(pid_t pid, int *status_ptr, int options); -static pid_t (*unwrapped_wait)(int *status_ptr); -static pid_t (*unwrapped_wait4)(pid_t pid, int *status_ptr, int options, struct rusage *usage); -static pid_t (*unwrapped_wait3)(int *status_ptr, int options, struct rusage *usage); -static int (*unwrapped_waitid)(idtype_t idtype, id_t id, siginfo_t *infop, int options); -static int (*unwrapped_thrd_create)(thrd_t *thr, thrd_start_t func, void *arg); -static int (*unwrapped_thrd_join)(thrd_t thr, int *res); -static int (*unwrapped_pthread_create)(pthread_t * restrict thread, const pthread_attr_t * restrict attr, void *(*start_routine)(void *), void * restrict arg); -static int (*unwrapped_pthread_join)(pthread_t thread, void **retval); -static FILE * (*unwrapped_fopen64)(const char *filename, const char *opentype); -static FILE * (*unwrapped_freopen64)(const char *filename, const char *opentype, FILE *stream); -static int (*unwrapped_openat64)(int dirfd, const char *filename, int flags, ...); -static int (*unwrapped_open64)(const char *filename, int flags, ...); -static int (*unwrapped_create64)(const char *filename, mode_t mode); diff --git a/probe_src/libprobe/src/fd_table.c b/probe_src/libprobe/src/fd_table.c deleted file mode 100644 index 0118ae63..00000000 --- a/probe_src/libprobe/src/fd_table.c +++ /dev/null @@ -1,173 +0,0 @@ -/* - * This file is responsible for tracking the process-global mapping between - * file-descriptors and file paths. - * Since this is process-global, access must be mediate by the readers/writers lock __fd_table_lock. - * __fd_table is dynamic array of capacity __fd_table_size_factor * i - * */ - -const int __fd_table_size_factor = 1024; -static int __fd_table_capacity = 0; -static OWNED struct { - int dirfd; - int dirfd_version; /* because the directory indicated by dirfd can change, especially if dirfd == AT_FDCWD! */ - int fd; - int version; - OWNED const char* path; - /* struct InodeTriple inode_triple; TODO */ -} * __fd_table = NULL; -static pthread_rwlock_t __fd_table_lock = PTHREAD_RWLOCK_INITIALIZER; - -static int __map_fd(int fd) { - /* - * I want to be able to store AT_FDCWD in the fd_table, but AT_FDCWD is negative... - * So I will just shift the FDs over by one (0..N maps to 1..(N+1), with AT_FDCWD mapping to 0). - * But first, I will rule out -1, which is truly an error value */ - assert(fd != -1); - return (fd == AT_FDCWD) ? 0 : (fd + 1); -} - -static int __unmap_fd(int fd) { - return (fd == 0) ? AT_FDCWD : (fd - 1); -} - -static void __fd_table_ensure_capacity(int mapped_fd) { - if (unlikely(mapped_fd >= __fd_table_capacity)) { - size_t new_fd_table_capacity = __fd_table_size_factor * (mapped_fd / __fd_table_size_factor + 1); - /* This allocation is never freed, because it tracks process-global state. - * It's relatively small, O(max(file descriptor used by tracee)) - * - * Note that recalloc/realloc(NULL, size) is the same as malloc(size). - * - * Note that this has to be zero-initialized, otherwise we won't know which __fd_table entries are populated. 
- * */ - __fd_table = EXPECT_NONNULL(realloc(__fd_table, new_fd_table_capacity * sizeof(*__fd_table))); - memset(__fd_table + __fd_table_capacity, 0, (new_fd_table_capacity - __fd_table_capacity) * sizeof(*__fd_table)); - - /* Special case going from 0 to n. Must initialize process-global AT_FDCWD */ - if (__fd_table_capacity == 0) { - /* - * Initial AT_FDCWD doesn't need a dirfd; it didn't come from anywhere. - * Usually, the target of an fd is the path relative to the dirfd. - * However, the very initial AT_FDCWD is simply inherited from the working directory of the parent process. - * - * Recording the working directory implies that the execution depends on the value of the working directory, which is not necessarily true. - * For example, the program `cat ./foo-bar` does not depend on the value of the working directory; - * Only $(which cat) and ./foo-bar. - * We can launch that program from any directory containing ./foo-bar provided our system has $(which cat) and the same env. - * This program is "relocatable". - * On the other hand, a program like `realpath .` _does_ depend on the value of the working directory. - * We will only record the working directory in the latter case. - * - * Therefore, we set dirfd = 0 and path = "". - * */ - __fd_table[__map_fd(AT_FDCWD)].dirfd = 0; - __fd_table[__map_fd(AT_FDCWD)].dirfd_version = 0; - __fd_table[__map_fd(AT_FDCWD)].fd = AT_FDCWD; - __fd_table[__map_fd(AT_FDCWD)].version = 0; - __fd_table[__map_fd(AT_FDCWD)].path = strndup("", PATH_MAX); - /* __fd_table[__map_fd(AT_FDCWD)].inode_triple = get_inode_triple(AT_FDCWD, ""); */ - - /* - * Set up default stdin, stderr, stdout - * */ - __fd_table[__map_fd(STDIN_FILENO)].dirfd = AT_FDCWD; - __fd_table[__map_fd(STDIN_FILENO)].dirfd_version = 0; - __fd_table[__map_fd(STDIN_FILENO)].fd = STDIN_FILENO; - __fd_table[__map_fd(STDIN_FILENO)].version = 0; - __fd_table[__map_fd(STDIN_FILENO)].path = strndup("/dev/stdin", PATH_MAX); - /* __fd_table[__map_fd(STDIN_FILENO)].inode_triple = get_inode_triple(AT_FDCWD, "/dev/stdin"); */ - - __fd_table[__map_fd(STDOUT_FILENO)].dirfd = AT_FDCWD; - __fd_table[__map_fd(STDOUT_FILENO)].dirfd_version = 0; - __fd_table[__map_fd(STDOUT_FILENO)].fd = STDOUT_FILENO; - __fd_table[__map_fd(STDOUT_FILENO)].version = 0; - __fd_table[__map_fd(STDOUT_FILENO)].path = strndup("/dev/stdout", PATH_MAX); - /* __fd_table[__map_fd(STDOUT_FILENO)].inode_triple = get_inode_triple(AT_FDCWD, "/dev/stdout"); */ - - __fd_table[__map_fd(STDERR_FILENO)].dirfd = AT_FDCWD; - __fd_table[__map_fd(STDERR_FILENO)].dirfd_version = 0; - __fd_table[__map_fd(STDERR_FILENO)].fd = STDERR_FILENO; - __fd_table[__map_fd(STDERR_FILENO)].version = 0; - __fd_table[__map_fd(STDERR_FILENO)].path = strndup("/dev/stderr", PATH_MAX); - /* __fd_table[__map_fd(STDERR_FILENO)].inode_triple = get_inode_triple(AT_FDCWD, "/dev/stderr"); */ - } - - __fd_table_capacity = new_fd_table_capacity; - } - assert(0 <= mapped_fd && mapped_fd < __fd_table_capacity); -} - -/* - * This is borrowed because the lifetime of normalized path will be bound by the lifetime of Op in the Op buffer - * But the lifetime of our copy of it is bound by the lifetime of fd_table - * */ -static void fd_table_associate(int fd, int dirfd, BORROWED const char* path, struct InodeTriple inode_triple) { - DEBUG("fd_table: %d = openat(%d, \"%s\")", fd, dirfd, path); - fd = __map_fd(fd); - dirfd = __map_fd(dirfd); - EXPECT(== 0, pthread_rwlock_wrlock(&__fd_table_lock)); - __fd_table_ensure_capacity(fd); - /* - * Somehow, __fd_table[fd].path assertion does not always pass. 
- * This means open returned a file descriptor that we thought was already used by a previous open. - * Perhaps this can happen across exec boundaries? - * But it seems harmless to ignore this case for now. - * TODO: remove fds that are not marked with CLOEXEC after a successful execve. - * */ - /* assert(!__fd_table[fd].path); */ - /* This allocation is freed by fd_table_close if the tracee properly closes this file or never freed otherwise. - * The tracee would likely run out of FDs if they didn't close their files. */ - __fd_table[fd].path = EXPECT_NONNULL(strndup(path, PATH_MAX)); - __fd_table[fd].dirfd = __unmap_fd(dirfd); - /* Capture dirfd version before doing version++ - * Just in case fd == dirfd, as in chdir("foo") */ - __fd_table[fd].dirfd_version = __fd_table[dirfd].version; - __fd_table[fd].fd = __unmap_fd(fd); - /* __fd_table[fd].inode_triple = inode_triple; */ - __fd_table[fd].version++; - EXPECT(== 0, pthread_rwlock_unlock(&__fd_table_lock)); -} - -static void fd_table_close(int fd) { - DEBUG("fd_table: close(%d /* = openat(%d, \"%s\") */)", fd, __fd_table[__map_fd(fd)].dirfd, __fd_table[__map_fd(fd)].path); - fd = __map_fd(fd); - EXPECT(== 0, pthread_rwlock_wrlock(&__fd_table_lock)); - assert(0 <= fd && fd < __fd_table_capacity && __fd_table[fd].path); - free((char*) __fd_table[fd].path); - __fd_table[fd].path = NULL; - EXPECT(== 0, pthread_rwlock_unlock(&__fd_table_lock)); -} - -static size_t fd_table_size() { - EXPECT(== 0, pthread_rwlock_rdlock(&__fd_table_lock)); - int ret = __fd_table_capacity; - EXPECT(== 0, pthread_rwlock_unlock(&__fd_table_lock)); - return (ret == 0) ? 0 : __unmap_fd(ret); -} - -static bool fd_table_is_used(int fd) { - fd = __map_fd(fd); - EXPECT(== 0, pthread_rwlock_rdlock(&__fd_table_lock)); - assert(0 <= fd && fd < __fd_table_capacity && __fd_table[fd].fd == __unmap_fd(fd)); - bool ret = (bool) __fd_table[fd].path; - EXPECT(== 0, pthread_rwlock_unlock(&__fd_table_lock)); - return ret; -} - -void fd_table_dup(int oldfd, int newfd) { - DEBUG("fd_table: dup2(%d, %d)", oldfd, newfd); - oldfd = __map_fd(oldfd); - newfd = __map_fd(newfd); - EXPECT(== 0, pthread_rwlock_wrlock(&__fd_table_lock)); - assert(0 <= oldfd && oldfd < __fd_table_capacity && __fd_table[oldfd].path && __fd_table[oldfd].fd == __unmap_fd(oldfd)); - __fd_table_ensure_capacity(newfd); - assert(0 <= newfd && !__fd_table[newfd].path); - /* This allocation is freed by fd_table_close if the tracee properly closes this file or never freed otherwise. - * The tracee would likely run out of FDs if they didn't close their files. 
*/ - __fd_table[newfd].path = EXPECT_NONNULL(strndup(__fd_table[oldfd].path, PATH_MAX)); - __fd_table[newfd].dirfd = __fd_table[oldfd].dirfd; - __fd_table[newfd].dirfd_version = __fd_table[oldfd].dirfd_version; - __fd_table[newfd].fd = __unmap_fd(newfd); - /* __fd_table[newfd].inode_triple = __fd_table[oldfd].inode_triple; */ - EXPECT(== 0, pthread_rwlock_unlock(&__fd_table_lock)); -} diff --git a/probe_src/performance_test.py b/probe_src/performance_test.py deleted file mode 100755 index 9e4845c2..00000000 --- a/probe_src/performance_test.py +++ /dev/null @@ -1,171 +0,0 @@ -import subprocess -import shlex -import datetime -import csv -import time -import os -import shutil -import resource -from dataclasses import dataclass -from typing import Any -import errno -from pathlib import Path - -@dataclass -class Result: - returncode: int - stdout: str - stderr: str - duration: float - rusage: resource.struct_rusage - -PROBE_LOG = Path("probe_log") -PROBE_RECORD_DIR = Path("probe_record") - -class ResourcePopen(subprocess.Popen): - def _try_wait(self, wait_flags): - try: - (pid, sts, res) = os.wait4(self.pid, wait_flags) - except OSError as e: - if e.errno != errno.ECHILD: - raise - pid = self.pid - sts = 0 - else: - self.rusage = res - return (pid, sts) - -def resource_call( - *popenargs: Any, - timeout: int | None = None, - **kwargs: Any, -) -> Result: - with ResourcePopen(*popenargs, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=False, **kwargs) as p: - start = datetime.datetime.now() - try: - stdout, stderr = p.communicate(timeout=timeout) - except: - p.kill() - stdout, stderr = p.communicate() - raise - stop = datetime.datetime.now() - return Result(p.returncode, stdout.decode(), stderr.decode(), (stop - start).total_seconds(), p.rusage) - -DELAY = 0.0 - -def cleanup(): - if PROBE_LOG.exists(): - PROBE_LOG.unlink() - if PROBE_RECORD_DIR.exists(): - shutil.rmtree(PROBE_RECORD_DIR) - time.sleep(DELAY) - -def benchmark_command(command: list[str], warmup_iterations: int, benchmark_iterations: int, transcribe_flag: bool) -> list[Result]: - results = [] - - for _ in range(warmup_iterations): - print(f" Running warmup command: {shlex.join(command)}") - cleanup() - proc = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - if proc.returncode != 0: - print(" Returned non-zero") - print(proc.stdout.decode()) - print(proc.stderr.decode()) - - for _ in range(benchmark_iterations): - cleanup() - print(f" Running process with command: {shlex.join(command)}") - result = resource_call(command) - if result.returncode != 0: - print(" Returned non-zero") - - results.append(result) - time.sleep(DELAY) - - if transcribe_flag: - print(f" Running probe transcribe -i {PROBE_RECORD_DIR} -o {PROBE_LOG}") - transcribe_result = resource_call(["probe", "transcribe", "-i", str(PROBE_RECORD_DIR), "-o", str(PROBE_LOG)]) - if result.returncode != 0: - print(" Transcribe returned non-zero") - results.append(transcribe_result) - time.sleep(DELAY) - - return results - -def write_results_to_csv(writer, command_to_run, phase, results): - for idx, result in enumerate(results, start=1): - rusage = result.rusage - writer.writerow({ - 'Command': command_to_run, - 'Phase': phase, - 'Return Code': result.returncode, - 'Duration': result.duration, - 'ru_utime': f"{rusage.ru_utime:.6f}", - 'ru_stime': f"{rusage.ru_stime:.6f}", - 'ru_maxrss': rusage.ru_maxrss, - 'ru_ixrss': rusage.ru_ixrss, - 'ru_idrss': rusage.ru_idrss, - 'ru_isrss': rusage.ru_isrss, - 'ru_minflt': rusage.ru_minflt, - 'ru_majflt': 
rusage.ru_majflt, - 'ru_nswap': rusage.ru_nswap, - 'ru_inblock': rusage.ru_inblock, - 'ru_oublock': rusage.ru_oublock, - 'ru_msgsnd': rusage.ru_msgsnd, - 'ru_msgrcv': rusage.ru_msgrcv, - 'ru_nsignals': rusage.ru_nsignals, - 'ru_nvcsw': rusage.ru_nvcsw, - 'ru_nivcsw': rusage.ru_nivcsw - }) - -def benchmark_with_transcription(commands_to_run: list[list[str]], warmup_count: int, benchmark_count: int): - with open('benchmark_results.csv', mode='w', newline='') as csv_file: - fieldnames = [ - 'Command', 'Phase', 'Return Code', 'Duration', - 'ru_utime', 'ru_stime', 'ru_maxrss', 'ru_ixrss', 'ru_idrss', 'ru_isrss', - 'ru_minflt', 'ru_majflt', 'ru_nswap', 'ru_inblock', 'ru_oublock', - 'ru_msgsnd', 'ru_msgrcv', 'ru_nsignals', 'ru_nvcsw', 'ru_nivcsw' - ] - writer = csv.DictWriter(csv_file, fieldnames=fieldnames) - writer.writeheader() - - for command_args in commands_to_run: - print(f"Benchmarking: {shlex.join(command_args)}") - - - print(f" Running benchmark for command (No PROBE): {shlex.join(command_args)}") - transcribe_flag = False - no_probe_results = benchmark_command(command_args, warmup_count, benchmark_count, transcribe_flag) - write_results_to_csv(writer, shlex.join(command_args), 'No PROBE', no_probe_results) - - cleanup() - - record_command_args = ["probe", "record"] + command_args - print(f" Running benchmark for command (Record): {shlex.join(record_command_args)}") - record_results = benchmark_command(record_command_args, warmup_count, benchmark_count, transcribe_flag) - write_results_to_csv(writer, shlex.join(command_args), 'Record', record_results) - - cleanup() - - transcribe_flag = True - no_transcribe_args= ["probe", "record", "--no-transcribe"] + command_args - print(f" Running benchmark for command probe no-transcribe: {shlex.join(no_transcribe_args)}") - probe_results = benchmark_command(no_transcribe_args, warmup_count, benchmark_count, transcribe_flag) - write_results_to_csv(writer, shlex.join(command_args), 'no-transcribe', probe_results) - - cleanup() - -if __name__ == "__main__": - commands = [ - ["ls", "-l"], - ["echo", "Hello World"], - ["pwd"], - ["sh", "-c", "cd probe_src/tests/c && gcc hello_world.c -o hello_world.exe && ./hello_world.exe"], - ["sh", "-c", "cd probe_src/tests/c && gcc createFile.c -o createFile.exe -lpthread && ./createFile.exe"], - ["python3", "-c", "import sys; sys.stdout.write('hello world')"], - ["date"], - ["uptime"], - ] - - os.chdir(Path(__file__).resolve().parent.parent) - benchmark_with_transcription(commands, warmup_count=1, benchmark_count=4) diff --git a/probe_src/python/README.md b/probe_src/python/README.md deleted file mode 100644 index a68a2987..00000000 --- a/probe_src/python/README.md +++ /dev/null @@ -1,5 +0,0 @@ -# probe_py - -probe_py is a package that implements CLI functionality of PROBE and Python library functionality of PROBE. 
- -Required reading: diff --git a/probe_src/python/probe_py/manual/graph_utils.py b/probe_src/python/probe_py/manual/graph_utils.py deleted file mode 100644 index 4d629dd7..00000000 --- a/probe_src/python/probe_py/manual/graph_utils.py +++ /dev/null @@ -1,13 +0,0 @@ -import pathlib -import networkx # type: ignore - - -def serialize_graph( - graph: networkx.Graph, - output: pathlib.Path, -) -> None: - pydot_graph = networkx.drawing.nx_pydot.to_pydot(graph) - if output.suffix == ".dot": - pydot_graph.write_raw(output) - else: - pydot_graph.write_png(output) diff --git a/probe_src/tests/c/README.md b/probe_src/tests/c/README.md deleted file mode 100644 index e69de29b..00000000 diff --git a/probe_src/tests/test_scp_argparse.py b/probe_src/tests/test_scp_argparse.py deleted file mode 100644 index 4ef3b95d..00000000 --- a/probe_src/tests/test_scp_argparse.py +++ /dev/null @@ -1,34 +0,0 @@ -import pathlib -from probe_py.manual.remote_access import Host, HostPath -from probe_py.manual.scp import parse_scp_args - - -def test_parse_scp_args() -> None: - assert parse_scp_args(["test.txt", "host:", "user@host:", "host:test.txt", "user@host:test.txt", "test.txt"]) == ( - [ - HostPath( - Host(network_name=None, username=None, ssh_options=[], scp_options=[]), - path=pathlib.Path("test.txt"), - ), - HostPath( - Host(network_name="host", username=None, ssh_options=[], scp_options=[]), - path=pathlib.Path(), - ), - HostPath( - Host(network_name="host", username="user", ssh_options=[], scp_options=[]), - path=pathlib.Path(), - ), - HostPath( - Host(network_name="host", username=None, ssh_options=[], scp_options=[]), - path=pathlib.Path("test.txt"), - ), - HostPath( - Host(network_name="host", username="user", ssh_options=[], scp_options=[]), - path=pathlib.Path("test.txt"), - ), - ], - HostPath( - host=Host(network_name=None, username=None, ssh_options=[], scp_options=[]), - path=pathlib.Path("test.txt") - ), - ) diff --git a/reproducibility_tests/.gitignore b/reproducibility_tests/.gitignore deleted file mode 100644 index 9a1a6a2d..00000000 --- a/reproducibility_tests/.gitignore +++ /dev/null @@ -1 +0,0 @@ -test_files/ diff --git a/setup_devshell.sh b/setup_devshell.sh index 419703a4..7dea7d4b 100644 --- a/setup_devshell.sh +++ b/setup_devshell.sh @@ -4,34 +4,30 @@ red='\033[0;31m' clr='\033[0m' project_root="$(dirname "$(realpath "${BASH_SOURCE[0]}")")" +printf "project_root = %s\n" "$project_root" # Rust frontend uses CPATH to find libprobe headers -export CPATH="$project_root/probe_src/libprobe/include:$CPATH" +export CPATH="$project_root/libprobe/include:$CPATH" # Rust CLI uses __PROBE_LIB to find libprobe binary -export __PROBE_LIB="$project_root/probe_src/libprobe/build" +export __PROBE_LIB="$project_root/libprobe/build" # Ensure libprobe.so gets built if [ ! -f "$__PROBE_LIB/libprobe.so" ]; then - echo -e "${red}Please run 'just compile-lib' to compile libprobe${clr}" + printf "%sPlease run 'just compile-lib' to compile libprobe%s\n" "$red" "$clr" fi # Rust code uses PYGEN_OUTFILE to determine where to write this file. -# TODO: Replace this with a static path, because it is never not this path. -export PYGEN_OUTFILE="$project_root/probe_src/frontend/python/probe_py/generated/ops.py" +export PYGEN_OUTFILE="$project_root/probe_py/probe_py/ops.py" # Ensure PROBE CLI gets built -if [ ! -f $project_root/probe_src/frontend/target/release/probe ]; then - echo -e "${red}Please run 'just compile-cli' to compile probe binary${clr}" +if [ ! 
-f "$project_root/cli-wrapper/target/release/probe" ]; then + printf "%sPlease run 'just compile-cli' to compile probe binary%s\n" "$red" "$clr" fi # Add PROBE CLI to path -export PATH="$project_root/probe_src/frontend/target/release:$PATH" +export PATH="$project_root/cli-wrapper/target/release:$PATH" -# Add probe_py.generated to the Python path -export PYTHONPATH="$project_root/probe_src/frontend/python:$PYTHONPATH" -export MYPYPATH="$project_root/probe_src/frontend/python:$MYPYPATH" - -# Add probe_py.manual to the Python path -export PYTHONPATH="$project_root/probe_src/python:$PYTHONPATH" -export MYPYPATH="$project_root/probe_src/python:$MYPYPATH" +# Add probe_py to the Python path +export PYTHONPATH="$project_root/probe_py/:$PYTHONPATH" +export MYPYPATH="$project_root/probe_py/mypy_stubs:$project_root/probe_py/:$MYPYPATH" diff --git a/test/ssh_wrapper_test/test_ssh_arg_parse.py b/test/ssh_wrapper_test/test_ssh_arg_parse.py deleted file mode 100644 index bf2d335a..00000000 --- a/test/ssh_wrapper_test/test_ssh_arg_parse.py +++ /dev/null @@ -1,30 +0,0 @@ -import sys -from probe_py.manual.ssh_argparser import parse_ssh_args - -# List of test cases -test_cases = [ - (['-v'], (['-v'], None, [])), - (['-p', '22'], (['-p', '22'], None, [])), - (['-v', '-A', '-q'], (['-v', '-A', '-q'], None, [])), - (['-p', '22', 'user@host.com'], (['-p', '22'], 'user@host.com', [])), - (['user@host.com', 'uptime'], ([], 'user@host.com', ['uptime'])), - (['-p', '22', 'user@host.com', 'ls', '-la'], (['-p', '22'], 'user@host.com', ['ls', '-la'])), - (['-A', 'user@host.com', 'echo', '"Hello World"'], (['-A'], 'user@host.com', ['echo', '"Hello World"'])), - (['-o', 'StrictHostKeyChecking=no', 'user@host.com'], (['-o', 'StrictHostKeyChecking=no'], 'user@host.com', [])), - (['-v', '-p', '22', '-A', 'user@host.com', 'uptime'], (['-v', '-p', '22', '-A'], 'user@host.com', ['uptime'])) -] - -def run_test_cases(): - for i, (input_args, expected_output) in enumerate(test_cases): - result = parse_ssh_args(input_args) - if result == expected_output: - print(f"Test case {i+1} passed!") - else: - print(f"Test case {i+1} failed!") - print(f"Input: {input_args}") - print(f"Expected: {expected_output}") - print(f"Got: {result}") - - -if __name__ == "__main__": - run_test_cases() diff --git a/probe_src/tests/.gitignore b/tests/.gitignore similarity index 100% rename from probe_src/tests/.gitignore rename to tests/.gitignore diff --git a/probe_src/tests/c/.gitignore b/tests/examples/.gitignore similarity index 100% rename from probe_src/tests/c/.gitignore rename to tests/examples/.gitignore diff --git a/probe_src/tests/c/Makefile b/tests/examples/Makefile similarity index 100% rename from probe_src/tests/c/Makefile rename to tests/examples/Makefile diff --git a/probe_src/tests/c/createFile.c b/tests/examples/createFile.c similarity index 98% rename from probe_src/tests/c/createFile.c rename to tests/examples/createFile.c index 7aa1d64c..b32f65f0 100644 --- a/probe_src/tests/c/createFile.c +++ b/tests/examples/createFile.c @@ -1,8 +1,8 @@ #include #include #include -#include -#include +#include +#include #include #define NUM_THREADS 3 @@ -123,4 +123,3 @@ int main() { pthread_exit(NULL); } - diff --git a/probe_src/tests/example_df_graph.bash b/tests/examples/example_df_graph.bash similarity index 100% rename from probe_src/tests/example_df_graph.bash rename to tests/examples/example_df_graph.bash diff --git a/probe_src/tests/c/hello_world.c b/tests/examples/hello_world.c similarity index 99% rename from probe_src/tests/c/hello_world.c 
rename to tests/examples/hello_world.c index 43490cca..60b21937 100644 --- a/probe_src/tests/c/hello_world.c +++ b/tests/examples/hello_world.c @@ -35,4 +35,3 @@ int main() { printf("Main: program exiting.\n"); pthread_exit(NULL); } - diff --git a/probe_src/tests/c/mutex.c b/tests/examples/mutex.c similarity index 100% rename from probe_src/tests/c/mutex.c rename to tests/examples/mutex.c diff --git a/probe_src/tests/c/simple.c b/tests/examples/simple.c similarity index 100% rename from probe_src/tests/c/simple.c rename to tests/examples/simple.c diff --git a/reproducibility_tests/test_determinism.cxx b/tests/examples/test_determinism.cxx similarity index 100% rename from reproducibility_tests/test_determinism.cxx rename to tests/examples/test_determinism.cxx diff --git a/reproducibility_tests/test.sh b/tests/examples/test_determinism.sh similarity index 100% rename from reproducibility_tests/test.sh rename to tests/examples/test_determinism.sh diff --git a/tests/lightweight_env.sh b/tests/lightweight_env.sh new file mode 100755 index 00000000..19d9321b --- /dev/null +++ b/tests/lightweight_env.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash + +# nix develop brings in a ton of stuff to the env +# which complicates testing probe +# To simplify, use this script. + +project_root="$(dirname "$(dirname "$(realpath "${BASH_SOURCE[0]}")")")" + +path="$project_root/cli-wrapper/target/release" + +env - __PROBE_LIB="$__PROBE_LIB" PATH="$path" "${@}" diff --git a/tests/performance_test.py b/tests/performance_test.py new file mode 100755 index 00000000..8e14a84c --- /dev/null +++ b/tests/performance_test.py @@ -0,0 +1,250 @@ +import subprocess +import shlex +import datetime +import csv +import time +import os +import shutil +import resource +import dataclasses +import typing +import errno +import pathlib + + +@dataclasses.dataclass +class Result: + returncode: int + stdout: str + stderr: str + duration: float + rusage: resource.struct_rusage + + +PROBE_LOG = pathlib.Path("probe_log") +PROBE_RECORD_DIR = pathlib.Path("probe_record") + + +class ResourcePopen(subprocess.Popen[bytes]): + def _try_wait(self, wait_flags: int) -> tuple[int, int]: + try: + (pid, sts, res) = os.wait4(self.pid, wait_flags) + except OSError as e: + if e.errno != errno.ECHILD: + raise + pid = self.pid + sts = 0 + else: + self.rusage = res + return (pid, sts) + + +def resource_call( + popenargs: typing.Sequence[str], + timeout: float | None = None, +) -> Result: + with ResourcePopen(popenargs, stdout=subprocess.PIPE, stderr=subprocess.PIPE) as p: + start = datetime.datetime.now() + try: + stdout, stderr = p.communicate(timeout=timeout) + except: + p.kill() + stdout, stderr = p.communicate() + raise + stop = datetime.datetime.now() + return Result( + p.returncode, + stdout.decode(), + stderr.decode(), + (stop - start).total_seconds(), + p.rusage, + ) + + +DELAY = 0.0 + + +def cleanup() -> None: + if PROBE_LOG.exists(): + PROBE_LOG.unlink() + if PROBE_RECORD_DIR.exists(): + shutil.rmtree(PROBE_RECORD_DIR) + time.sleep(DELAY) + + +def benchmark_command( + command: list[str], + warmup_iterations: int, + benchmark_iterations: int, + transcribe_flag: bool, +) -> list[Result]: + results = [] + + for _ in range(warmup_iterations): + print(f" Running warmup command: {shlex.join(command)}") + cleanup() + proc = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + if proc.returncode != 0: + print(" Returned non-zero") + print(proc.stdout.decode()) + print(proc.stderr.decode()) + + for _ in range(benchmark_iterations): + 
+import subprocess
+import shlex
+import datetime
+import csv
+import time
+import os
+import shutil
+import resource
+import dataclasses
+import typing
+import errno
+import pathlib
+
+
+@dataclasses.dataclass
+class Result:
+    returncode: int
+    stdout: str
+    stderr: str
+    duration: float
+    rusage: resource.struct_rusage
+
+
+PROBE_LOG = pathlib.Path("probe_log")
+PROBE_RECORD_DIR = pathlib.Path("probe_record")
+
+
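+# subprocess.run() does not expose the child's resource usage, so this subclass
+# overrides Popen._try_wait() (the private CPython hook used by wait()) to reap
+# the child with os.wait4(), which also yields the rusage reported below.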
+class ResourcePopen(subprocess.Popen[bytes]):
+    def _try_wait(self, wait_flags: int) -> tuple[int, int]:
+        try:
+            (pid, sts, res) = os.wait4(self.pid, wait_flags)
+        except OSError as e:
+            if e.errno != errno.ECHILD:
+                raise
+            # The child was already reaped; no rusage is available.
+            pid = self.pid
+            sts = 0
+        else:
+            self.rusage = res
+        return (pid, sts)
+
+
+def resource_call(
+    popenargs: typing.Sequence[str],
+    timeout: float | None = None,
+) -> Result:
+    with ResourcePopen(popenargs, stdout=subprocess.PIPE, stderr=subprocess.PIPE) as p:
+        start = datetime.datetime.now()
+        try:
+            stdout, stderr = p.communicate(timeout=timeout)
+        except:
+            p.kill()
+            stdout, stderr = p.communicate()
+            raise
+        stop = datetime.datetime.now()
+        return Result(
+            p.returncode,
+            stdout.decode(),
+            stderr.decode(),
+            (stop - start).total_seconds(),
+            p.rusage,
+        )
+
+
+DELAY = 0.0
+
+
+def cleanup() -> None:
+    if PROBE_LOG.exists():
+        PROBE_LOG.unlink()
+    if PROBE_RECORD_DIR.exists():
+        shutil.rmtree(PROBE_RECORD_DIR)
+    time.sleep(DELAY)
+
+
+def benchmark_command(
+    command: list[str],
+    warmup_iterations: int,
+    benchmark_iterations: int,
+    transcribe_flag: bool,
+) -> list[Result]:
+    results = []
+
+    for _ in range(warmup_iterations):
+        print(f"  Running warmup command: {shlex.join(command)}")
+        cleanup()
+        proc = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        if proc.returncode != 0:
+            print("  Returned non-zero")
+            print(proc.stdout.decode())
+            print(proc.stderr.decode())
+
+    for _ in range(benchmark_iterations):
+        cleanup()
+        print(f"  Running process with command: {shlex.join(command)}")
+
+        result = resource_call(command)
+        if result.returncode != 0:
+            print("  Returned non-zero")
+
+        results.append(result)
+        time.sleep(DELAY)
+
+        if transcribe_flag:
+            print(f"  Running probe transcribe -i {PROBE_RECORD_DIR} -o {PROBE_LOG}")
+            transcribe_result = resource_call(
+                [
+                    "probe",
+                    "transcribe",
+                    "-i",
+                    str(PROBE_RECORD_DIR),
+                    "-o",
+                    str(PROBE_LOG),
+                ]
+            )
+            if transcribe_result.returncode != 0:
+                print("  Transcribe returned non-zero")
+            results.append(transcribe_result)
+            time.sleep(DELAY)
+
+    return results
+
+
+def write_results_to_csv(
+    writer: csv.DictWriter[str],
+    command_to_run: str,
+    phase: str,
+    results: list[Result],
+) -> None:
+    for result in results:
+        rusage = result.rusage
+        writer.writerow(
+            {
+                "Command": command_to_run,
+                "Phase": phase,
+                "Return Code": result.returncode,
+                "Duration": result.duration,
+                "ru_utime": f"{rusage.ru_utime:.6f}",
+                "ru_stime": f"{rusage.ru_stime:.6f}",
+                "ru_maxrss": rusage.ru_maxrss,
+                "ru_ixrss": rusage.ru_ixrss,
+                "ru_idrss": rusage.ru_idrss,
+                "ru_isrss": rusage.ru_isrss,
+                "ru_minflt": rusage.ru_minflt,
+                "ru_majflt": rusage.ru_majflt,
+                "ru_nswap": rusage.ru_nswap,
+                "ru_inblock": rusage.ru_inblock,
+                "ru_oublock": rusage.ru_oublock,
+                "ru_msgsnd": rusage.ru_msgsnd,
+                "ru_msgrcv": rusage.ru_msgrcv,
+                "ru_nsignals": rusage.ru_nsignals,
+                "ru_nvcsw": rusage.ru_nvcsw,
+                "ru_nivcsw": rusage.ru_nivcsw,
+            }
+        )
+
+
+def benchmark_with_transcription(
+    commands_to_run: list[list[str]], warmup_count: int, benchmark_count: int
+) -> None:
+    with open("benchmark_results.csv", mode="w", newline="") as csv_file:
+        fieldnames = [
+            "Command",
+            "Phase",
+            "Return Code",
+            "Duration",
+            "ru_utime",
+            "ru_stime",
+            "ru_maxrss",
+            "ru_ixrss",
+            "ru_idrss",
+            "ru_isrss",
+            "ru_minflt",
+            "ru_majflt",
+            "ru_nswap",
+            "ru_inblock",
+            "ru_oublock",
+            "ru_msgsnd",
+            "ru_msgrcv",
+            "ru_nsignals",
+            "ru_nvcsw",
+            "ru_nivcsw",
+        ]
+        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
+        writer.writeheader()
+
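+        # Each command is timed in three phases: a bare baseline ("No PROBE"),
+        # recording with default transcription ("Record"), and recording with
+        # --no-transcribe plus a separately timed transcription ("no-transcribe").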
+        for command_args in commands_to_run:
+            print(f"Benchmarking: {shlex.join(command_args)}")
+
+            print(
+                f"  Running benchmark for command (No PROBE): {shlex.join(command_args)}"
+            )
+            transcribe_flag = False
+            no_probe_results = benchmark_command(
+                command_args, warmup_count, benchmark_count, transcribe_flag
+            )
+            write_results_to_csv(
+                writer, shlex.join(command_args), "No PROBE", no_probe_results
+            )
+
+            cleanup()
+
+            record_command_args = ["probe", "record"] + command_args
+            print(
+                f"  Running benchmark for command (Record): {shlex.join(record_command_args)}"
+            )
+            record_results = benchmark_command(
+                record_command_args, warmup_count, benchmark_count, transcribe_flag
+            )
+            write_results_to_csv(
+                writer, shlex.join(command_args), "Record", record_results
+            )
+
+            cleanup()
+
+            transcribe_flag = True
+            no_transcribe_args = ["probe", "record", "--no-transcribe"] + command_args
+            print(
+                f"  Running benchmark for command probe no-transcribe: {shlex.join(no_transcribe_args)}"
+            )
+            probe_results = benchmark_command(
+                no_transcribe_args, warmup_count, benchmark_count, transcribe_flag
+            )
+            write_results_to_csv(
+                writer, shlex.join(command_args), "no-transcribe", probe_results
+            )
+
+            cleanup()
+
+
+if __name__ == "__main__":
+    commands = [
+        ["ls", "-l"],
+        ["echo", "Hello World"],
+        ["pwd"],
+        [
+            "sh",
+            "-c",
+            "cd tests/examples && gcc hello_world.c -o hello_world.exe && ./hello_world.exe",
+        ],
+        [
+            "sh",
+            "-c",
+            "cd tests/examples && gcc createFile.c -o createFile.exe -lpthread && ./createFile.exe",
+        ],
+        ["python3", "-c", "import sys; sys.stdout.write('hello world')"],
+        ["date"],
+        ["uptime"],
+    ]
+
+    os.chdir(pathlib.Path(__file__).resolve().parent.parent)
+    benchmark_with_transcription(commands, warmup_count=1, benchmark_count=4)
diff --git a/test/ssh_wrapper_test/openssh-server.py b/tests/ssh_wrapper_test/openssh-server.py
similarity index 56%
rename from test/ssh_wrapper_test/openssh-server.py
rename to tests/ssh_wrapper_test/openssh-server.py
index 2e48c9e9..38f06854 100644
--- a/test/ssh_wrapper_test/openssh-server.py
+++ b/tests/ssh_wrapper_test/openssh-server.py
@@ -2,24 +2,30 @@
 import threading
 import os
 
-def run_openssh_server():
+
+def run_openssh_server() -> None:
     os.chdir("./openssh-server/")
-    process = subprocess.Popen("docker compose up -d", shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+    process = subprocess.Popen(
+        "docker compose up -d",
+        shell=True,
+        stdout=subprocess.DEVNULL,
+        stderr=subprocess.DEVNULL,
+    )
 
-    print('''
+    print("""
     OpenSSH server is up:
         port = 2222
        destination = sshwrapper@localhost
 
    Press "q" and Enter to stop the server.
-    ''')
+    """)
 
-    def wait_for_input():
+    def wait_for_input() -> None:
         while True:
-            user_input = input()
-            if user_input.strip().lower() == 'q':
+            user_input = input()
+            if user_input.strip().lower() == "q":
                 print("Stopping OpenSSH server...")
-                process.terminate()
-                process.wait()
+                process.terminate()
+                process.wait()
                 print("OpenSSH server stopped.")
                 break
@@ -28,5 +34,6 @@ def wait_for_input():
 
     input_thread.join()
 
+
 if __name__ == "__main__":
     run_openssh_server()
diff --git a/test/ssh_wrapper_test/openssh-server/docker-compose.yml b/tests/ssh_wrapper_test/openssh-server/docker-compose.yml
similarity index 100%
rename from test/ssh_wrapper_test/openssh-server/docker-compose.yml
rename to tests/ssh_wrapper_test/openssh-server/docker-compose.yml
diff --git a/probe_src/tests/docker_os_matrix.py b/tests/test_docker_os_matrix.py
similarity index 58%
rename from probe_src/tests/docker_os_matrix.py
rename to tests/test_docker_os_matrix.py
index 24f9edb4..1a31c2e2 100644
--- a/probe_src/tests/docker_os_matrix.py
+++ b/tests/test_docker_os_matrix.py
@@ -6,6 +6,7 @@
 import shlex
 import pathlib
 import asyncio
+import pytest
 
-project_root = pathlib.Path(__file__).resolve().parent.parent.parent
+project_root = pathlib.Path(__file__).resolve().parent.parent
 
@@ -15,35 +16,39 @@
 def as_completed_with_concurrency(
-    n: int,
-    coros: typing.Iterable[collections.abc.Awaitable[_T]],
+    n: int,
+    coros: typing.Iterable[collections.abc.Awaitable[_T]],
 ) -> typing.Iterator[asyncio.Future[_T]]:
     semaphore = asyncio.Semaphore(n)
+
     async def sem_coro(coro: collections.abc.Awaitable[_T]) -> _T:
         async with semaphore:
             return await coro
+
     return asyncio.as_completed([sem_coro(c) for c in coros])
 
 
 async def run_in_docker(
-    name: str,
-    image: str,
-    tag: str,
-    script: list[list[list[str]]],
-    test: list[list[str]],
-    capture_output: bool,
-    clean: bool,
+    name: str,
+    image: str,
+    tag: str,
+    script: list[list[list[str]]],
+    test: list[list[str]],
+    capture_output: bool,
+    clean: bool,
 ) -> tuple[str, bool, bytes, bytes]:
-    dockerfile = "\n".join([
-        f"FROM {image}:{tag}",
-        *[
-            "RUN " + " && ".join(
-                shlex.join(line).replace("double-pipe", "||")
-                for line in group
-            )
-            for group in script
-        ],
-    ])
+    dockerfile = "\n".join(
+        [
+            f"FROM {image}:{tag}",
+            *[
+                "RUN "
+                + " && ".join(
+                    shlex.join(line).replace("double-pipe", "||") for line in group
+                )
+                for group in script
+            ],
+        ]
+    )
     temp_dir = pathlib.Path(tempfile.mkdtemp())
     (temp_dir / "Dockerfile").write_text(dockerfile)
     proc = await asyncio.create_subprocess_exec(
@@ -61,11 +66,18 @@ async def run_in_docker(
     if proc.returncode != 0:
         return name, False, stdout, stderr
 
-    test_str = " && ".join(
-        shlex.join(line)
-        for line in test
-    )
-    args = ["podman", "run", "--rm", "--volume", f"{project_root}:{project_root}:ro", name, "bash", "-c", test_str]
+    test_str = " && ".join(shlex.join(line) for line in test)
+    args = [
+        "podman",
+        "run",
+        "--rm",
+        "--volume",
+        f"{project_root}:{project_root}:ro",
+        name,
+        "bash",
+        "-c",
+        test_str,
+    ]
     proc = await asyncio.create_subprocess_exec(
         *args,
         stdin=None,
@@ -86,11 +98,17 @@ async def run_in_docker(
 
 images = [
-    ("ubuntu", ["24.04"], [[
-        ["apt-get", "update"],
-        ["DEBIAN_FRONTEND=noninteractive", "apt-get", "install", "-y", "curl"],
-        ["rm", "--recursive", "--force", "/var/lib/apt/lists/*"]
-    ]]),
+    (
+        "ubuntu",
+        ["24.04"],
+        [
+            [
+                ["apt-get", "update"],
+                ["DEBIAN_FRONTEND=noninteractive", "apt-get", "install", "-y", "curl"],
+                ["rm", "--recursive", "--force", "/var/lib/apt/lists/*"],
+            ]
+        ],
+    ),
     # ("debian", ["8.0", "unstable-slim"], [[
     #     ["apt-get", "update"],
     #     ["DEBIAN_FRONTEND=noninteractive", "apt-get", "install", "-y", "curl"],
@@ -105,7 +123,17 @@
 script = [
     # shlex.quote("|") -> "'|'", which is wrong, so instead we will write the word pipe.
     [
-        ["curl", "--proto", "=https", "--tlsv1.2", "-sSf", "-o", "nix-installer", "-L", "https://install.determinate.systems/nix"],
+        [
+            "curl",
+            "--proto",
+            "=https",
+            "--tlsv1.2",
+            "-sSf",
+            "-o",
+            "nix-installer",
+            "-L",
+            "https://install.determinate.systems/nix",
+        ],
         ["sh", "nix-installer", "install", "linux", "--no-confirm", "--init", "none"],
     ],
     [
@@ -117,7 +145,14 @@
     [
         ["export", "USER=root"],
         [".", "/nix/var/nix/profiles/default/etc/profile.d/nix-daemon.sh"],
-        ["nix", "build", "-L", "github:charmoniumQ/PROBE#probe-bundled", "double-pipe", "true"],
+        [
+            "nix",
+            "build",
+            "-L",
+            "github:charmoniumQ/PROBE#probe-bundled",
+            "double-pipe",
+            "true",
+        ],
     ],
 ]
 
@@ -125,25 +160,28 @@
     ["export", "USER=root"],
     [".", "/nix/var/nix/profiles/default/etc/profile.d/nix-daemon.sh"],
     ["nix", "profile", "install", "-L", f"{project_root}#probe-bundled"],
-    ["probe", "record", "-f", "stat", "."]
+    ["probe", "record", "-f", "stat", "."],
 ]
 
 
-async def main(max_concurrency: int, capture_output: bool) -> bool:
-    results = as_completed_with_concurrency(max_concurrency, [
-        run_in_docker(
-            f"probe-{image}-{tag}",
-            image,
-            tag,
-            pre_script + script,
-            test,
-            capture_output,
-            clean=False,
-        )
-        for image, tags, pre_script in images
-        for tag in tags
-    ])
-    any_failed = False
+@pytest.mark.skip("This test takes a long time")
+async def test_docker(max_concurrency: int = 1, capture_output: bool = True) -> None:
+    results = as_completed_with_concurrency(
+        max_concurrency,
+        [
+            run_in_docker(
+                f"probe-{image}-{tag}",
+                image,
+                tag,
+                pre_script + script,
+                test,
+                capture_output,
+                clean=False,
+            )
+            for image, tags, pre_script in images
+            for tag in tags
+        ],
+    )
     for result in results:
         image, success, stdout, stderr = await result
         if not success:
@@ -151,11 +189,4 @@
             sys.stdout.buffer.write(stdout)
             sys.stderr.buffer.write(stderr)
             print("\n")
-            any_failed = True
-    return any_failed
-
-if asyncio.run(main(
-    max_concurrency=1,
-    capture_output=False,
-)):
-    sys.exit(1)
+            assert success, f"{image} failed"
diff --git a/tests/test_handoff_to_python.py b/tests/test_handoff_to_python.py
new file mode 100644
index 00000000..cece5cfd
--- /dev/null
+++ b/tests/test_handoff_to_python.py
@@ -0,0 +1,10 @@
+import shlex
+import subprocess
+
+
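+# The `probe` CLI is written in Rust (cli-wrapper), but some subcommands are
+# presumably implemented by the probe_py package; this checks that handoff works.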
f"{image} failed" diff --git a/tests/test_handoff_to_python.py b/tests/test_handoff_to_python.py new file mode 100644 index 00000000..cece5cfd --- /dev/null +++ b/tests/test_handoff_to_python.py @@ -0,0 +1,8 @@ +import shlex +import subprocess + + +def test_handoff() -> None: + cmd = ["probe", "validate", "--help"] + print(shlex.join(cmd)) + subprocess.run(cmd, check=True) diff --git a/probe_src/tests/test_path_stuff.py b/tests/test_path_stuff.py similarity index 72% rename from probe_src/tests/test_path_stuff.py rename to tests/test_path_stuff.py index 484f6c9d..40d3368d 100644 --- a/probe_src/tests/test_path_stuff.py +++ b/tests/test_path_stuff.py @@ -1,7 +1,4 @@ import shutil -import pytest -import pathlib -import shlex import subprocess @@ -9,8 +6,10 @@ nonexistent_command = "eugrhuerhuliaflsd" -def test_probe_nonexistent_command(): - assert shutil.which(nonexistent_command) is None, "please choose a nonexistent_command" +def test_probe_nonexistent_command() -> None: + assert ( + shutil.which(nonexistent_command) is None + ), "please choose a nonexistent_command" proc = subprocess.run( ["probe", "record", "-f", nonexistent_command], capture_output=True, @@ -20,7 +19,7 @@ def test_probe_nonexistent_command(): assert b"SIGSEGV" not in proc.stderr -def test_probe_empty_path(): +def test_probe_empty_path() -> None: proc = subprocess.run( ["probe", "record", "-f", "env", "PATH=", nonexistent_command], capture_output=True, diff --git a/probe_src/tests/test_integration.py b/tests/test_record.py similarity index 74% rename from probe_src/tests/test_integration.py rename to tests/test_record.py index f7b0d199..17c01ccf 100644 --- a/probe_src/tests/test_integration.py +++ b/tests/test_record.py @@ -10,15 +10,19 @@ def bash(*cmds: str) -> list[str]: - return ["bash", "-c", shlex.join(cmds).replace(" and ", " && ").replace(" redirect_to ", " > ")] + return [ + "bash", + "-c", + shlex.join(cmds).replace(" and ", " && ").replace(" redirect_to ", " > "), + ] commands = [ ["echo", "hi"], - ["head", "../../../flake.nix"], + ["head", "../../flake.nix"], bash( "echo", - "#include \n#include \nint main() {open(\".\", 0); printf(\"hello world\\n\"); return 0; }", + '#include \n#include \nint main() {open(".", 0); printf("hello world\\n"); return 0; }', "redirect_to", "test.c", "and", @@ -46,7 +50,7 @@ def bash(*cmds: str) -> list[str]: ["probe", "record"], ["probe", "record", "--debug"], ["probe", "record", "--copy-files-lazily"], - #["probe", "record", "--copy-files-eagerly"], + # ["probe", "record", "--copy-files-eagerly"], ] @@ -60,13 +64,21 @@ def test_cmds(mode: list[str], command: list[str]) -> None: print(shlex.join(cmd)) subprocess.run(cmd, check=True, cwd=tmpdir) - cmd = ["probe", "validate", *(["--should-have-files"] if "copy-files" in mode else [])] + cmd = [ + "probe", + "validate", + *(["--should-have-files"] if "copy-files" in mode else []), + ] print(shlex.join(cmd)) if any("gcc" in arg for arg in command): # GCC creates many threads and processes, so this stuff is pretty slow. 
+def test_parse_scp_args() -> None:
+    assert parse_scp_args(
+        [
+            "test.txt",
+            "host:",
+            "user@host:",
+            "host:test.txt",
+            "user@host:test.txt",
+            "test.txt",
+        ]
+    ) == (
+        [
+            HostPath(
+                Host(network_name=None, username=None, ssh_options=[], scp_options=[]),
+                path=pathlib.Path("test.txt"),
+            ),
+            HostPath(
+                Host(
+                    network_name="host", username=None, ssh_options=[], scp_options=[]
+                ),
+                path=pathlib.Path(),
+            ),
+            HostPath(
+                Host(
+                    network_name="host", username="user", ssh_options=[], scp_options=[]
+                ),
+                path=pathlib.Path(),
+            ),
+            HostPath(
+                Host(
+                    network_name="host", username=None, ssh_options=[], scp_options=[]
+                ),
+                path=pathlib.Path("test.txt"),
+            ),
+            HostPath(
+                Host(
+                    network_name="host", username="user", ssh_options=[], scp_options=[]
+                ),
+                path=pathlib.Path("test.txt"),
+            ),
+        ],
+        HostPath(
+            host=Host(network_name=None, username=None, ssh_options=[], scp_options=[]),
+            path=pathlib.Path("test.txt"),
+        ),
+    )